In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats

# ----------------------------------------------
# 1. Read the processed CSV into a DataFrame
# ----------------------------------------------
DATA_PATH = "C:\\Users\\kuzha\\Downloads\\processed_data (3).csv"
df = pd.read_csv(DATA_PATH)

# ----------------------------------------------
# 2. Discard columns whose name starts with
#    "Unnamed" (index columns written by to_csv)
# ----------------------------------------------
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]

# ----------------------------------------------
# 3. Convert columns to numeric where possible
#    (a column that cannot be parsed in full is
#     left unchanged, NOT turned into NaN)
# ----------------------------------------------
def _to_numeric_if_possible(column):
    """Return `column` parsed as numeric, or the original column if parsing fails.

    Equivalent to the deprecated ``pd.to_numeric(column, errors='ignore')``,
    written with explicit exception handling as the pandas deprecation
    notice recommends.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # At least one value is not numeric: keep the column as it was.
        return column


df = df.apply(_to_numeric_if_possible)

# Check dtypes
print(df.dtypes)

# ----------------------------------------------
# 4. Find the text (object-dtype) columns
# ----------------------------------------------
object_frame = df.select_dtypes(include=['object'])
cat_columns = object_frame.columns
print("Categorical columns:", list(cat_columns))

# One-hot encode 'State' when it is still a text column; the first
# level is dropped so it serves as the reference category.
state_is_text = 'State' in cat_columns
if state_is_text:
    df = pd.get_dummies(df, columns=['State'], drop_first=True)

# ----------------------------------------------
# 5. Ensure all remaining features are numeric
#    (values that cannot be parsed become NaN)
# ----------------------------------------------
df = df.apply(pd.to_numeric, errors='coerce')

# A column with no numeric value at all (e.g. a leftover text column)
# is now entirely NaN. Keeping it would make dropna() below remove
# every row, so drop such columns instead and say so.
all_nan_mask = df.isna().all(axis=0).to_numpy()
if len(df) > 0 and all_nan_mask.any():
    print("Dropping columns with no numeric values:",
          df.columns[all_nan_mask].tolist())
    df = df.loc[:, ~all_nan_mask]

# ----------------------------------------------
# 6. Drop rows with NaN after conversion
# ----------------------------------------------
df = df.dropna()

# ----------------------------------------------
# 7. Separate the target from the predictors
# ----------------------------------------------
target_name = "Profit"
y = df[target_name]
X = df.drop(columns=[target_name])

# NOTE(review): ROI / ROI_scaled stay in X; presumably they are derived
# from Profit, which would leak the target into the features — confirm.

# Final safety check
print("Final X dtypes:\n", X.dtypes)

# ----------------------------------------------
# 8. Hold out 20% of the rows for testing
#    (fixed seed so the split is repeatable)
# ----------------------------------------------
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# ----------------------------------------------
# 9. OLS model
# ----------------------------------------------
# has_constant='add' always inserts the intercept column. With the default
# ('skip'), a column that happens to be constant and non-zero within one
# split suppresses the intercept for that split only, so the train and test
# design matrices end up with different widths and predict() fails.
X_train_sm = sm.add_constant(X_train.astype(float), has_constant='add')
X_test_sm = sm.add_constant(X_test.astype(float), has_constant='add')

# NOTE(review): the recorded dtypes list raw and *_scaled versions of the
# same columns, Total_Spend, and three State_* indicators; presumably these
# are linearly dependent, which would make individual coefficients
# unreliable — confirm and prune the feature set if so.
model = sm.OLS(y_train, X_train_sm).fit()
print(model.summary())

# ----------------------------------------------
# 10. Evaluation on the held-out rows
# ----------------------------------------------
y_pred = model.predict(X_test_sm)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nRMSE:", rmse)
print("R²:", r2)

# ----------------------------------------------
# 11. Rank fitted coefficients by magnitude
#     (includes the intercept; raw magnitudes
#      depend on each feature's scale)
# ----------------------------------------------
fitted_params = model.params

coef_table = pd.DataFrame({
    "Feature": fitted_params.index,
    "Coefficient": fitted_params.values,
})
coef_table["AbsCoeff"] = coef_table["Coefficient"].abs()
coef_table = coef_table.sort_values("AbsCoeff", ascending=False)

print("\nFeature Importance:\n", coef_table)

# ----------------------------------------------
# 12. ANOVA: State vs Profit (only if State information exists)
# ----------------------------------------------
# After one-hot encoding there is no "State" column any more, only
# "State_<name>" 0/1 indicators, so the state label of each row is rebuilt
# from those indicators when the original column is gone.
state_dummy_cols = [
    col for col in df.columns
    if str(col).startswith("State_")
    and df[col].astype(float).isin([0.0, 1.0]).all()
]

if "State" in df.columns:
    state_labels = df["State"]
elif state_dummy_cols:
    indicators = df[state_dummy_cols].astype(float)
    # Rows with every indicator at 0 belong to the reference level that
    # get_dummies(drop_first=True) removed; give them their own label.
    state_labels = indicators.idxmax(axis=1).where(
        indicators.sum(axis=1) > 0, "State_(baseline)"
    )
else:
    state_labels = None

if state_labels is not None:
    profit_by_state = [
        profits.values for _, profits in df["Profit"].groupby(state_labels)
    ]
    # One-way ANOVA needs at least two groups to compare.
    if len(profit_by_state) >= 2:
        f_stat, p_val = stats.f_oneway(*profit_by_state)
        print("\nANOVA State vs Profit:")
        print("F-stat:", f_stat)
        print("p-value:", p_val)


R&D_Spend                 float64
Administration            float64
Marketing_Spend           float64
Profit                    float64
Total_Spend               float64
ROI                       float64
State_California             bool
State_Florida                bool
State_New York               bool
R&D_Spend_scaled          float64
Administration_scaled     float64
Marketing_Spend_scaled    float64
Total_Spend_scaled        float64
ROI_scaled                float64
dtype: object
Categorical columns: []
Final X dtypes:
 R&D_Spend                 float64
Administration            float64
Marketing_Spend           float64
Total_Spend               float64
ROI                       float64
State_California             bool
State_Florida                bool
State_New York               bool
R&D_Spend_scaled          float64
Administration_scaled     float64
Marketing_Spend_scaled    float64
Total_Spend_scaled        float64
ROI_scaled                float64
dtype: object
             

  df = df.apply(pd.to_numeric, errors='ignore')
