In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib

# Load the cleaned survey responses (path is relative to the working directory)
df = pd.read_csv("Cleaned_IPL_Dataset2.csv")

# Survey columns used as model inputs. The strings must match the CSV headers
# exactly, including punctuation such as the trailing ":" and "?".
features = [
    "Age Group", "Gender", "Favorite IPL Team:", "Who is your favorite player in IPL?",
    "How long have you supported this team?", "Have you ever argued with someone online about IPL?",
    "Have you ever attended an IPL match in a stadium?"
]

# Column the model predicts; per its header it is a 1-5 self-rating.
target = "How emotionally invested are you in IPL? (Scale: 1 - Not at all, 5 - Very emotionally invested)"

# Keep only the modelling columns and drop rows with any missing value.
# The explicit .copy() makes df_model an independent frame, so the column
# assignments performed on it further down do not raise
# SettingWithCopyWarning and never depend on pandas' view-vs-copy rules.
df_model = df[features + [target]].dropna().copy()

# Fit one LabelEncoder per feature column, keyed by column name, so the same
# value-to-integer mapping can be reused after training.
encoders = {name: LabelEncoder().fit(df_model[name]) for name in features}

# Replace each feature column with its integer codes from the fitted encoder.
for name, encoder in encoders.items():
    df_model[name] = encoder.transform(df_model[name])

# Separate the encoded feature matrix from the label column
X, y = df_model[features], df_model[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a seeded 100-tree random forest on the training portion
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Report per-class precision / recall / F1 on the held-out rows
print("Model Performance:")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Persist the fitted classifier and the per-column encoders side by side;
# both are written to the current working directory.
artifacts = (
    (model, "investment_model.pkl"),
    (encoders, "label_encoders.pkl"),
)
for obj, filename in artifacts:
    joblib.dump(obj, filename)
print("✅ Model and encoders saved!")

Model Performance:
              precision    recall  f1-score   support

           1       0.83      1.00      0.91        15
           2       1.00      1.00      1.00         1
           3       0.72      1.00      0.84        18
           4       1.00      0.68      0.81        22
           5       1.00      0.88      0.93        24

    accuracy                           0.88        80
   macro avg       0.91      0.91      0.90        80
weighted avg       0.91      0.88      0.87        80

✅ Model and encoders saved!
