# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import pickle

# Load and clean data
df = pd.read_csv("train.csv")
# Keep only rows with id below 50000. The explicit copy detaches the subset
# from the full frame so the column assignments below modify `df` itself.
df = df[df["id"] < 50000].copy()

# Handle missing values
# NOTE(review): the fill statistics are computed on the whole frame before the
# train/test split, so held-out rows influence the imputed values. Moving the
# imputation into the modelling pipeline would avoid that -- confirm intent.
for col in df.select_dtypes(include="object").columns:
    modes = df[col].mode()
    # An all-NaN column has no mode; leave it untouched instead of raising.
    if not modes.empty:
        # Assign back instead of `df[col].fillna(..., inplace=True)`: the
        # chained in-place form operates on a temporary and does not update
        # `df` when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(modes.iloc[0])

for col in df.select_dtypes(include=["float64", "int64"]).columns:
    # Never fabricate a label: rows with a missing target are left as NaN
    # here and are removed by the `> 0` filter below.
    if col == "Premium Amount":
        continue
    df[col] = df[col].fillna(df[col].mean())

df.set_index("id", inplace=True)
df["Policy Start Date"] = pd.to_datetime(df["Policy Start Date"])
# NOTE(review): `.dt.date` yields Python date objects (object dtype), so any
# later dtype-based column selection will see Start_Date as a non-numeric
# column -- confirm that this is the intended encoding for the date.
df["Start_Date"] = df["Policy Start Date"].dt.date
df.drop(columns="Policy Start Date", inplace=True)

# Log transform target
# Comparisons with NaN are False, so this keeps only rows whose target is
# present and strictly positive.
df = df[df['Premium Amount'] > 0].copy()
df['Premium_Log'] = np.log1p(df['Premium Amount'])
df.dropna(subset=['Premium_Log'], inplace=True)

# Separate the log-scaled target from the predictors
y = df['Premium_Log']
X = df.drop(columns=['Premium Amount', 'Premium_Log'])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Group the predictor columns by dtype
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Preprocessing: standardise numeric columns, one-hot encode the rest.
# Categories not seen during fit are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Full pipeline: preprocessing followed by a random forest regressor
model = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1)),
    ]
)

# Train on the training split only
model.fit(X_train, y_train)

# Report R² on both splits; the scores are on the log1p scale of the target
# because `y` is the Premium_Log column.
train_r2 = r2_score(y_train, model.predict(X_train))
print("🔹 Train R²:", train_r2)
test_r2 = r2_score(y_test, model.predict(X_test))
print("🔸 Test R²:", test_r2)

# Feature importance
def get_feature_names(ct):
    """Collect the output feature names of a fitted column transformer.

    Walks ``ct.transformers_`` in order. For every
    ``(name, transformer, columns)`` entry the names come from
    ``transformer.get_feature_names_out(columns)`` when the transformer
    offers that method; otherwise the input column labels are used as-is.

    Entries whose transformer is the string ``'drop'`` are skipped: they
    contribute no output columns, so including their column spec would
    misalign the names with the transformed matrix.

    Args:
        ct: Object exposing a ``transformers_`` iterable of
            ``(name, transformer, columns)`` triples.

    Returns:
        list: Feature names in transformer order.
    """
    output = []
    for _name, trans, cols in ct.transformers_:
        # Dropped column groups produce no features.
        if isinstance(trans, str) and trans == 'drop':
            continue
        if hasattr(trans, 'get_feature_names_out'):
            output.extend(trans.get_feature_names_out(cols))
        elif isinstance(cols, str):
            # A single column given as a scalar label must stay one name,
            # not be split into its characters.
            output.append(cols)
        else:
            output.extend(cols)
    return output

# Take the fitted preprocessing step from the pipeline itself instead of the
# module-level `preprocessor` variable, so the names always come from the
# transformer the regressor was actually trained behind.
feature_names = get_feature_names(model.named_steps['preprocess'])
importances = model.named_steps['regressor'].feature_importances_

# Rank all features by importance and keep the 15 largest
feat_imp = (
    pd.Series(importances, index=feature_names)
    .sort_values(ascending=False)
    .head(15)
)

# Plot top 15 features
fig, ax = plt.subplots(figsize=(10, 5))
feat_imp.plot(kind='barh', ax=ax)
ax.set_title("Top 15 Feature Importances")
# barh draws the first row at the bottom; flip so the largest bar is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

# Bundle the fitted pipeline with the list of input column names it was
# trained on, and persist both in a single pickle file.
save_object = dict(model=model, features=list(X.columns))

with open('model_and_features.pkl', 'wb') as f:
    pickle.dump(save_object, f)

print("✅ Model and feature list saved to model_and_features.pkl")
