In [218]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [220]:
data=pd.read_csv("/Users/ishaankadle/Downloads/train.csv")
data_test=pd.read_csv("/Users/ishaankadle/Downloads/test.csv")

df=data.copy()
df_test=data_test.copy()

In [221]:
#Converting the Product ID into numbers that can be used.
df['Product_ID_usable']=df['Product_ID'].str.replace('P', '').astype(int)
df_test['Product_ID_usable']=df_test['Product_ID'].str.replace('P', '').astype(int)

In [222]:
pd.get_dummies(df.Gender,drop_first=True).astype(int)
pd.get_dummies(df_test.Gender,drop_first=True).astype(int)
df["Gender_Encoded"]=pd.get_dummies(df.Gender,drop_first=True).astype(int)
df_test["Gender_Encoded"]=pd.get_dummies(df_test.Gender,drop_first=True).astype(int)

In [224]:
city_encoded=pd.get_dummies(df['City_Category'], drop_first=True).astype(int)
city_encoded_test=pd.get_dummies(df_test['City_Category'], drop_first=True).astype(int)

df = pd.concat([df, city_encoded], axis=1)
df_test = pd.concat([df_test, city_encoded_test], axis=1)

In [227]:
df=df.drop(columns=['Gender','City_Category'])
df_test=df_test.drop(columns=['Gender','City_Category'])

In [230]:
age_encoder_1={
    '0-17': 8.5,
    '18-25': 21.5,
    '26-35': 30.5,
    '36-45': 40.5,
    '46-50': 48,
    '51-55': 53,
    '55+': 55
}

df['Age'] = df['Age'].map(age_encoder_1)
df_test['Age'] = df_test['Age'].map(age_encoder_1)

In [232]:
stay_encoder={
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4+': 4
}
df['Stay'] = df['Stay_In_Current_City_Years'].map(stay_encoder)
df_test['Stay'] = df_test['Stay_In_Current_City_Years'].map(stay_encoder)

In [234]:
df=df.drop(columns=['Stay_In_Current_City_Years'])
df_test=df_test.drop(columns=['Stay_In_Current_City_Years'])

In [236]:
df.columns

Index(['User_ID', 'Product_ID', 'Age', 'Occupation', 'Marital_Status',
       'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
       'Purchase', 'Product_ID_usable', 'Gender_Encoded', 'B', 'C', 'Stay'],
      dtype='object')

In [238]:
df.dtypes

User_ID                 int64
Product_ID             object
Age                   float64
Occupation              int64
Marital_Status          int64
Product_Category_1      int64
Product_Category_2    float64
Product_Category_3    float64
Purchase                int64
Product_ID_usable       int64
Gender_Encoded          int64
B                       int64
C                       int64
Stay                    int64
dtype: object

In [240]:
df['Product_Category_2'].fillna(df.Product_Category_2.mean(),inplace=True)
df_test['Product_Category_2'].fillna(df_test.Product_Category_2.mean(),inplace=True)

df['Product_Category_3'].fillna(0,inplace=True)
df_test['Product_Category_3'].fillna(0,inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Product_Category_2'].fillna(df.Product_Category_2.mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Product_Category_2'].fillna(df_test.Product_Category_2.mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never w

In [243]:
df.columns

Index(['User_ID', 'Product_ID', 'Age', 'Occupation', 'Marital_Status',
       'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
       'Purchase', 'Product_ID_usable', 'Gender_Encoded', 'B', 'C', 'Stay'],
      dtype='object')

In [245]:
X_train = df[['Product_ID_usable','User_ID','Age', 'Occupation', 'Marital_Status',
       'Gender_Encoded', 'B', 'C', 'Stay', 'Product_Category_1',
             'Product_Category_2','Product_Category_3']]

In [247]:
Y_train = df['Purchase'] 

In [249]:
X_test = df_test[['Product_ID_usable','User_ID','Age', 'Occupation', 'Marital_Status',
       'Gender_Encoded', 'B', 'C', 'Stay', 'Product_Category_1',
             'Product_Category_2','Product_Category_3',]]

In [251]:
import numpy as np
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    StackingRegressor,
    RandomForestRegressor
)
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV


In [259]:
rf_model = RandomForestRegressor() 
hgb_model = HistGradientBoostingRegressor(learning_rate= 0.1,max_depth=10, max_iter= 1000, min_samples_leaf= 5) 
lgbm_model = LGBMRegressor(learning_rate= 0.1, max_depth= 20, n_estimators= 500, num_leaves= 100)
xgb_model = XGBRegressor(colsample_bytree= 0.6, gamma= 0, learning_rate= 0.1, max_depth= 10, min_child_weight= 7, n_estimators= 400, subsample= 1.0) 
catboost_model = CatBoostRegressor()  

In [257]:
param_grid_xgb ={
    'max_depth': [3,5,10],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 400],
    'min_child_weight': [3, 7],
    'gamma': [0, 0.3],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.6, 1.0]
}

grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, 
                               cv=5, scoring='neg_mean_squared_error', 
                               verbose=2, n_jobs=-1)

grid_search_xgb.fit(X_train, Y_train)

print("Best parameters for XGBRegressor: ", grid_search_xgb.best_params_)


Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best parameters for XGBRegressor:  {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 7, 'n_estimators': 400, 'subsample': 1.0}


In [265]:
stacking_model = StackingRegressor(
    estimators=[
        ('rf', rf_model),  
        ('hgb', hgb_model),
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('catboost', catboost_model)
    ],
    final_estimator=LinearRegression()
)

stacking_model.fit(X_train, Y_train)
df_test['Purchase'] = stacking_model.predict(X_test)  

df_test['Purchase'] = np.round(df_test['Purchase'])
#As purchase cant be negative
df_test["Purchase"] = np.where(df_test["Purchase"] < 0, 0, df_test["Purchase"])

[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=3, n_estimators=100, subsample=0.6; total time=   3.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=3, n_estimators=400, subsample=0.6; total time=   9.8s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=7, n_estimators=100, subsample=0.6; total time=   3.0s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=7, n_estimators=100, subsample=1.0; total time=   2.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=7, n_estimators=400, subsample=0.6; total time=   9.9s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.6; total time=   4.2s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=400, subsample=0.6; tota

In [266]:
df_test[['User_ID', 'Product_ID', 'Purchase']].to_csv('/Users/ishaankadle/Downloads/solutionabk.csv', index=False)