#### Importing Libraries and Datasets

In [1]:
#importing necessary libraries to handle and visualize data
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#for building, training and validating the model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
#import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

#### EDA and Pre-Processing

In [4]:
df_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [6]:
df_train.nunique()

User_ID                        5891
Product_ID                     3631
Gender                            2
Age                               7
Occupation                       21
City_Category                     3
Stay_In_Current_City_Years        5
Marital_Status                    2
Product_Category_1               20
Product_Category_2               17
Product_Category_3               15
Purchase                      18105
dtype: int64

In [7]:
#First we will fill NA values for Product_Category_2 and Product_category_3
df_train.Product_Category_2.value_counts()

8.0     64088
14.0    55108
2.0     49217
16.0    43255
15.0    37855
5.0     26235
4.0     25677
6.0     16466
11.0    14134
17.0    13320
13.0    10531
9.0      5693
12.0     5528
10.0     3043
3.0      2884
18.0     2770
7.0       626
Name: Product_Category_2, dtype: int64

In [8]:
df_train.Product_Category_3.value_counts()

16.0    32636
15.0    28013
14.0    18428
17.0    16702
5.0     16658
8.0     12562
9.0     11579
12.0     9246
13.0     5459
6.0      4890
18.0     4629
4.0      1875
11.0     1805
10.0     1726
3.0       613
Name: Product_Category_3, dtype: int64

In [9]:
#We fill NA values for Product_category_3 and Product_Category_2 with zero
df_train.Product_Category_2.fillna(0,inplace=True)
df_train.Product_Category_3.fillna(0,inplace=True)

In [10]:
#df_train['n_prod'] = df_train.Product_ID.str.extract('(\d+)')

In [11]:
#df_train['n_prod'] = df_train['n_prod'].apply(lambda x : int(x))

In [12]:
df_train.drop(['Product_ID','User_ID'],axis=1, inplace=True)

In [13]:
df_train['Occupation'] = df_train['Occupation'].astype(str)
df_train['Product_Category_1'] = df_train['Product_Category_1'].astype(str)
df_train['Product_Category_2'] = df_train['Product_Category_2'].astype(str)
df_train['Product_Category_3'] = df_train['Product_Category_3'].astype(str)

In [14]:
df_train['Marital_Status'] = df_train['Marital_Status'].apply(lambda x: str(x))

In [15]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 10 columns):
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null object
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null object
Product_Category_1            550068 non-null object
Product_Category_2            550068 non-null object
Product_Category_3            550068 non-null object
Purchase                      550068 non-null int64
dtypes: int64(1), object(9)
memory usage: 42.0+ MB


In [16]:
target = df_train[['Purchase']]

In [17]:
train = df_train.drop('Purchase',axis=1)
del df_train

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 9 columns):
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null object
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null object
Product_Category_1            550068 non-null object
Product_Category_2            550068 non-null object
Product_Category_3            550068 non-null object
dtypes: object(9)
memory usage: 37.8+ MB


In [19]:
train1 = pd.get_dummies(train, drop_first=True)

In [20]:
del train
train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 85 columns):
Gender_M                         550068 non-null uint8
Age_18-25                        550068 non-null uint8
Age_26-35                        550068 non-null uint8
Age_36-45                        550068 non-null uint8
Age_46-50                        550068 non-null uint8
Age_51-55                        550068 non-null uint8
Age_55+                          550068 non-null uint8
Occupation_1                     550068 non-null uint8
Occupation_10                    550068 non-null uint8
Occupation_11                    550068 non-null uint8
Occupation_12                    550068 non-null uint8
Occupation_13                    550068 non-null uint8
Occupation_14                    550068 non-null uint8
Occupation_15                    550068 non-null uint8
Occupation_16                    550068 non-null uint8
Occupation_17                    550068 non-null uint8
Occupatio

In [21]:
target = np.sqrt(target)
target.skew()

Purchase   -0.139588
dtype: float64

##### Now we start training the model

In [22]:
X_train1,X_test1,Y_train1,Y_test1 = train_test_split(train1,target,test_size=0.30,random_state=2)
print("X_train1",X_train1.shape)
print("X_test1",X_test1.shape)
print("Y_train1",Y_train1.shape)
print("Y_test1",Y_test1.shape)

X_train1 (385047, 85)
X_test1 (165021, 85)
Y_train1 (385047, 1)
Y_test1 (165021, 1)


In [23]:
reg = Ridge()
params = [{'alpha':np.linspace(1,100,num=200)}]
grid_search = GridSearchCV(estimator=reg, param_grid=params,cv = 10, verbose = 0)
grid_search = grid_search.fit(X_train1,Y_train1)

In [24]:
grid_search.best_params_

{'alpha': 1.0}

In [25]:
y_pred1 = grid_search.predict(X_test1)
print("Mean Squared Error:",mean_squared_error(Y_test1,y_pred1))

Mean Squared Error: 244.5077939362574


In [26]:
from sklearn.metrics import r2_score
r2_sc1 = r2_score(Y_test1,y_pred1)
print("R2 score for model2 is:",r2_sc1)

R2 score for model2 is: 0.670613858802569


In [27]:
rf_reg = RandomForestRegressor()
params = {'criterion':['mse'],
          'max_features':['auto','sqrt','log2']}
grid_search2 = GridSearchCV(estimator=rf_reg,param_grid=params,cv=10,verbose=0)
grid_search2 = grid_search2.fit(X_train1,Y_train1)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


In [28]:
grid_search2.best_params_

{'criterion': 'mse', 'max_features': 'sqrt'}

In [29]:
y_pred2 = grid_search2.predict(X_test1)
print("Mean Squared Error:",mean_squared_error(Y_test1,y_pred2))

Mean Squared Error: 258.3374459266066


In [30]:
r2_sc2 = r2_score(Y_test1,y_pred2)
print("R2 score for model2 is:",r2_sc2)

R2 score for model2 is: 0.6519833864161058
