In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [4]:
set(list(df.Product_Category_2.unique()))

{nan,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0}

In [5]:
df.Product_Category_2.fillna(18, inplace = True)

In [6]:
set(list(df.Product_Category_3.unique()))

{nan,
 3.0,
 4.0,
 5.0,
 6.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0}

In [7]:
df.Product_Category_3.fillna(18, inplace = True)

In [8]:
df.isna().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            550068 non-null float64
Product_Category_3            550068 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [10]:
y = df['Purchase']

In [11]:
X = df.drop(columns = ['Purchase'], axis = 1)

In [12]:
X

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000001,P00069042,F,0-17,10,A,2,0,3,18.0,18.0
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,18.0,18.0
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,18.0
4,1000002,P00285442,M,55+,16,C,4+,0,8,18.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,18.0,18.0
550064,1006035,P00375436,F,26-35,1,C,3,0,20,18.0,18.0
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,18.0,18.0
550066,1006038,P00375436,F,55+,1,C,2,0,20,18.0,18.0


In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 11 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            550068 non-null float64
Product_Category_3            550068 non-null float64
dtypes: float64(2), int64(4), object(5)
memory usage: 46.2+ MB


In [14]:
le = LabelEncoder()

In [15]:
X = X.apply(le.fit_transform)

In [16]:
X.Gender = pd.to_numeric(X.Gender)

In [17]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 11 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null int64
Gender                        550068 non-null int64
Age                           550068 non-null int64
Occupation                    550068 non-null int64
City_Category                 550068 non-null int64
Stay_In_Current_City_Years    550068 non-null int64
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            550068 non-null int64
Product_Category_3            550068 non-null int64
dtypes: int64(11)
memory usage: 46.2 MB


In [18]:
scaler = StandardScaler()

In [19]:
X_scaled = scaler.fit_transform(X)

In [124]:
pca = PCA(n_components = 'mle', svd_solver='full', random_state = 12)

In [96]:
principal_components = pca.fit_transform(X)

In [97]:
pca.explained_variance_ratio_

array([7.35038107e-01, 2.64934920e-01, 1.11947147e-05, 1.09667649e-05,
       2.32735131e-06, 1.32968158e-06, 4.77176859e-07, 4.29324636e-07,
       1.45826956e-07, 5.56866110e-08])

In [98]:
component_df = pd.DataFrame(principal_components, columns = ['Component_1','Component_2','Component_3','Component_4',
                                                            'Component_5','Component_6','Component_7','Component_8','Component_9',
                                                            'Component_10'])

In [99]:
component_df

Unnamed: 0,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,Component_9,Component_10
0,2931.106545,-1084.784323,-3.755083,3.513769,-4.592146,1.170579,-2.491168,-0.093475,0.742045,0.034873
1,2959.686566,618.966809,8.454643,-2.121649,-0.984129,-0.015147,-2.474407,-0.091585,0.805718,0.036050
2,2934.125509,-904.807071,-7.215248,5.058641,3.444470,0.129217,-2.694777,-0.091729,0.732156,0.034335
3,2933.722934,-928.805878,-4.207689,3.692769,4.831858,-1.619860,-2.694272,-0.094633,0.741054,0.034295
4,2964.691739,976.943761,-1.942808,9.328395,0.074901,0.848868,3.220366,2.226064,-0.607122,0.838903
...,...,...,...,...,...,...,...,...,...,...
550063,-2902.509842,1908.484876,-7.177675,8.895954,10.731911,-0.429887,2.018691,-0.691649,0.285406,-0.241198
550064,-2904.493793,1909.517645,-12.126180,-2.091589,10.660653,-0.419840,-0.828626,1.254677,-1.008258,0.307306
550065,-2905.492322,1909.535023,-6.340425,10.657222,10.681197,-0.415408,-1.133592,2.130967,0.049542,-0.698469
550066,-2907.493499,1909.568097,-12.136456,-2.005063,10.735486,-0.443231,3.165991,0.462673,-0.640533,0.822967


In [125]:
kf= KFold(20, shuffle = True, random_state= 10)

In [126]:
for a,b in kf.split(component_df):
    X_train, X_test = X_scaled[a], X_scaled[b]
    y_train, y_test = y[a], y[b]

### Random Froest Regressor

In [127]:
rand_for = RandomForestRegressor()

In [128]:
rand_for.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [129]:
print('RandomForestRegression score: ',rand_for.score(X_train, y_train))

RandomForestRegression score:  0.9586354174152318


In [130]:
y_pred = rand_for.predict(X_train)
y_pred = y_pred.reshape(-1,1)

In [131]:
list(rand_for.feature_importances_)

[0.1136571764979346,
 0.08104965225302742,
 0.009250322564966704,
 0.029107150622874972,
 0.04858606384445683,
 0.015078846123758738,
 0.029587433200866556,
 0.010772649266669607,
 0.6366330214926708,
 0.01361012880300886,
 0.012667555329765056]

In [132]:
rand_for.score(X_test, y_test)

0.709979107090704

In [133]:
y_actual_pred = rand_for.predict(X_test)

In [134]:
rmse = mean_squared_error(y_test, y_actual_pred)
rmse

7325649.790293389

In [135]:
rmse = np.sqrt(rmse)
rmse

2706.59376159286

In [136]:
rand_for = RandomForestRegressor(n_estimators=200, random_state=10)

In [137]:
rand_for.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=10, verbose=0, warm_start=False)

In [138]:
print('RandomForestRegression score: ',rand_for.score(X_train, y_train))

RandomForestRegression score:  0.959462168414961


In [139]:
rand_for.score(X_test, y_test)

0.7108608789434196

In [140]:
y_pred = rand_for.predict(X_test)

In [141]:
mse = mean_squared_error(y_test, y_pred)

In [142]:
rmse = np.sqrt(mse)
rmse

2702.476101357789

In [52]:
import pickle

In [106]:
filename = '2730.38_rmse_model'

In [107]:
pickle.dump(rand_for,open(filename,"wb"))

In [155]:
list(y_test)[:10]

[8756, 12449, 12615, 8077, 7007, 6867, 1432, 11753, 6032, 7908]

In [156]:
list(y_pred)[:10]

[8440.692809381395,
 8367.101605607359,
 4951.015124873681,
 7869.421233077377,
 8831.185830509476,
 8251.338002005627,
 9230.26587376296,
 11956.522526609177,
 8313.766451509026,
 7372.802372420858]

### Decision Tree Regressor

In [114]:
tree = DecisionTreeRegressor()

In [115]:
for_test = tree.fit(X_train, y_train)

In [116]:
for_test.score(X_test, y_test)

0.5085571773764281

### Polynomial Regression

In [139]:
lr = LinearRegression()

In [140]:
poly = PolynomialFeatures()

In [141]:
poly.fit(X_train)

PolynomialFeatures(degree=2, include_bias=True, interaction_only=False,
                   order='C')

In [142]:
lr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [143]:
lr.score(X_test,y_test)

0.2605897609543051

In [144]:
y_pred = lr.predict(X_test)

In [145]:
mean_squared_error(y_test,y_pred)

21070644.7713989

In [146]:
np.sqrt(_)

4590.277199843044