In [2]:
import numpy as np
import pandas as pd
import mglearn
import sys

from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier


from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error



# Read the train, test and sample submission datasets

In [3]:
train      = pd.read_csv("../data/train.csv")
test       = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/Sample_Submission_Tm9Lura.csv")

In [4]:
train.tail()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,,368
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,,371
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,,137
550066,1006038,P00375436,F,55+,1,C,2,0,20,,,365
550067,1006039,P00371644,F,46-50,0,B,4+,1,20,,,490


In [5]:

# Label Encoding User_IDs
le = LabelEncoder()
train['User_ID'] = le.fit_transform(train['User_ID'])
test['User_ID'] = le.transform(test['User_ID'])

# Label Encoding Product_IDs
new_product_ids = list(set(pd.unique(test['Product_ID'])) - set(pd.unique(train['Product_ID'])))

le = LabelEncoder()
train['Product_ID'] = le.fit_transform(train['Product_ID'])
test.loc[test['Product_ID'].isin(new_product_ids), 'Product_ID'] = -1
new_product_ids.append(-1)

test.loc[~test['Product_ID'].isin(new_product_ids), 'Product_ID'] = le.transform(test.loc[~test['Product_ID'].isin(new_product_ids), 'Product_ID'])


In [6]:
frames = [train, test]
input = pd.concat(frames)
df = pd.DataFrame(input)
# display(df)

df.tail()

# print (input.shape)
# input.head()

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
233594,26-35,B,F,1,15,8,,,1098,,4+,5886
233595,26-35,B,F,1,15,5,8.0,,2431,,4+,5886
233596,26-35,B,F,1,15,1,5.0,12.0,312,,4+,5886
233597,46-50,C,F,0,1,10,16.0,,1155,,4+,5887
233598,46-50,B,F,1,0,4,5.0,,3033,,4+,5889


In [7]:

#Replace missing values
df.fillna(999, inplace=True)
# df[df=="NaN"] = np.nan
df.shape

(783667, 12)

In [8]:
df.tail()

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
233594,26-35,B,F,1,15,8,999.0,999.0,1098,999.0,4+,5886
233595,26-35,B,F,1,15,5,8.0,999.0,2431,999.0,4+,5886
233596,26-35,B,F,1,15,1,5.0,12.0,312,999.0,4+,5886
233597,46-50,C,F,0,1,10,16.0,999.0,1155,999.0,4+,5887
233598,46-50,B,F,1,0,4,5.0,999.0,3033,999.0,4+,5889


In [9]:
df.columns

Index(['Age', 'City_Category', 'Gender', 'Marital_Status', 'Occupation',
       'Product_Category_1', 'Product_Category_2', 'Product_Category_3',
       'Product_ID', 'Purchase', 'Stay_In_Current_City_Years', 'User_ID'],
      dtype='object')

In [10]:

# frames=[df,dummies]
# input = pd.concat(frames)
# data = pd.DataFrame(input)
for column in ["Age","City_Category","Gender","Stay_In_Current_City_Years"]:
    dummies = pd.get_dummies(df[column])
    df[dummies.columns] = dummies

In [11]:
del df['Age']
del df['City_Category']
del df['Gender']
del df['Stay_In_Current_City_Years']

In [12]:
df.columns

Index(['Marital_Status', 'Occupation', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Product_ID', 'Purchase',
       'User_ID', '0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+',
       'A', 'B', 'C', 'F', 'M', '0', '1', '2', '3', '4+'],
      dtype='object')

In [13]:
df.rename(columns={'0-17': 'Age_0-17','18-25':'Age_18-25',
                   '26-35':'Age_26-35','36-45':'Age_36-45',
                   '46-50':'Age_46-50','51-55':'Age_51-55','55+':'Age_55+',
                   '0':'Current_City_Years_0',
                   '1':'Current_City_Years_1',
                   '2':'Current_City_Years_2',
                   '3':'Current_City_Years_3',
                   '4+':'Current_City_Years_4+',
                   'A': 'City_Category_A', 'B': 'City_Category_B',
                   'C': 'City_Category_C','F':'Female','M':'Male'}, inplace=True)

In [14]:
print(df.shape)
df.tail()

(783667, 25)


Unnamed: 0,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,User_ID,Age_0-17,Age_18-25,...,City_Category_A,City_Category_B,City_Category_C,Female,Male,Current_City_Years_0,Current_City_Years_1,Current_City_Years_2,Current_City_Years_3,Current_City_Years_4+
233594,1,15,8,999.0,999.0,1098,999.0,5886,0,0,...,0,1,0,1,0,0,0,0,0,1
233595,1,15,5,8.0,999.0,2431,999.0,5886,0,0,...,0,1,0,1,0,0,0,0,0,1
233596,1,15,1,5.0,12.0,312,999.0,5886,0,0,...,0,1,0,1,0,0,0,0,0,1
233597,0,1,10,16.0,999.0,1155,999.0,5887,0,0,...,0,0,1,1,0,0,0,0,0,1
233598,1,0,4,5.0,999.0,3033,999.0,5889,0,0,...,0,1,0,1,0,0,0,0,0,1


In [15]:
target=df['Purchase']
data=df
del data['Purchase']

In [16]:
# dummies = pd.get_dummies(df['Product_ID'])
data.shape

(783667, 24)

In [17]:
# dummies.columns
df.dtypes

Marital_Status             int64
Occupation                 int64
Product_Category_1         int64
Product_Category_2       float64
Product_Category_3       float64
Product_ID                 int64
User_ID                    int64
Age_0-17                   uint8
Age_18-25                  uint8
Age_26-35                  uint8
Age_36-45                  uint8
Age_46-50                  uint8
Age_51-55                  uint8
Age_55+                    uint8
City_Category_A            uint8
City_Category_B            uint8
City_Category_C            uint8
Female                     uint8
Male                       uint8
Current_City_Years_0       uint8
Current_City_Years_1       uint8
Current_City_Years_2       uint8
Current_City_Years_3       uint8
Current_City_Years_4+      uint8
dtype: object

In [18]:
X_train, X_test, y_train, y_test = train_test_split( data,target, random_state=0, test_size=.5)

select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape)) 
print("X_train_selected.shape: {}".format(X_train_selected.shape))


X_train.shape: (391833, 24)
X_train_selected.shape: (391833, 12)


In [19]:
dtrain = xgb.DMatrix(X_train.values, label=y_train, missing=np.nan)

param = {'objective': 'reg:linear', 'booster': 'gbtree', 'silent': 1,
         'max_depth': 10, 'eta': 0.1, 'nthread': 4,
         'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 20,
         'max_delta_step': 0, 'gamma': 0}
num_round = 690


In [20]:

seeds = [1122, 2244, 3366, 4488, 5500]
test_preds = np.zeros((len(X_test), len(seeds)))

for run in range(len(seeds)):
    sys.stdout.write("\rXGB RUN:{}/{}".format(run+1, len(seeds)))
    sys.stdout.flush()
    param['seed'] = seeds[run]
    clf = xgb.train(param, dtrain, num_round)
    dtest = xgb.DMatrix(X_test.values, missing=np.nan)
    test_preds[:, run] = clf.predict(dtest)


XGB RUN:5/5

In [30]:
# test_preds = np.mean(test_preds, axis=1)
print(test_preds[:len(test_preds),:1])

[[14663.25      ]
 [ 6852.40332031]
 [10688.17773438]
 ...
 [ 6529.32568359]
 [ 5597.87207031]
 [ 3579.91503906]]


In [22]:
model_2 = ExtraTreesRegressor(n_estimators=1450, 
                              max_depth=8,
                              min_samples_split=10, 
                              min_samples_leaf=10, 
                              oob_score=True, 
                              n_jobs=6, 
                              random_state=123, 
                              verbose=1, 
                              bootstrap=True)
model_2.fit(X_train, y_train)


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   34.7s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  2.7min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  6.2min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed: 11.5min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed: 17.3min
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed: 19.6min finished


ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=8,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=10, min_samples_split=10,
          min_weight_fraction_leaf=0.0, n_estimators=1450, n_jobs=6,
          oob_score=True, random_state=123, verbose=1, warm_start=False)

In [23]:
model_2.predict(X_test)

[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    0.3s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    2.1s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:    5.7s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:   10.0s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:   15.4s
[Parallel(n_jobs=6)]: Done 1450 out of 1450 | elapsed:   18.1s finished


array([10474.73554531,  5686.12874548,  8633.78439504, ...,
        4623.94809654,  5921.41760019,  4692.27190477])

In [24]:
input.fillna(999, inplace=True)
input.drop(["Purchase"], axis=1, inplace=True)
input = input.applymap(str)
input.dtypes

Age                           object
City_Category                 object
Gender                        object
Marital_Status                object
Occupation                    object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
Product_ID                    object
Stay_In_Current_City_Years    object
User_ID                       object
dtype: object

In [25]:

input = np.array(input)

for i in range(input.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(input[:,i]))
    input[:, i] = lbl.transform(input[:, i])

In [26]:
input = input.astype(int)

In [27]:
first_stage_rows = np.random.randint(train.shape[0], size = np.int(train.shape[0]/2))


In [29]:
X_train, X_test, y_train, y_test = train_test_split( data,target, random_state=0, test_size=.5)

select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

print("X_train.shape: {}".format(X_train.shape)) 
print("X_train_selected.shape: {}".format(X_train_selected.shape))

X_train.shape: (391833, 24)
X_train_selected.shape: (391833, 12)


In [31]:
# Model 3: 6/3000
model_3 = RandomForestRegressor(n_estimators=3000, max_depth=6, oob_score=True, n_jobs=6, random_state=123, min_samples_split=10, min_samples_leaf=10)
model_3.fit(X_train_selected, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=6,
           oob_score=True, random_state=123, verbose=0, warm_start=False)

In [32]:
# Model 4: 8/1500
model_4 = RandomForestRegressor(n_estimators=1500, max_depth=8, oob_score=True, n_jobs=6, random_state=123, min_samples_split=10, min_samples_leaf=10)
model_4.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=6,
           oob_score=True, random_state=123, verbose=0, warm_start=False)

In [33]:
model_3_predict = model_3.predict(X_test_selected) #RF
model_4_predict = model_4.predict(X_test) #RF with feature engineering
model_2_predict = model_2.predict(X_test) #extretree regressor
model_1_predict=test_preds[:len(test_preds),:1]
model_5_predict=test_preds[:len(test_preds),1:2]
model_6_predict=test_preds[:len(test_preds),2:3]
model_7_predict=test_preds[:len(test_preds),3:4]
model_8_predict=test_preds[:len(test_preds),4:5]
# print(model_3_predict)

In [34]:
model_3_predict= model_3_predict.reshape(len(X_test), 1)
print(model_1_predict.shape,model_4_predict.shape,model_5_predict.shape,model_6_predict.shape,
      model_7_predict.shape,model_8_predict.shape,model_3_predict.shape)



(391834, 1) (391834,) (391834, 1) (391834, 1) (391834, 1) (391834, 1) (391834, 1)


In [69]:
print (np.sqrt(mean_squared_error(y_test, model_1_predict)))
# print (np.sqrt(mean_squared_error(y_test, model_2_predict)))
print (np.sqrt(mean_squared_error(y_test, model_3_predict)))
print (np.sqrt(mean_squared_error(y_test, model_4_predict)))
print (np.sqrt(mean_squared_error(y_test, model_5_predict)))
print (np.sqrt(mean_squared_error(y_test, model_6_predict)))
print (np.sqrt(mean_squared_error(y_test, model_8_predict)))

4901.040686846005
4891.540341919898
4871.772226731401
4898.943969963805
4902.870849181768
4902.851035641935


In [35]:
import warnings
train_with_meta = X_test
train_with_meta['model_1_predict'] = model_1_predict
train_with_meta['model_3_predict'] = model_1_predict
train_with_meta['model_4_predict'] = model_1_predict
train_with_meta['model_5_predict'] = model_1_predict
train_with_meta['model_6_predict'] = model_1_predict
train_with_meta['model_7_predict'] = model_1_predict
train_with_meta['model_8_predict'] = model_1_predict
train_with_meta['model_9_predict'] = model_1_predict
# train_with_meta['model_2_predict'] = model_1_predict
train_with_meta
warnings.simplefilter("ignore")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [36]:
train_with_meta

Unnamed: 0,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,User_ID,Age_0-17,Age_18-25,Age_26-35,...,Current_City_Years_3,Current_City_Years_4+,model_1_predict,model_3_predict,model_4_predict,model_5_predict,model_6_predict,model_7_predict,model_8_predict,model_9_predict
100465,1,0,1,2.0,999.0,1018,3471,0,0,1,...,0,0,14663.250000,14663.250000,14663.250000,14663.250000,14663.250000,14663.250000,14663.250000,14663.250000
343869,0,3,8,999.0,999.0,1069,4806,0,0,1,...,0,1,6852.403320,6852.403320,6852.403320,6852.403320,6852.403320,6852.403320,6852.403320,6852.403320
10504,1,7,6,10.0,999.0,3070,4005,0,0,0,...,1,0,10688.177734,10688.177734,10688.177734,10688.177734,10688.177734,10688.177734,10688.177734,10688.177734
452388,0,1,8,999.0,999.0,2898,3580,0,0,1,...,0,0,6335.572266,6335.572266,6335.572266,6335.572266,6335.572266,6335.572266,6335.572266,6335.572266
464201,1,7,6,999.0,999.0,2125,5365,0,0,0,...,0,0,11719.786133,11719.786133,11719.786133,11719.786133,11719.786133,11719.786133,11719.786133,11719.786133
166197,0,16,1,2.0,15.0,1183,1621,0,0,0,...,0,1,7417.380859,7417.380859,7417.380859,7417.380859,7417.380859,7417.380859,7417.380859,7417.380859
467905,1,4,1,2.0,5.0,530,45,0,0,1,...,1,0,8056.944824,8056.944824,8056.944824,8056.944824,8056.944824,8056.944824,8056.944824,8056.944824
203427,0,0,5,14.0,999.0,2674,1667,0,0,1,...,0,0,3620.463867,3620.463867,3620.463867,3620.463867,3620.463867,3620.463867,3620.463867,3620.463867
382399,0,1,8,999.0,999.0,1669,4696,0,0,1,...,0,0,6431.398438,6431.398438,6431.398438,6431.398438,6431.398438,6431.398438,6431.398438,6431.398438
75619,0,0,6,999.0,999.0,2272,5510,0,1,0,...,0,0,10179.659180,10179.659180,10179.659180,10179.659180,10179.659180,10179.659180,10179.659180,10179.659180


In [37]:
kfolds = KFold(train_with_meta.shape[0], n_folds=5)

In [51]:
dtrain = xgb.DMatrix(X_train.values, label=y_train, missing=np.nan)

param = {'objective': 'reg:linear', 'booster': 'gbtree', 'silent': 1,
         'max_depth': 10, 'eta': 0.1, 'nthread': 4, 'seed' = 0,
         'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 20,
         'max_delta_step': 0, 'gamma': 0}
num_round = 690


plst = list(params.items())
num_rounds = 1400
train_with_meta.reset_index(inplace=True)
train_with_meta.sort_index(inplace=True)

In [71]:
# for train_index, validation_index in kfolds:
    
#     train_X = train_with_meta.values[train_index, :]
#     validation_X= train_with_meta.values[validation_index, :]
#     train_y, validation_y = y_test[train_index], y_test[validation_index]
#     dtest = xgb.DMatrix(train_X, missing=np.nan)
#     watchlist = [(xgtrain, 'train')]
#     model_cv_xgboost = xgb.train(param, dtrain, num_round)
#     model_cv_predict = model_cv_xgboost.predict(xgb.DMatrix(validation_X))
#     print (np.sqrt(mean_squared_error(validation_y, model_cv_predict)))

In [47]:
train_with_meta
# xgtrain = xgb.DMatrix(train_with_meta, label=y_test)
# watchlist = [(xgtrain, 'train')]
# model_ss_xgboost = xgb.train(plst, xgtrain, num_rounds)

Unnamed: 0,index,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,User_ID,Age_0-17,Age_18-25,...,Current_City_Years_3,Current_City_Years_4+,model_1_predict,model_3_predict,model_4_predict,model_5_predict,model_6_predict,model_7_predict,model_8_predict,model_9_predict
0,0,0,10,3,999.0,999.0,672,0,1,0,...,0,0,8213.920898,8213.920898,8213.920898,8213.920898,8213.920898,8213.920898,8213.920898,8213.920898
1,5,1,1,2,3.0,15.0,3363,12,0,0,...,1,0,10217.560547,10217.560547,10217.560547,10217.560547,10217.560547,10217.560547,10217.560547,10217.560547
2,5,0,15,1,2.0,999.0,1831,2,0,0,...,1,0,14277.350586,14277.350586,14277.350586,14277.350586,14277.350586,14277.350586,14277.350586,14277.350586
3,6,1,7,1,8.0,17.0,1745,3,0,0,...,0,0,14038.875000,14038.875000,14038.875000,14038.875000,14038.875000,14038.875000,14038.875000,14038.875000
4,7,1,1,2,4.0,9.0,3577,12,0,0,...,1,0,8135.801270,8135.801270,8135.801270,8135.801270,8135.801270,8135.801270,8135.801270,8135.801270
5,7,1,7,1,15.0,999.0,3320,3,0,0,...,0,0,10960.817383,10960.817383,10960.817383,10960.817383,10960.817383,10960.817383,10960.817383,10960.817383
6,8,1,7,1,16.0,999.0,3604,3,0,0,...,0,0,11898.291992,11898.291992,11898.291992,11898.291992,11898.291992,11898.291992,11898.291992,11898.291992
7,9,1,20,8,999.0,999.0,2631,4,0,0,...,0,0,5311.163086,5311.163086,5311.163086,5311.163086,5311.163086,5311.163086,5311.163086,5311.163086
8,9,0,15,5,14.0,999.0,661,21,0,1,...,0,1,4662.635742,4662.635742,4662.635742,4662.635742,4662.635742,4662.635742,4662.635742,4662.635742
9,11,1,7,5,999.0,999.0,392,25,0,0,...,0,0,3507.032959,3507.032959,3507.032959,3507.032959,3507.032959,3507.032959,3507.032959,3507.032959


In [64]:
# model_ss_predict = model_ss_xgboost.predict(xgb.DMatrix(train_with_meta))

In [65]:
# np.max(model_ss_predict), np.min(model_ss_predict)