In [78]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

In [66]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [67]:
pd.set_option("display.max_columns", 50)

In [68]:
def dummy_categories(df):
    
    '''
    Since a product can be categorized up to 3 categories, create dummy variables
    and identify which categories the products are in.
    '''
    
    cat1 = pd.get_dummies(df.Product_Category_1)
    cat2 = pd.get_dummies(df.Product_Category_2)
    cat3 = pd.get_dummies(df.Product_Category_3)
    
    cat1 = cat1.add(cat2, fill_value = 0).add(cat3, fill_value = 0).astype('int')
    
    if 19 not in cat1.columns:
        cat1[19] = 0
        cat1[20] = 0
    return cat1

In [69]:
def transform_categories(df):
    
    '''
    Create dummy categories and remove old product categories from data frame.
    '''
    
    cats = dummy_categories(df)
    df = df.drop(['Product_Category_1','Product_Category_2','Product_Category_3'], axis = 1)
    
    df = pd.concat([df,cats],axis = 1)
    
    le = LabelEncoder()
    df = df.apply(le.fit_transform)
    return df

In [70]:
train_labels = train.Purchase
train_features = train.drop(['Purchase'], axis = 1)

In [71]:
train_features = transform_categories(train_features)
test = transform_categories(test)

In [72]:
train_features.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,0,672,0,0,10,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,2376,0,0,10,0,2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,852,0,0,10,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,828,0,0,10,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
4,1,2734,1,6,16,2,4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [73]:
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels, test_size = 0.25, random_state = 111)

In [74]:
clf = LinearRegression()
clf.fit(train_features, train_labels)
clfpredict = clf.predict(test_features)

In [75]:
rmse = mean_squared_error(test_labels, clfpredict)
print(rmse)

13161425.4579


In [76]:
clf = DecisionTreeRegressor(min_samples_split = 200, random_state = 111)
clf.fit(train_features, train_labels)
clfpredict = clf.predict(test_features)

In [77]:
rmse = mean_squared_error(test_labels, clfpredict)
print(rmse)

7381714.20734


In [None]:
clf = RandomForestRegressor(n_estimators = 1000, random_state = 111)
clf.fit(train_features, train_labels)
clfpredict = clf.predict(test_features)

In [None]:
rmse = mean_squared_error(test_labels, clfpredict)
print(rmse)