# Predicting Transaction Amount from Customer Demographics

In [16]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor, LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import ParameterGrid

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
np.set_printoptions(precision=2)
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.set_option('display.max_columns', 100) # Increase num of columns so they don't get cut off on the screen
pd.options.display.max_colwidth=100

In [17]:
# p_d_sales joins transaction records and customer demographics into one table
p_d_sales = pd.read_csv('trans_demographic.csv', quotechar='"', index_col='BASKET_ID', 
                        dtype= {'DAY': int, 'SALES_VALUE': np.float64, 'TRANS_TIME': int, 
                                'WEEK_NO':int,'COUPON_DISC':int, 'SUM(QUANTITY)': int,
                                'AGE_DESC':str,'MARITAL_STATUS_CODE': str,'INCOME_DESC':str, 
                                'HOMEOWNER_DESC':str, 'HH_COMP_DESC':str,  'HOUSEHOLD_SIZE_DESC':str, 
                                'KID_CATEGORY_DESC':str})
demo = pd.read_table('demographics.csv', delimiter=',')  # Customer demographics only
demo = demo.drop('household_key', axis=1)  # Household key not needed
agg_no_gas = pd.read_table('agg_no_gas.csv', delimiter=',')  # Trasaction list- grocery items only

In [22]:
p_d_sales[:5]

Unnamed: 0_level_0,DAY,TRANS_TIME,WEEK_NO,COUPON_DISC,SUM(QUANTITY),AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC
BASKET_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1364,1,1520,1,0,5,65+,B,100-124K,Homeowner,Single Female,1,None/Unknown
1130,1,1340,1,0,9,55-64,U,25-34K,Renter,2 Adults Kids,3,1
98,1,1937,1,0,8,35-44,U,35-49K,Unknown,1 Adult Kids,2,1
1172,1,946,1,0,19,25-34,B,50-74K,Unknown,Single Male,1,None/Unknown
1060,1,1251,1,0,57,35-44,B,35-49K,Renter,2 Adults No Kids,2,None/Unknown


# Structure the Data for Analysis

In [20]:
# Make a target column, and a version without the target. The target column is needed for regression.
target = p_d_sales['SALES_VALUE']
p_d_sales = p_d_sales.drop('SALES_VALUE', axis=1)

s_dumm = pd.get_dummies(p_d_sales)

# Add a column of 1's
s_dumm['1'] = 1.0
# Reorder the columns so the ones come first
s = s_dumm.sort_index(axis=1)

# Rename the target and make it a matrix
y = target.as_matrix()
x = s.as_matrix()

### Split the data into train/test sets

In [23]:
# Split into 80/20 training and test data for training and testing sets and targets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=33)
s_names = list(s.columns.values)

# Regression

### Simple Linear Regression

In [40]:
# Fit the regression model
linreg = LinearRegression()
linreg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [41]:
# Predict the test set values
linreg.predict(x_test)

array([ 21.16,  42.25,  38.45, ...,  32.93,  19.74,  31.91])

In [42]:
# Compute RMSE on training data
p = linreg.predict(x_test)

# Create a vector of errors
err = abs(p-y_test)

In [43]:
# Dot product of error vector with itself gives us the sum of squared errors
total_error = np.dot(err,err)

# Compute RMSE
rmse_train = np.sqrt(total_error/len(p))
print rmse_train

37.475206567


In [46]:
# Compute RMSE using 10-fold cross-validation
kf = KFold(len(s), n_folds=10)
xval_err = 0
for train,test in kf:
    linreg.fit(x[train],y[train])
    # p = np.array([linreg.predict(xi) for xi in x[test]])
    p = linreg.predict(x[test])
    e = p-y[test]
    xval_err += np.dot(e,e)
    
rmse_10cv = np.sqrt(xval_err/len(s))

In [47]:
method_name = 'Linear Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)

Method: Linear Regression
RMSE on training: 37.4752
RMSE on 10-fold CV: 37.3340


## Ridge Regression

In [50]:
# Create linear regression object with a ridge coefficient 0.5
ridge = Ridge(fit_intercept=True, alpha=0.5)

# Train the model using the training set
ridge.fit(x,y)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [51]:
#  Predict transaction amount
p = ridge.predict(x)

# Compute RMSE on training data
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))

# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
xval_err = 0
for train,test in kf:
    ridge.fit(x[train],y[train])
    p = ridge.predict(x[test])
    e = p-y[test]
    xval_err += np.dot(e,e)
rmse_10cv = np.sqrt(xval_err/len(x))

method_name = 'Ridge Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)

Method: Ridge Regression
RMSE on training: 37.3021
RMSE on 10-fold CV: 37.3340


### Ridge Gridsearch

In [52]:
# Use an iterative search to find the best parameter values
params = {
    'alpha': [0.0001,0.005,0.001,0.005, 0.01,0.05, 0.1,0.5, 1, 2, 5, 10, 20, 40, 60]}
#print ParameterGrid(params)

ridge = Ridge()

gs = GridSearchCV(ridge, params, cv=5)  # Using 5-fold cross validation

gs.fit(x_train, y_train)
gs.best_params_, gs.best_score_

({'alpha': 20}, 0.051178103423076458)

### Ridge with Optimal Parameters

In [53]:
ridge = Ridge(fit_intercept=True, alpha=20)

# Train the model using the training set
ridge.fit(x,y)

# Compute RMSE on training data
p = ridge.predict(x)
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))

# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
xval_err = 0
for train,test in kf:
    ridge.fit(x[train],y[train])
    p = ridge.predict(x[test])
    e = p-y[test]
    xval_err += np.dot(e,e)
rmse_10cv = np.sqrt(xval_err/len(x))

method_name = 'Ridge Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)

Method: Ridge Regression
RMSE on training: 37.3021
RMSE on 10-fold CV: 37.3338


## SGD

In [54]:
# Scale the variables before performing SGD

scaler = StandardScaler()
scaler.fit(x)
x_s = scaler.transform(x)

sgdreg = SGDRegressor(penalty='l2', alpha=0.15, n_iter=200)

# Compute RMSE on training data
sgdreg.fit(x_s,y)
p = sgdreg.predict(x_s)
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))

# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
xval_err = 0
for train,test in kf:
    scaler = StandardScaler()
    scaler.fit(x[train])  # Don't cheat - fit only on training data
    xtrain_s = scaler.transform(x[train])
    xtest_s = scaler.transform(x[test])  # apply same transformation to test data
    sgdreg.fit(xtrain_s,y[train])
    p = sgdreg.predict(xtest_s)
    e = p-y[test]
    xval_err += np.dot(e,e)
rmse_10cv = np.sqrt(xval_err/len(x))

method_name = 'Stochastic Gradient Descent Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)

Method: Stochastic Gradient Descent Regression
RMSE on training: 37.4494
RMSE on 10-fold CV: 37.4453


### SGD- GridSearch

In [27]:
# Find the best combination of parameters to use for SGD
params = {
    'penalty': ['l1','l2'], 
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 2, 5, 10, 20],
    'n_iter': [200,]}

sgdreg = SGDRegressor()

gs = GridSearchCV(sgdreg, params, cv=5)

gs.fit(x_train, y_train)
gs.best_params_, gs.best_score_

({'alpha': 0.1, 'n_iter': 200, 'penalty': 'l2'}, 0.01867646608986127)

### Optimal SGD

In [56]:
sgdreg = SGDRegressor(penalty='l2', alpha=0.001, n_iter=200)

# Compute RMSE on training data
sgdreg.fit(x_s,y)
p = sgdreg.predict(x_s)
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))

# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
xval_err = 0
for train,test in kf:
    scaler = StandardScaler()
    scaler.fit(x[train])  # Don't cheat - fit only on training data
    xtrain_s = scaler.transform(x[train])
    xtest_s = scaler.transform(x[test])  # apply same transformation to test data
    sgdreg.fit(xtrain_s,y[train])
    p = sgdreg.predict(xtest_s)
    e = p-y[test]
    xval_err += np.dot(e,e)
rmse_10cv = np.sqrt(xval_err/len(x))

method_name = 'Stochastic Gradient Descent Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)

Method: Stochastic Gradient Descent Regression
RMSE on training: 37.4013
RMSE on 10-fold CV: 37.4205


## Lasso

In [124]:
lasso = Lasso(fit_intercept=True, alpha=0.5)

# Train the model using the training set
lasso.fit(x,y)

# Compute RMSE on training data
# p = np.array([ridge.predict(xi) for xi in x])
p = lasso.predict(x)
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))

# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
xval_err = 0
for train,test in kf:
    lasso.fit(x[train],y[train])
    p = lasso.predict(x[test])
    e = p-y[test]
    xval_err += np.dot(e,e)
rmse_10cv = np.sqrt(xval_err/len(x))

method_name = 'Ridge Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)

Method: Ridge Regression
RMSE on training: 37.8794
RMSE on 10-fold CV: 37.8864


### Lasso Gridsearch

In [None]:
params = {
    'alpha': [0.0001,0.005,0.001,0.005, 0.01,0.05, 0.1,0.5, 1, 2, 5, 10, 20, 40, 60]}
#print ParameterGrid(params)

lasso = Lasso()

gs = GridSearchCV(lasso, params, cv=5)

gs.fit(x_train, y_train)
gs.best_params_, gs.best_score_

### Lasso Optimal

In [125]:
lasso = Lasso(fit_intercept=True, alpha=0.001)

# Train the model using the training set
lasso.fit(x,y)

# Compute RMSE on training data
p = lasso.predict(x)
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))

# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
xval_err = 0
for train,test in kf:
    lasso.fit(x[train],y[train])
    p = lasso.predict(x[test])
    e = p-y[test]
    xval_err += np.dot(e,e)
rmse_10cv = np.sqrt(xval_err/len(x))

method_name = 'Lasso Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)

Method: Ridge Regression
RMSE on training: 37.3629
RMSE on 10-fold CV: 37.3862
