In [1]:
# Import the libraries for data visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import the Pipeline
from sklearn.pipeline import Pipeline

# Import the OneHotEncoder to convert all the categorical columns; Note that this can only be done with int type columns
from sklearn.preprocessing import OneHotEncoder

# # Import the Imputer to fill all missing values
# from sklearn.preprocessing import Imputer

# Import the libraries for building the model
from sklearn.linear_model import LinearRegression

# Import the libraries for testing and metric measurements
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Read the prepared training data; the first CSV column is used as the index
final_df = pd.read_csv('final_data.csv', index_col=0)

# Read the prepared test data the same way
final_test = pd.read_csv('final_test_data.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
# Keep only the identifier columns and the daily sales count
interest_df = final_df[['shop_id', 'item_id', 'month', 'year', 'item_cnt_day']]

# Aggregate the daily counts into one row per (year, month, shop, item).
# The aggregate is a SUM, so after this step 'item_cnt_day' holds the total
# number of units for that month; this is the target the models are fit on.
group_df = interest_df.groupby(['year', 'month', 'shop_id', 'item_id'], as_index=False).sum()

# Clip the monthly totals to [0, 20]; values outside that range are treated as anomalies.
# The result is assigned back rather than calling clip(..., inplace=True) on the
# selected column: an in-place call on a column selection is chained assignment,
# which is not guaranteed to write through to group_df (with pandas copy-on-write
# it never does), silently leaving the target unclipped.
group_df['item_cnt_day'] = group_df['item_cnt_day'].clip(0, 20)

# Split the data into X (features) and y (target)
X = group_df.drop('item_cnt_day', axis=1)
y = group_df['item_cnt_day']

# Hold out 20% of the rows for evaluation, keeping the month distribution
# the same in both splits; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=group_df['month'])

In [4]:
# Keep only the model's feature columns, in the same order as the columns of X
final_test = final_test.loc[:, ['year', 'month', 'shop_id', 'item_id']]

In [5]:
# Summarise the aggregated frame: row count, per-column non-null counts, dtypes and memory use
group_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1609487 entries, 0 to 1609486
Data columns (total 5 columns):
year            1609487 non-null int64
month           1609487 non-null int64
shop_id         1609487 non-null float64
item_id         1609487 non-null int64
item_cnt_day    1609487 non-null float64
dtypes: float64(2), int64(3)
memory usage: 113.7 MB


In [6]:
def RMSE(x, y):
    """Return the root mean squared error between two sets of values.

    Parameters
    ----------
    x : array-like
        First set of values; the notebook passes the true targets here.
    y : array-like
        Second set of values; the notebook passes the model predictions here.

    Returns
    -------
    float
        Square root of ``mean_squared_error(x, y)``.
    """
    return np.sqrt(mean_squared_error(x, y))

In [7]:
# Linear-regression estimator. The variable keeps its existing "Classifier"
# name, but LinearRegression is a regressor.
lrClassifier = LinearRegression()

# One-hot encode every feature column. handle_unknown='ignore' makes the encoder
# emit an all-zero block for a category it did not see during fit instead of
# raising at predict time.
# NOTE(review): X_test / final_test presumably contain shop or item ids that are
# absent from X_train - confirm against the data.
encoder = OneHotEncoder(handle_unknown='ignore')

# Chain the encoder and the regressor so both are fit and applied together
lrpipeline = Pipeline(steps=[
    ('encoder', encoder),
    ('lr', lrClassifier),
])

In [8]:
# Fit the encoder + linear-regression pipeline on the training split.
# The fitted result is kept as lrModel and reused by the prediction and scoring cells.
lrModel = lrpipeline.fit(X_train, y_train)

In [9]:
# Predict on the training split first, then on the held-out split
predicted_train_values, predicted_test_values = (
    lrModel.predict(features) for features in (X_train, X_test)
)

In [10]:
# Report RMSE and R^2 of the linear-regression pipeline on the training split
trainRMSE = RMSE(y_train, predicted_train_values)
print('Training Data    \nRoot Mean Squared Error:', trainRMSE)

train_r2 = lrModel.score(X_train, y_train)
print('R^2:', train_r2)

Training Data    
Root Mean Squared Error: 2.0683230830555863
R^2: 0.35584898277305343


In [11]:
# Report RMSE and R^2 of the linear-regression pipeline on the held-out split
testRMSE = RMSE(y_test, predicted_test_values)
print('Test Data    \nRoot Mean Squared Error:', testRMSE)

test_r2 = lrModel.score(X_test, y_test)
print('R^2:', test_r2)

Test Data    
Root Mean Squared Error: 2.090890961125138
R^2: 0.34339069238455144


In [12]:
# Predict on the final test set with the fitted linear-regression pipeline
pipelineResult = lrModel.predict(final_test)

# Store the predictions in a single-column frame and write it to disk
lrpipeResult = pd.DataFrame({'item_cnt_month': pipelineResult})
lrpipeResult.to_csv('pipelineResult.csv')

Let's try the SGDRegressor (best for working with large datasets like this) and XGBRegressor from xgboost.

In [13]:
# Let's try the recommended algorithm for working with large datasets in regression according to:
# http://scikit-learn.org/stable/tutorial/machine_learning_map/

# Import SGDRegressor
from sklearn.linear_model import SGDRegressor

# One-hot encode every feature column. handle_unknown='ignore' makes the encoder
# emit an all-zero block for a category it did not see during fit instead of
# raising at predict time.
# NOTE(review): X_test / final_test presumably contain shop or item ids that are
# absent from X_train - confirm against the data.
encoder = OneHotEncoder(handle_unknown='ignore')

# SGD is a stochastic optimiser, so the seed is fixed to make the fit (and the
# metrics printed afterwards) reproducible. 21 matches the seed used for the
# train/test split; all other hyperparameters stay at their defaults.
sgd_reg = SGDRegressor(random_state=21)

# Chain the encoder and the regressor so both are fit and applied together
sgdPipeline = Pipeline(steps=[
    ('encoder', encoder),
    ('reg', sgd_reg),
])

In [14]:
# Fit the encoder + SGDRegressor pipeline on the training split.
# The fitted result is kept as sgdModel and reused by the prediction and scoring cells.
sgdModel = sgdPipeline.fit(X_train, y_train)



In [15]:
# Predict on the training split first, then on the held-out split
predicted_train_values, predicted_test_values = (
    sgdModel.predict(features) for features in (X_train, X_test)
)

In [16]:
# Report RMSE and R^2 of the SGD pipeline on the training split
trainRMSE = RMSE(y_train, predicted_train_values)
print('Training Data    \nRoot Mean Squared Error:', trainRMSE)

train_r2 = sgdModel.score(X_train, y_train)
print('R^2:', train_r2)

# Same two metrics on the held-out split
testRMSE = RMSE(y_test, predicted_test_values)
print('Test Data    \nRoot Mean Squared Error:', testRMSE)

test_r2 = sgdModel.score(X_test, y_test)
print('R^2:', test_r2)

Training Data    
Root Mean Squared Error: 2.3209058648989775
R^2: 0.18891570682426706
Test Data    
Root Mean Squared Error: 2.322461802992094
R^2: 0.18989479178500412


We can see the SGD model performs slightly worse than the linear model (test RMSE of about 2.32 versus 2.09). However, it is much faster.

In [17]:
# Predict on the final test set with the fitted SGD pipeline
pipelineResult = sgdModel.predict(final_test)

# Store the predictions in a single-column frame and write it to disk
sgdpipeResult = pd.DataFrame({'item_cnt_month': pipelineResult})
sgdpipeResult.to_csv('sgdpipelineResult.csv')

For both the linear regression and SGDRegressor algorithms, the score is 1.45. I don't expect that tuning the parameters would lower the score by an additional 0.45. As a result, I will try XGBoost here.

An additional thing to note is that the lower the predictions are, the higher the score is. It appears that the predicted values overestimate the real values. This is why clip(0, 20) yields a higher score than no clipping. Additionally, when fitting without grouping (i.e. directly on item_cnt_day), the model performs better.

In [18]:
# Import xgboost as xgb
import xgboost as xgb

# Gradient-boosted regressor with the linear (squared-error) regression objective.
# NOTE(review): recent xgboost releases name this objective 'reg:squarederror' and
# deprecate the 'reg:linear' alias - confirm against the installed version.
xg_reg = xgb.XGBRegressor(objective='reg:linear')

# One-hot encode every feature column. handle_unknown='ignore' makes the encoder
# emit an all-zero block for a category it did not see during fit instead of
# raising at predict time.
# NOTE(review): X_test / final_test presumably contain shop or item ids that are
# absent from X_train - confirm against the data.
encoder = OneHotEncoder(handle_unknown='ignore')

# Chain the encoder and the regressor so both are fit and applied together
xgbPipeline = Pipeline(steps=[
    ('encoder', encoder),
    ('reg', xg_reg),
])

In [19]:
# Fit the encoder + XGBRegressor pipeline on the training split.
# The fitted result is kept as xgbModel and reused by the prediction and scoring cells.
xgbModel = xgbPipeline.fit(X_train, y_train)

In [20]:
# Predict on the training split first, then on the held-out split
predicted_train_values, predicted_test_values = (
    xgbModel.predict(features) for features in (X_train, X_test)
)

In [21]:
# Report RMSE and R^2 of the XGBoost pipeline on the training split
trainRMSE = RMSE(y_train, predicted_train_values)
print('Training Data    \nRoot Mean Squared Error:', trainRMSE)

train_r2 = xgbModel.score(X_train, y_train)
print('R^2:', train_r2)

# Same two metrics on the held-out split
testRMSE = RMSE(y_test, predicted_test_values)
print('Test Data    \nRoot Mean Squared Error:', testRMSE)

test_r2 = xgbModel.score(X_test, y_test)
print('R^2:', test_r2)

Training Data    
Root Mean Squared Error: 2.4056053724645907
R^2: 0.12863580733924096
Test Data    
Root Mean Squared Error: 2.407892950538366
R^2: 0.12919961003012925


In [22]:
# Predict on the final test set with the fitted XGBoost pipeline
pipelineResult = xgbModel.predict(final_test)

# Store the predictions in a single-column frame and write it to disk
xgbpipeResult = pd.DataFrame({'item_cnt_month': pipelineResult})
xgbpipeResult.to_csv('xgbpipelineResult.csv')

XGBoost doesn't perform any better than SGDRegressor so I'll try some hyperparameter tuning here, particularly max_depth and learning_rate/eta.

In [23]:
# Wrap the training features and target in an xgboost DMatrix for use with xgb.cv.
# NOTE(review): X_train is passed here without the OneHotEncoder step that the
# pipelines apply, so the cross-validation searches run on the raw numeric columns
# rather than on the one-hot features xgbPipeline is fit on - confirm this is intended.
sales_dmatrix = xgb.DMatrix(X_train, y_train)

In [24]:
# Base booster parameters; max_depth is filled in for each candidate below
params = {'objective':'reg:linear'}

# Candidate tree depths
max_depths = [2, 5, 10]

# Cross-validated RMSE collected per candidate, in the same order as max_depths
rmse_scores = []

for val in max_depths:
    params['max_depth'] = val

    # 3-fold cross-validation with 5 boosting rounds, returned as a DataFrame
    cv_results = xgb.cv(params=params, dtrain=sales_dmatrix, num_boost_round=5,
                        nfold=3, metrics='rmse', as_pandas=True)

    # Keep the mean test RMSE of the last boosting round
    final_round_rmse = cv_results['test-rmse-mean'].iloc[-1]
    rmse_scores.append(final_round_rmse)

# Show depth vs. RMSE side by side
print(pd.DataFrame({'max_depth': max_depths, 'rmse': rmse_scores}))

   max_depth      rmse
0          2  2.564907
1          5  2.478322
2         10  2.299199


In [25]:
# Base booster parameters, using the depth that scored best in the max_depth search
params = {'objective':'reg:linear', 'max_depth': 10}

# Candidate learning rates (eta)
eta_vals = [0.01, 0.1, 0.2, 0.3]

# Cross-validated RMSE collected per candidate, in the same order as eta_vals
rmse_scores = []

# Iterate over the eta_vals list
for val in eta_vals:

    # Add the eta value to the parameters
    params['eta'] = val

    # 3-fold cross-validation with a fixed budget of 10 boosting rounds.
    # NOTE(review): with the round count held constant, small eta values have
    # had less opportunity to converge, which may favour the larger candidates.
    cv_results = xgb.cv(dtrain=sales_dmatrix, params=params, nfold=3,
                        num_boost_round=10, metrics='rmse', as_pandas=True)

    # Keep the mean test RMSE of the last boosting round
    rmse_scores.append(cv_results['test-rmse-mean'].tail(1).values[0])

# Print the scores; the first column holds the eta values that were searched
# (it was previously mislabelled 'num_rounds')
print (pd.DataFrame(list(zip(eta_vals, rmse_scores)), columns=['eta', 'rmse']))

   num_rounds      rmse
0        0.01  2.890134
1        0.10  2.424642
2        0.20  2.258828
3        0.30  2.174129


In [26]:
# Regressor using the max_depth and learning rate selected by the two
# cross-validation searches
xgb_reg = xgb.XGBRegressor(objective='reg:linear',
                           max_depth=10, learning_rate=0.3)

# One-hot encode every feature column. handle_unknown='ignore' makes the encoder
# emit an all-zero block for a category it did not see during fit instead of
# raising at predict time.
# NOTE(review): X_test / final_test presumably contain shop or item ids that are
# absent from X_train - confirm against the data.
encoder = OneHotEncoder(handle_unknown='ignore')

# Chain the encoder and the tuned regressor so both are fit and applied together
xgbPipeline = Pipeline(steps=[
    ('encoder', encoder),
    ('reg', xgb_reg),
])

In [27]:
# Fit the encoder + tuned XGBRegressor pipeline on the training split.
# This rebinds xgbModel, replacing the earlier default-parameter model.
xgbModel = xgbPipeline.fit(X_train, y_train)

In [28]:
# Predict on the training split first, then on the held-out split
predicted_train_values, predicted_test_values = (
    xgbModel.predict(features) for features in (X_train, X_test)
)

# Report RMSE and R^2 of the tuned XGBoost pipeline on the training split
trainRMSE = RMSE(y_train, predicted_train_values)
print('Training Data    \nRoot Mean Squared Error:', trainRMSE)

train_r2 = xgbModel.score(X_train, y_train)
print('R^2:', train_r2)

# Same two metrics on the held-out split
testRMSE = RMSE(y_test, predicted_test_values)
print('Test Data    \nRoot Mean Squared Error:', testRMSE)

test_r2 = xgbModel.score(X_test, y_test)
print('R^2:', test_r2)

Training Data    
Root Mean Squared Error: 1.9961284916948256
R^2: 0.4000322223128612
Test Data    
Root Mean Squared Error: 2.0470695581207794
R^2: 0.3706250342674622


In [29]:
# Tuning the parameters reduced the test RMSE by approximately 0.4

# Predict on the final test set with the tuned XGBoost pipeline
pipelineResult = xgbModel.predict(final_test)

# Store the predictions in a single-column frame and write it to disk
xgbpipeResult = pd.DataFrame({'item_cnt_month': pipelineResult})
xgbpipeResult.to_csv('xgbtuned_pipelineResult.csv')

At this point, I would like to try more complex algorithms for my model. First I will try SupportVectorRegression from sklearn.svm. However, since a lot of these more complex algorithms are computationally intensive, I will try to reduce the amount of values to work with, or I will only use a small section of the total data for training.

The following algorithms are too complex and computationally intensive to be used on the large dataset. As a result, I will be using a subset of the data for training. First we'll redefine the X, and y data then split it using 20% training data and 80% test data. This gives us approximately 320,000 entries for the training data.

The 20-80 split didn't work for the DecisionTreeClassifier, so I expect it not to work with the rest of the more complex algorithms. The 10-90 split does work, but the split leaves single entries out of the training data. I won't be exploring these algorithms any further

In [30]:
# # Import the DecisionTreeClassifier from sklearn.tree
# from sklearn.tree import DecisionTreeClassifier

# # Initiate the OneHotEncoder
# encoder = OneHotEncoder()

# # Instantiate the classifier with default parameters
# clf_dt = DecisionTreeClassifier()

# # Apply the classifier into the pipeline with the OneHotEncoder step
# dtPipeline = Pipeline(steps=[
#         ('encoder', encoder),
#         ('clf', clf_dt)])

In [31]:
# # Fit the pipeline with the training data
# dtModel = dtPipeline.fit(X_train, y_train)

In [32]:
# # Import the SupportVectorRegression function from SVM for polynomial regression
# from sklearn import svm

# # Initiate the SVR classifier with default parameters
# svr = svm.SVR()

In [33]:
# # Import the RandomForestRegressor from sklearn.ensemble
# from sklearn.ensemble import RandomForestRegressor

# # Instantiate the classifier with default parameters
# rf = RandomForestRegressor()

# # Apply the regressor into a pipeline with the OneHotEncoder step
# rfPipeline = Pipeline(steps=[
#         ('encoder', encoder),
#         ('rf', rf)])

In [34]:
# # Fit the pipeline with the training data
# rfmodel = rfPipeline.fit(X_train, y_train)