In [1]:
# Import the libraries for data handling and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import the Pipeline
from sklearn.pipeline import Pipeline

# Import the OneHotEncoder to convert all the categorical columns
from sklearn.preprocessing import OneHotEncoder

# Import the libraries for building the model
from sklearn.linear_model import LinearRegression

# Import the libraries for testing and metric measurements
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the prepared training frame; the first CSV column is used as the row index
final_df = pd.read_csv('final_data.csv', index_col=0)

# Load the prepared test frame the same way
final_test = pd.read_csv('final_test_data.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
# Print the column dtypes and memory usage of the loaded training frame
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2936211 entries, 0 to 2936210
Data columns (total 11 columns):
date                   object
date_block_num         int64
shop_id                int64
item_id                int64
item_price             float64
item_cnt_day           float64
item_category_id       int64
month                  int64
year                   int64
Shop Group             int64
Item Category Group    int64
dtypes: float64(2), int64(8), object(1)
memory usage: 268.8+ MB


In [4]:
# Columns that hold identifiers or groupings rather than quantities; they are
# treated as categorical further down the notebook
CATEGORICAL = [
    'shop_id',
    'item_id',
    'item_category_id',
    'month',
    'year',
    'Shop Group',
    'Item Category Group',
]

In [5]:
# Cast every column listed in CATEGORICAL to the pandas category dtype
# (each column gets its own set of categories)
final_df[CATEGORICAL] = final_df[CATEGORICAL].astype('category')

In [6]:
# Re-check the dtypes and memory usage now that the categorical columns are converted
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2936211 entries, 0 to 2936210
Data columns (total 11 columns):
date                   object
date_block_num         int64
shop_id                category
item_id                category
item_price             float64
item_cnt_day           float64
item_category_id       category
month                  category
year                   category
Shop Group             category
Item Category Group    category
dtypes: category(7), float64(2), int64(1), object(1)
memory usage: 135.2+ MB


In [7]:
# Keep only the modelling columns: the predictors plus the daily sales count
group_df = final_df[['shop_id', 'item_id', 'month', 'year', 'item_cnt_day']]

# Separate the target column from the predictors
y = group_df['item_cnt_day']
X = group_df.drop(columns='item_cnt_day')

# Hold out 20% of the rows with a fixed seed, stratified on month
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=group_df['month'],
)

In [8]:
# Initiate the linear regression model (a regressor, despite the variable name)
lrClassifier = LinearRegression()

# Initiate the OneHotEncoder. handle_unknown='ignore' encodes values that were
# not present when fitting as all-zero columns instead of raising, so held-out
# and final test rows containing ids unseen during training can still be scored.
encoder = OneHotEncoder(handle_unknown='ignore')

# Define the pipeline: one-hot encode every input column, then fit the regression
pipeline = Pipeline(steps=[
    ('encoder', encoder),
    ('lr', lrClassifier),
])

In [9]:
# Train the encoder and the regression on the training split. fit() returns the
# pipeline itself, so lrModel is another name for the same fitted object.
pipeline.fit(X_train, y_train)
lrModel = pipeline

In [10]:
# Score both splits with the fitted pipeline: training rows first, then held-out rows
predicted_train_values, predicted_test_values = (
    lrModel.predict(features) for features in (X_train, X_test)
)

In [11]:
# Training-split fit quality: RMSE from the stored predictions, R^2 from the pipeline
train_mse = mean_squared_error(y_train, predicted_train_values)
trainRMSE = np.sqrt(train_mse)
train_r2 = lrModel.score(X_train, y_train)

print('Training Data    \nRoot Mean Squared Error:', trainRMSE)
print('R^2:', train_r2)

Training Data    
Root Mean Squared Error: 2.5804664552462677
R^2: 0.0777424299099232


In [12]:
# Held-out-split fit quality: RMSE from the stored predictions, R^2 from the pipeline
test_mse = mean_squared_error(y_test, predicted_test_values)
testRMSE = np.sqrt(test_mse)
test_r2 = lrModel.score(X_test, y_test)

print('Test Data    \nRoot Mean Squared Error:', testRMSE)
print('R^2:', test_r2)

Test Data    
Root Mean Squared Error: 2.196463345720754
R^2: 0.10770182229461744


In [13]:
# Narrow the test frame to the four predictor columns, in the order used for training
final_test = final_test.loc[:, ['shop_id', 'item_id', 'month', 'year']]

In [14]:
# Make a prediction with the pipeline model
pipelineResult = lrModel.predict(final_test)

# Reuse the test frame's index so each prediction stays tied to its source row
# in the saved file instead of being renumbered from zero.
# NOTE(review): the model is fit on 'item_cnt_day' but the output column is
# labelled 'item_cnt_month' - confirm the predictions are on the intended scale.
pipeResult = pd.DataFrame(
    pipelineResult,
    columns=['item_cnt_month'],
    index=final_test.index,
)
pipeResult.to_csv('pipelineResult.csv')