Model Deployment

In [3]:
#Importing necessary libraries
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Data Loading
# Read the raw transactions; the file is expected in the working directory.
df = pd.read_csv('onlineRetail.csv')

# Total Sales: per-row revenue = quantity * unit price
df['TotalSales'] = df['Quantity'] * df['UnitPrice']

# InvoiceDate: parse once (day/month/year hour:minute), then derive the
# calendar features from the parsed series.
invoice_ts = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M')
df['InvoiceDate'] = invoice_ts
df['Year'] = invoice_ts.dt.year
df['Month'] = invoice_ts.dt.month
df['Day'] = invoice_ts.dt.day
df['DayOfWeek'] = invoice_ts.dt.strftime('%A')      # weekday name
df['YearMonth'] = invoice_ts.dt.strftime('%Y-%m')   # e.g. "2011-03"

# Data Cleaning
# Drop exact duplicate rows, then every row that has a missing value in any
# column (this also removes rows without a CustomerID).
df = df.drop_duplicates().dropna()

# Keep only rows that are not stock code "M" and that have a strictly
# positive quantity and unit price.
# NOTE(review): "M" presumably marks manual entries -- confirm against the data.
valid_rows = (
    (df['StockCode'] != "M")
    & (df['Quantity'] > 0)
    & (df['UnitPrice'] > 0)
)
df = df[valid_rows].reset_index(drop=True)

# Remove Outliers by using Z-Score
# A row is kept only if Quantity, UnitPrice and TotalSales are all less than
# `threshold` standard deviations away from their column mean.
z_Score = np.abs(stats.zscore(df[['Quantity', 'UnitPrice', 'TotalSales']]))
threshold = 3
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
df = df[(z_Score < threshold).all(axis=1)].copy()

# Normalize continuous data (min-max scaling: (x - min) / (max - min))
numerical_Columns = ['Quantity', 'UnitPrice', 'Year']
column_min = df[numerical_Columns].min()
column_range = df[numerical_Columns].max() - column_min
x_normalized = (df[numerical_Columns] - column_min) / column_range
# Only Quantity and UnitPrice are written back; the scaled 'Year' stays in
# x_normalized and is not used further in this section.
# NOTE(review): column_min / column_range are not saved here -- confirm that
# the inference code applies the same scaling to incoming values.
df['Quantity'] = x_normalized['Quantity']
df['UnitPrice'] = x_normalized['UnitPrice']

# Data Selection
# Target: the TotalSales value of every row.
y = df['TotalSales'].values
# Features: everything except the invoice identifier, the date helper
# columns and the target itself.
excluded_columns = ['InvoiceNo', 'InvoiceDate', 'Day', 'DayOfWeek', 'YearMonth', 'TotalSales']
x_Data = df.drop(columns=excluded_columns)

# Latest invoice timestamp in the cleaned data; reference point for recency
last_date = df['InvoiceDate'].max()

# Recency, frequency and monetary values (RFM) per customer:
#   Recency   - whole days between the customer's latest invoice and last_date
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalSales
rfm = (
    df.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda dates: (last_date - dates.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalSales', 'sum'),
    )
    .reset_index()
)

# Lookup artifacts saved alongside the model:
#   - the feature column names, in training order
#   - a StockCode -> array of distinct descriptions mapping
#   - the distinct country values
column_names = x_Data.columns
stockcode_description = df.groupby('StockCode')['Description'].unique().to_dict()

lookup_artifacts = (
    ('column_names.pkl', column_names),
    ('stockcode_description.pkl', stockcode_description),
    ('country_list.pkl', df['Country'].unique()),
)
for pickle_path, payload in lookup_artifacts:
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

# Encoding the categorical data
# One LabelEncoder per categorical column, each fitted on that column of
# x_Data and used to replace it with integer codes.
le_stockcode = LabelEncoder()
le_description = LabelEncoder()
le_customerid = LabelEncoder()
le_country = LabelEncoder()

# column name -> (encoder, file the fitted encoder is saved to)
encoder_table = {
    'StockCode': (le_stockcode, 'label_encoder_stockcode.pkl'),
    'Description': (le_description, 'label_encoder_description.pkl'),
    'CustomerID': (le_customerid, 'label_encoder_customerid.pkl'),
    'Country': (le_country, 'label_encoder_country.pkl'),
}

for column, (encoder, _) in encoder_table.items():
    x_Data[column] = encoder.fit_transform(x_Data[column])

# Save the fitted LabelEncoders so the same mapping can be reloaded later
for encoder, pickle_path in encoder_table.values():
    with open(pickle_path, 'wb') as file:
        pickle.dump(encoder, file)

# Train Test Split: 80% train / 20% test, fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    x_Data, y, test_size=0.2, random_state=0
)

# Data Modeling using Random Forest Algorithm
# The grid holds a single candidate: the best estimator found in testing.
# GridSearchCV therefore cross-validates that one configuration (5 folds)
# and, with its default refit behaviour, refits it on all of x_train.
parameter_grid = dict(
    n_estimators=[300],
    max_depth=[20],
    min_samples_split=[2],
    max_features=[None],
)

# Train the model
base_forest = RandomForestRegressor(random_state=0)
model = GridSearchCV(base_forest, parameter_grid, cv=5, n_jobs=-1)
model.fit(x_train, y_train)

# Save the model (the fitted GridSearchCV wrapper)
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Customer segmentation: cluster customers on their RFM values
rfm_features = ['Recency', 'Frequency', 'Monetary']
X = rfm[rfm_features]

# Standardize the features before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Four clusters, 10 initialisations, fixed seed
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X_scaled)

# Save the Customer Segmentation model first, then the Scaler it was
# trained with
segmentation_artifacts = (
    (kmeans, 'customer_segmentation.pkl'),
    (scaler, 'scaler.pkl'),
)
for artifact, pickle_path in segmentation_artifacts:
    with open(pickle_path, 'wb') as file:
        pickle.dump(artifact, file)
