<a href="https://colab.research.google.com/github/kunalnischal7/CustomerChurnPrediction/blob/main/Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

## Import Libraries

In [1]:
import pandas as pd

## Import DataSet from google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd "/content/drive/My Drive/"

/content/drive/My Drive


## Load Data

In [4]:
file_path = 'Online Retail.xlsx'

In [5]:
df = pd.read_excel(file_path)

# Data Preparation

## Data Shuffling

In [6]:
from sklearn.utils import shuffle
df = shuffle(df, random_state = 44)

In [7]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

## Dropping Rows with Null Values

In [8]:
df.dropna(subset=['CustomerID'], inplace=True)

In [9]:
df.dropna(subset=['Description'], inplace=True)

In [10]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
534132,581133,22510,GINGHAM BABUSHKA DOORSTOP,8,2011-12-07 12:55:00,0.79,14904.0,United Kingdom
346910,567211,21931,JUMBO STORAGE BAG SUKI,2,2011-09-19 11:02:00,2.08,14159.0,United Kingdom
388159,570419,23483,HANGING BUTTERFLY T-LIGHT HOLDER,12,2011-10-10 13:33:00,1.25,15608.0,United Kingdom
134017,547817,22663,JUMBO BAG DOLLY GIRL DESIGN,3,2011-03-25 14:34:00,1.95,17946.0,United Kingdom
364256,568654,22577,WOODEN HEART CHRISTMAS SCANDINAVIAN,12,2011-09-28 12:20:00,0.85,14911.0,EIRE
...,...,...,...,...,...,...,...,...
461802,575952,22488,NATURAL SLATE RECTANGLE CHALKBOARD,2,2011-11-13 11:55:00,1.65,16015.0,United Kingdom
397396,571183,22951,60 CAKE CASES DOLLY GIRL DESIGN,2,2011-10-14 11:08:00,0.55,14796.0,United Kingdom
49723,540528,21448,12 DAISY PEGS IN WOOD BOX,2,2011-01-09 13:50:00,1.65,17800.0,United Kingdom
156845,550178,84569D,PACK 6 HEART/ICE-CREAM PATCHES,12,2011-04-14 17:25:00,1.25,12662.0,Germany


In [11]:
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

## Dropping Rows with Duplicate Values

In [12]:
duplicate_rows = df[df.duplicated()]
print("Duplicate Rows:")
print(duplicate_rows)

Duplicate Rows:
       InvoiceNo StockCode                         Description  Quantity  \
456051    575668     21790                  VINTAGE SNAP CARDS         1   
534739    581166     23240    SET OF 4 KNICK KNACK TINS DOILY          1   
428151    573414     22633              HAND WARMER UNION JACK         1   
325303    565434     23326       HANGING MINI COLOURED BOTTLES         1   
469299    576592     21533            RETROSPOT LARGE MILK JUG         1   
...          ...       ...                                 ...       ...   
489537    578017     23101       SILVER STARS TABLE DECORATION         1   
312126    564342     23247          BISCUIT TIN 50'S CHRISTMAS         1   
506319    579135     20983  12 PENCILS TALL TUBE RED RETROSPOT         2   
491808    578074     21328              BALLOONS  WRITING SET          1   
158746    550312     22460      EMBOSSED GLASS TEALIGHT HOLDER        12   

               InvoiceDate  UnitPrice  CustomerID         Country  
456

In [13]:
#Dropping Duplicate Values
df = df.drop_duplicates()

## Checking for Outliers

In [14]:
#Checking for outliers in the data
from scipy import stats

z_scores = stats.zscore(df['Quantity'])
outliers = (z_scores > 3) | (z_scores < -3)
outlier_rows = df[outliers]
print("Outlier Rows:")
print(outlier_rows)

Outlier Rows:
       InvoiceNo StockCode                         Description  Quantity  \
80983     543099     22985        WRAP, BILLBOARD FONTS DESIGN       800   
52711     540815     21108  FAIRY CAKE FLANNEL ASSORTED COLOUR      3114   
490502    578060         M                              Manual      1600   
230315    557135     22616          PACK OF 12 LONDON TISSUES        864   
166426    550917     84077   WORLD WAR 2 GLIDERS ASSTD DESIGNS      1008   
...          ...       ...                                 ...       ...   
63444     541570     21212     PACK OF 72 RETROSPOT CAKE CASES      1440   
250766    559047     15036           ASSORTED COLOURS SILK FAN      1200   
380505    569815    85099B             JUMBO BAG RED RETROSPOT      1000   
201149    554272     21977  PACK OF 60 PINK PAISLEY CAKE CASES      2700   
4850      536809     84950      ASSORTED COLOUR T-LIGHT HOLDER      1824   

               InvoiceDate  UnitPrice  CustomerID         Country  
80983

## Removing the Outliers

In [15]:
#Removing Outliers from the dataset
df = df[~outliers]

# Model Building

In [16]:
!pip install scikit-surprise



In [17]:
from surprise import Dataset, Reader, SVD

In [18]:
from surprise.model_selection import train_test_split

In [19]:
reader = Reader(rating_scale=(0,df['Quantity'].max()))
data = Dataset.load_from_df(df[['CustomerID', 'Description', 'Quantity']], reader)

## Train Test Split

In [20]:
#Train Test 80/20 Split
trainset, testset = train_test_split(data, test_size=0.2, random_state=44)

## Training the model

In [21]:
#Using SVD Values for predicting
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x78fc64e3cdf0>

# Evaluation

In [22]:
predictions = model.test(testset)

In [23]:
from surprise import accuracy

In [24]:
#Using rmse and mae to evaluate the model
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
rmse
mae

RMSE: 748.9977
MAE:  748.3779


748.377871057745

### Hyperparameter Tuning for better results

In [25]:
from surprise.model_selection import GridSearchCV

In [26]:
param_grid = {
    'n_factors': [10, 20, 30],
    'n_epochs': [10, 20, 30],
    'lr_all': [0.001, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.2]
}

In [27]:
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

In [28]:
grid_search.fit(data)

In [29]:
best_params = grid_search.best_params['rmse']

In [30]:
best_model = SVD(**best_params)
best_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x78fc72515570>

In [31]:
predictions = best_model.test(testset)

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 23.9344
MAE:  7.3424


#User Story

### Getting Customer Input

In [37]:
customer_id = int(input("Enter your Customer ID: "))
user_history = df[df['CustomerID'] == customer_id]
recommendations = user_history['StockCode'].value_counts().head(5).index.tolist()

Enter your Customer ID: 17850


## Recommendations

In [38]:
print("Recommended Items:")
for stock_code in recommendations:
    item_info = df[df['StockCode'] == stock_code].iloc[0]
    print(f"Item: {item_info['Description']}")
    print(f"Price: {item_info['UnitPrice']}")

Recommended Items:
Item: GLASS STAR FROSTED T-LIGHT HOLDER
Price: 4.95
Item: HAND WARMER RED RETROSPOT
Price: 2.1
Item: WHITE METAL LANTERN
Price: 3.75
Item: KNITTED UNION FLAG HOT WATER BOTTLE
Price: 4.25
Item: WOODEN FRAME ANTIQUE WHITE 
Price: 2.95
