# Recommender System with Python and SQLite

Collaborative filtering 

When there is no information on the user (cold start) -> recommend the most popular items in the item category

The recommender system can be run once a week

In [1]:
import os
import random
import sqlite3
from collections import defaultdict

import implicit
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Keep rendered DataFrames compact: show at most 15 rows and 10 columns per output.
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 10)

### 1. Data cleaning and exploration

In [2]:
# Source workbook ("Online Retail") hosted on the UCI archive; it is read straight from the URL.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'

# Load the workbook into a DataFrame (one row per invoice line, see the preview below).
data = pd.read_excel(url)

data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [None]:
data.info()

# There are null values in CustomerID. We need to know who made each purchase in order to build a collaborative filtering model

In [3]:
# Keep only the invoice lines that carry a CustomerID, then re-check the column summary.
data = data.dropna(subset=['CustomerID'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      406829 non-null object
StockCode      406829 non-null object
Description    406829 non-null object
Quantity       406829 non-null int64
InvoiceDate    406829 non-null datetime64[ns]
UnitPrice      406829 non-null float64
CustomerID     406829 non-null float64
Country        406829 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [4]:
# Total quantity per product, best sellers first. Negative quantities are still
# part of the data at this point, so they are included in the sums.
# The aggregation is given as the string 'sum': passing the NumPy callable
# (np.sum) to .agg() is deprecated in recent pandas releases.
items = (
    data.groupby(['StockCode', 'Description'], as_index=False)
        .agg({'Quantity': 'sum'})
        .sort_values(by='Quantity', ascending=False)
)
items.head()

Unnamed: 0,StockCode,Description,Quantity
2712,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,53215
3586,85099B,JUMBO BAG RED RETROSPOT,45066
2818,84879,ASSORTED COLOUR BIRD ORNAMENT,35314
3593,85123A,WHITE HANGING HEART T-LIGHT HOLDER,34147
361,21212,PACK OF 72 RETROSPOT CAKE CASES,33409


In [5]:
items.tail()

# More abnormalities in the data: some items have a negative total quantity. We need to remove the rows with negative quantity

Unnamed: 0,StockCode,Description,Quantity
314,21144,PINK POODLE HANGING DECORATION,-12
3910,CRUK,CRUK Commission,-16
629,21645,ASSORTED TUTTI FRUTTI ROUND BOX,-24
3911,D,Discount,-1194
2723,84347,ROTATING SILVER ANGELS T-LIGHT HLDR,-1460


In [6]:
# Keep only lines with a positive quantity. The explicit .copy() makes the
# filtered frame independent, so the column assignment below does not raise
# SettingWithCopyWarning (or silently write to a temporary).
data = data[data['Quantity'] > 0].copy()
data['CustomerID'] = pd.to_numeric(data['CustomerID'], downcast = 'integer')

# Remove administrative charges such as postage or carriage:
# only stock codes longer than 4 characters are kept. The length is computed
# on the fly instead of through a temporary helper column.
data = data[data['StockCode'].astype(str).str.len() > 4]
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


### 2. Import data to SQLite

In [3]:
# sqlite3.connect() creates the database file when it is missing, but not the
# folder that contains it, so make sure the folder exists first.
dbPath = 'Database/ecommerce.sqlite'
os.makedirs(os.path.dirname(dbPath), exist_ok=True)

conn = sqlite3.connect(dbPath)
cur = conn.cursor()

In [None]:
# Create the target table on the first run; ID is a surrogate key filled in by SQLite.
cur.execute("""
            CREATE TABLE IF NOT EXISTS invoice (
            ID INTEGER PRIMARY KEY AUTOINCREMENT, 
            InvoiceNo INTEGER,
            StockCode VARCHAR NOT NULL,
            Description VARCHAR,
            Quantity INTEGER,
            InvoiceDate VARCHAR,
            UnitPrice FLOAT,
            CustomerID INTEGER NOT NULL,
            Country VARCHAR
            )
            """)

insertSql = """
            INSERT INTO invoice(
                                InvoiceNo,
                                StockCode,
                                Description,
                                Quantity,
                                InvoiceDate,
                                UnitPrice,
                                CustomerID,
                                Country)

            VALUES (?,?,?,?,?,?,?,?)
            """

# Columns are selected by name so the insert does not depend on column order.
invoiceColumns = ['InvoiceNo', 'StockCode', 'Description', 'Quantity',
                  'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

# Stream the rows as plain tuples and cast every value to a built-in Python
# type before binding. itertuples() walks the frame once, instead of building
# a new Series for every single field as data.iloc[i][k] did.
invoiceRows = (
    (int(invoiceNo), str(stockCode), str(description), int(quantity),
     str(invoiceDate), float(unitPrice), int(customerId), str(country))
    for (invoiceNo, stockCode, description, quantity,
         invoiceDate, unitPrice, customerId, country)
    in data[invoiceColumns].itertuples(index=False, name=None)
)

# One batched statement for all rows, committed as a single transaction.
cur.executemany(insertSql, invoiceRows)

conn.commit()

# NOTE: the table is only created IF NOT EXISTS and is never cleared, so
# re-running this cell appends the same rows again.
# In most cases, this step is not needed because the original data is already in a database

### 2. Calculating sparsity

In order for collaborative filtering to work, we need to make sure the sparsity level does not go over 99.5%

In [7]:
# The score is the number of distinct invoices on which a customer bought an item.
# NOTE: the raw count is used as-is; no conversion to a 1-5 scale happens in this cell.
recData = (
    data.groupby(['CustomerID', 'StockCode', 'Description'], as_index=False)
        .agg({'InvoiceNo': 'nunique'})
        .rename(columns={'InvoiceNo': 'score'})
)

In [8]:
# Customer x product table of scores; cells without any purchase are NaN.
# aggfunc is given as the string 'sum' because passing np.sum is deprecated
# in recent pandas releases.
pivotData = pd.pivot_table(recData, values = 'score', index = 'CustomerID',
                           columns = 'Description', aggfunc = 'sum')

# Sparsity = share of cells with no purchase = 1 - filled cells / all cells.
# count() gives the number of non-missing cells per column.
observedCells = int(pivotData.count().sum())
print(1 - observedCells / pivotData.size)

# The maximum recommended sparsity is 99.5%

0.9841981993871773


### 3. Collaborative filtering

In [9]:
# Create a sparse matrix

# Encode customers and products as integer category codes. These codes are the
# row/column indices of the sparse matrices below, and the 'user' / 'item'
# values the helper functions filter on.
recData['CustomerID'] = recData['CustomerID'].astype("category")
recData['StockCode'] = recData['StockCode'].astype("category")
recData['user'] = recData['CustomerID'].cat.codes
recData['item'] = recData['StockCode'].cat.codes

# The implicit library expects data as an item-user matrix so we
# create two matrices, one for fitting the model (item-user)
# and one for recommendations (user-item)
# NOTE(review): this orientation matches older implicit releases - confirm
# against the installed version.
# Fixed: the frame is called recData (rec_data was never defined -> NameError).
sparseItemUser = scipy.sparse.csr_matrix((recData['score'].astype(float), (recData['item'], recData['user'])))
sparseUserItem = scipy.sparse.csr_matrix((recData['score'].astype(float), (recData['user'], recData['item'])))

# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

# Calculate the confidence by multiplying it by alpha value, based on the methodologies of 
# http://yifanhu.net/PUB/cf.pdf
alphaVal = 15
dataConf = (sparseItemUser * alphaVal).astype('double')

#Fit the model
# Fixed: fit on dataConf (data_conf was never defined -> NameError).
model.fit(dataConf)


100%|██████████| 50.0/50 [00:04<00:00, 11.06it/s]


In [65]:
# Helper functions

def getUserItems(cid):
    """Return the distinct descriptions of the items bought by one user.

    cid is compared against the 'user' column of recData (the integer user
    code, not the original CustomerID). The result is the array returned by
    Series.unique(), with the descriptions cast to str.
    """
    boughtByUser = recData['user'] == cid
    return recData.loc[boughtByUser, 'Description'].astype('str').unique()
def getItemDescription(items):
    """Return the descriptions for an iterable of item codes.

    items holds values of the 'item' column of recData. For each code, in the
    given order, the distinct descriptions found in recData are appended to
    the result (an item that appears under several descriptions contributes
    each of them once). Unknown codes contribute nothing.

    recData has one row per (customer, item), so without de-duplication every
    description would be repeated once per customer who bought the item.
    """
    descriptions = []
    for item in items:
        matches = recData.loc[recData['item'] == item, 'Description'].astype('str')
        descriptions.extend(matches.unique().tolist())
    return descriptions

def getUserRecommendations(cid):
    """Return the model's recommendations for user code cid as {item: score}.

    cid is passed to model.recommend together with the user-item matrix.
    Each returned entry is read as an (item, score) pair.
    NOTE(review): that return shape depends on the installed implicit
    version - confirm before upgrading the library.
    """
    # Fixed: recommend for the requested user; the id was hard-coded to 6 before.
    recommended = model.recommend(cid, sparseUserItem)
    return {entry[0]: entry[1] for entry in recommended}

def getSimilarItems(item):
    """Return the 11 entries model.similar_items yields for item, as {item: score}.

    Each entry returned by the model is read as an (item, score) pair.
    """
    neighbours = model.similar_items(item, 11)
    return {neighbour[0]: neighbour[1] for neighbour in neighbours}



    
    
    


In [57]:
# Example for user code 6 (a value of recData['user'], not a CustomerID):
# first what the user already bought ...
print("User purchased:", getUserItems(6))

# ... then the descriptions of the items the model recommends.
print("Recomendations:", getItemDescription(getUserRecommendations(6).keys()))

User purchased: ['NOVELTY BISCUITS CAKE STAND 3 TIER' 'MINI CAKE STAND WITH HANGING CAKES'
 'CERAMIC CAKE STAND + HANGING CAKES' 'CERAMIC CAKE BOWL + HANGING CAKES']
Recomendations: ['MINI CAKE STAND  HANGING STRAWBERY', 'LARGE CAKE STAND  HANGING STRAWBERY', 'STRAWBERRY FAIRY CAKE TEAPOT', 'SWEETHEART CERAMIC TRINKET BOX', 'CERAMIC BOWL WITH STRAWBERRY DESIGN', 'CERAMIC CAKE DESIGN SPOTTED MUG', 'STRAWBERRY CERAMIC TRINKET POT', 'STRAWBERRY CERAMIC TRINKET BOX', 'CERAMIC STRAWBERRY CAKE MONEY BANK', 'CERAMIC STRAWBERRY DESIGN MUG', 'CERAMIC CAKE DESIGN SPOTTED PLATE']


In [67]:
# Example for item code 6 (a value of recData['item']): show its description,
# then the descriptions of the most similar items.
# Fixed: getItem() is not defined anywhere in the notebook; the existing
# getItemDescription helper provides the lookup.
print(getItemDescription([6]))

print("Recomendations:", getItemDescription(getSimilarItems(6).keys()))

['ASSTD DESIGN RACING CAR PEN']
None
Recomendations: ['ASSTD DESIGN RACING CAR PEN', 'CAMOUFLAGE LED TORCH', '3D DOG PICTURE PLAYING CARDS', '3D SHEET OF DOG STICKERS', 'NINJA RABBIT BLACK', 'BALLOON WATER BOMB PACK OF 35', 'NINJA RABBIT PINK', 'COLUMBIAN CANDLE RECTANGLE', 'CAMOUFLAGE EAR MUFF HEADPHONES', 'SILVER DIAMANTE PEN IN GIFT BOX', 'EIGHT PIECE SNAKE  SET']
