In [1]:
import pandas as pd
import numpy as np
retail_dataframe = pd.read_excel("Online Retail.xlsx")
retail_dataframe.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


# Build the model for Germany

In [2]:
germany_retail_df = retail_dataframe[retail_dataframe['Country']=='Germany']

# Data cleaning and Data Preprocessing

1. Fill missing Description with default description title
2. Fill missing CustomerID with default customer ID (say 99999)
3. Convert CustomerID column to integer
4. Remove 'POST' StockCode from the dataset, it represents a POSTAGE and doesnt relate to a product
5. Filter cancelled transactions (InvoiceNo start with 'C')
6. Replace the outliars for Quantity, replace the 97 percentile value with the mean of quantity
7. Convert StockCode to string and remove spaces
8. Drop labels 'InvoiceNo', 'Country', 'InvoiceDate', 'Description' and 'UnitPrice'
9. Adjust the columns, move the CustomerID to the start of the dataset
10. Create a dataframe with the Customer, StockCode and Quantity.

In [3]:
def dataCleaning(germany_retail_df):
    germany_retail_df['CustomerID'].fillna(99999.0, inplace=True)
    germany_retail_df['Description'].fillna('Desc Missing', inplace=True)
    germany_retail_df['StockCode'] = germany_retail_df['StockCode'].astype(str)
    germany_retail_df['CustomerID']=germany_retail_df['CustomerID'].apply(np.int64) 
 
    germany_retail_df.StockCode = germany_retail_df.StockCode.str.replace(' ', '')
    german_cancelled_trans_df = germany_retail_df[germany_retail_df['InvoiceNo'].astype('str').str.startswith('C', na=False)]
    
    #Remove POSTAGE item 
    germany_retail_df = germany_retail_df.drop(germany_retail_df[germany_retail_df['StockCode'] == 'POST'].index, axis=0)
    
    return germany_retail_df

In [4]:
def getSaleData(german_dataset_df):
    #Remove Cancelled transcations from the dataset
    german_cancelled_trans_df = german_dataset_df[german_dataset_df['InvoiceNo'].astype('str').str.startswith('C', na=False)]
    german_sale_trans_df = german_dataset_df[~german_dataset_df['InvoiceNo'].isin(german_cancelled_trans_df.InvoiceNo)]
    return german_sale_trans_df

In [5]:
def dataPreprocessing(german_sale_trans_df):
    #Replace Quantity of items having qty > 299 with the mean()
    german_sale_trans_df['Quantity']=np.where(german_sale_trans_df['Quantity'] > 299, german_sale_trans_df.Quantity.mean(), german_sale_trans_df['Quantity'])
    
    german_sale_trans_df.drop(labels=['InvoiceNo','Country', 'InvoiceDate', 'Description', 'UnitPrice'], axis=1, inplace=True)
    front = german_sale_trans_df['CustomerID']
    german_sale_trans_df.drop(labels=['CustomerID'], axis=1,inplace = True)
    german_sale_trans_df.insert(0, 'CustomerID', front)

    return german_sale_trans_df

In [6]:
def createStockDescMapperDF(german_sale_trans_df):
    german_stock_desc_df = german_sale_trans_df[['StockCode', 'Description']]
    german_stock_desc_df.reset_index(inplace=True)
    german_stock_desc_df.drop(['index'], axis=1, inplace=True)

    test=german_stock_desc_df.groupby(['StockCode','Description']).count()
    german_stock_desc_df = test.reset_index()
    german_stock_desc_df.drop_duplicates(subset='StockCode', keep="first", inplace=True)
    german_stock_desc_df.reset_index(inplace=True)
    german_stock_desc_df.drop('index',axis=1, inplace=True)
    return german_stock_desc_df

In [7]:
def createCustomerStockQtyMapperDF(german_sale_trans_df):
    german_sale_trans_group_df = german_sale_trans_df.groupby(['CustomerID', 'StockCode']).sum().sort_values(by='Quantity', ascending=False)[['Quantity']]
    german_sale_trans_group_df.reset_index(inplace=True)
    return german_sale_trans_group_df

In [8]:
germany_retail_df = dataCleaning(germany_retail_df)
german_sale_trans_df = getSaleData(germany_retail_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus

In [9]:
german_sale_trans_df.groupby('Quantity').count().reset_index().sort_values(by='Quantity', ascending=False).head(10)

Unnamed: 0,Quantity,InvoiceNo,StockCode,Description,InvoiceDate,UnitPrice,CustomerID,Country
46,600,1,1,1,1,1,1,1
45,432,1,1,1,1,1,1,1
44,300,1,1,1,1,1,1,1
43,288,4,4,4,4,4,4,4
42,200,2,2,2,2,2,2,2
41,192,2,2,2,2,2,2,2
40,144,16,16,16,16,16,16,16
39,128,9,9,9,9,9,9,9
38,125,5,5,5,5,5,5,5
37,120,8,8,8,8,8,8,8


In [10]:
german_sale_trans_df.Quantity.describe()

count    8668.000000
mean       13.630480
std        17.880032
min         1.000000
25%         6.000000
50%        12.000000
75%        12.000000
max       600.000000
Name: Quantity, dtype: float64

In [11]:
#Create a copy of the sale transaction dataset
german_sale_trans_df_orig = german_sale_trans_df.copy()

In [12]:
german_sale_trans_df = dataPreprocessing(german_sale_trans_df)
german_stock_desc_df = createStockDescMapperDF(german_sale_trans_df_orig)
german_sale_trans_group_df = createCustomerStockQtyMapperDF(german_sale_trans_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [13]:
num_users = len(german_sale_trans_df.CustomerID.unique())
num_items = len(german_sale_trans_df.StockCode.unique())
print('There are {} unique users and {} unique products in this data set'.format(num_users, num_items))

There are 94 unique users and 1664 unique products in this data set


# Popular stockCodes

In [14]:
def popularGermanItems():
    purchase_g = german_sale_trans_df_orig.groupby(['StockCode'])['Quantity'].sum().reset_index()
    purchase_g_sorted = purchase_g.sort_values("Quantity", ascending=False).reset_index().head(10)
    top_selling_stockcodes = purchase_g_sorted['StockCode'].head(10).tolist()
    return top_selling_stockcodes

# Content based filterring

Content based filterring uses the 'Description' of the product using the term frequency vectorizer to make recommendations

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(german_stock_desc_df['Description'])
doc_term_matrix = tfidf_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=tfidf.get_feature_names(), 
                  index=german_stock_desc_df['Description'])

from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
from sklearn.metrics.pairwise import cosine_similarity
indices = pd.Series(german_stock_desc_df.index, index=german_stock_desc_df['Description'])

In [16]:
# Function that takes in product description as input and outputs most similar products
def get_recommendations_content(stockCode, cosine_sim=cosine_sim):
    #Get the product desc from the stockcode
    desc = german_stock_desc_df[german_stock_desc_df['StockCode'] == stockCode]['Description'].tolist()[0]
    
    # Get the index of the movie that matches the description
    idx = indices[desc]

    # Get the pairwsie similarity scores of all products with that product
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the products based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar products
    sim_scores = sim_scores[1:11]

    # Get the product indices
    product_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar products
    return german_stock_desc_df[german_stock_desc_df['StockCode'].isin(german_stock_desc_df['StockCode'].iloc[product_indices])]

In [17]:
stockCode = '23292'
try:
    print('Product Recomendations for ',stockCode, '-', german_sale_trans_df_orig[german_sale_trans_df_orig['StockCode'] == stockCode][['Description']][-1:]['Description'].iloc[0],'\n')    
    print(get_recommendations_content(stockCode).to_string(index=False))
except:
    print('StockCode Not Recognised !!', '\n')
    print('Take a look at popular products.....', '\n')
    printProducts(popularItems('Germany'))

Product Recomendations for  23292 - SPACEBOY CHILDRENS CUP 

StockCode                       Description
    21238                 RED RETROSPOT CUP
    21239                PINK  POLKADOT CUP
    21240                 BLUE POLKADOT CUP
    22367   CHILDRENS APRON SPACEBOY DESIGN
    22975        SPACEBOY CHILDRENS EGG CUP
    22976  CIRCUS PARADE CHILDRENS EGG CUP 
    22977      DOLLY GIRL CHILDRENS EGG CUP
    23256       CHILDRENS CUTLERY SPACEBOY 
    23290           SPACEBOY CHILDRENS BOWL
    23291          DOLLY GIRL CHILDRENS CUP


Invalid or empty product code will make the model recommend popular products

In [46]:
stockCode = ''
try:
    print('Product Recomendations for ',stockCode, '-', german_sale_trans_df_orig[german_sale_trans_df_orig['StockCode'] == stockCode][['Description']][-1:]['Description'].iloc[0],'\n')    
    print(get_recommendations_content(stockCode).to_string(index=False))
except:
    print('StockCode Not Recognised !!', '\n')
    print('Take a look at popular products.....', '\n')
    printProducts(popularGermanItems())

StockCode Not Recognised !! 

Take a look at popular products..... 

22326 - ROUND SNACK BOXES SET OF4 WOODLAND 
15036 - ASSORTED COLOURS SILK FAN
20719 - WOODLAND CHARLOTTE BAG
21212 - PACK OF 72 RETROSPOT CAKE CASES
22585 - PACK OF 6 BIRDY GIFT TAGS
22629 - SPACEBOY LUNCH BOX 
22554 - PLASTERS IN TIN WOODLAND ANIMALS
22961 - JAM MAKING SET PRINTED
22423 - REGENCY CAKESTAND 3 TIER
16045 - POPART WOODEN PENCILS ASST


# Collaborative based filterring

We will implement item based and user based collaborative filterring to reccoment products to the customer

In [18]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [19]:
df_matrix_item = pd.pivot_table(german_sale_trans_group_df, values='Quantity', columns='CustomerID', index='StockCode').fillna(0)
df_matrix_item.head()

CustomerID,12426,12427,12468,12471,12472,12473,12474,12475,12476,12477,...,12738,13810,13811,13812,13813,13814,13815,13816,13817,14335
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10125,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df_matrix_item.shape

(1664, 94)

In [21]:
df_matrix_item.loc[['10002']]

CustomerID,12426,12427,12468,12471,12472,12473,12474,12475,12476,12477,...,12738,13810,13811,13812,13813,13814,13815,13816,13817,14335
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df_matrix_user = pd.pivot_table(german_sale_trans_group_df, values='Quantity', columns='StockCode', index='CustomerID').fillna(0)
df_matrix_user.head()

StockCode,10002,10125,10135,11001,15034,15036,15039,15044A,15044B,15044D,...,90161D,90170,90173,90201A,90201B,90201C,90201D,90202D,90204,M
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df_matrix_user.shape

(94, 1664)

In [24]:
df_matrix_user.loc[[12426, 12468]]

StockCode,10002,10125,10135,11001,15034,15036,15039,15044A,15044B,15044D,...,90161D,90170,90173,90201A,90201B,90201C,90201D,90202D,90204,M
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
df_matrix_user.loc[[12426, 12468],'10002':'10135']

StockCode,10002,10125,10135
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12426,0.0,0.0,0.0
12468,0.0,0.0,0.0


In [26]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

In [27]:
def make_recommendation_collab(model_knn, data, fav_product, index_value, n_recommendations):
    '''return top n similar product recommendations based on user's input product code
    Parameters
    ----------
    model_knn: sklearn model, knn model
    data: movie-user matrix
    fav_product: str, name of product code
    index_value: index value of the stockCode in the sparse matrix
    n_recommendations: int, top n recommendations
    Return
    ------
    list of top n similar product recommendations
    '''
    # fit
    model_knn.fit(data)
    distances, indices = model_knn.kneighbors(data[index_value], n_neighbors=n_recommendations+1)
    raw_recommends =         sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    return raw_recommends

In [28]:
def getStockCodeIndex(productCode):
    try:
        stockCodeIndex = temp_list.index(productCode)
        return stockCodeIndex
    except:
        return -1;

In [29]:
def getCustomerIDIndex(custID):
    try:
        custIDIndex = temp_user_list.index(custID)
        return custIDIndex
    except:
        return -1;

In [30]:
def productsToRecommend(recommends):
    product_list = []

    threshold = 10
    for i in range(threshold):
            product_list.append(temp_list[recommends[i][0]])
    return product_list

In [31]:
def similarCustomers(recommends):
    customer_list = []
    distance_list = []
    threshold = 10
    for i in range(threshold):
            customer_list.append(temp_user_list[recommends[i][0]])
            distance_list.append(recommends[i][1])
    
    customer_dist_dict = dict(zip(customer_list, distance_list))
    return customer_dist_dict

In [32]:
def filter_top10(top10_collab, top10_content):
    primary_matches = list(set(top10_collab).intersection(set(top10_content)))
    print('Common stockCode: ', primary_matches)
    secondary_matches = [ele for ele in top10_collab if ele not in primary_matches]
    return primary_matches + secondary_matches

In [33]:
def printProducts(productList):
    for productCode in productList:
        print(productCode, '-', german_sale_trans_df_orig[german_sale_trans_df_orig['StockCode'] == productCode][['Description']][-1:]['Description'].iloc[0])

# Item based

In [34]:
temp_list=df_matrix_item.index.tolist()
cust_item_features = csr_matrix(df_matrix_item.values)

In [35]:
model_knn.fit(cust_item_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

The model will recommend the top 10 similar items applying item based collaborative filterring and content based (based on the item descriptioon).

The model will 1st select top 10 similar items based on the cosine similarity of the items (collaborative filterring) and follows it up with the top 10 similar items based on the content based similarity. The model then compares the two sets of top 10 recommendations and prepares the final recommendation list which contains common items from the collaborative and content based approaches and uncommon items from collaborative filterring.

In [36]:
productCode = '10125'
#productCode = '23290'
#productCode = '23292'
#productCode = '21094'
#productCode = '21086'
#productCode=''
stockIndex = getStockCodeIndex(productCode)
if(stockIndex == -1):
    print('StockCode not Recognised!!!','\n')
    print('Take a look at popular products.....','\n')
    printProducts(popularGermanItems())
else:
    recommends = make_recommendation_collab(
    model_knn=model_knn,
    data=cust_item_features,
    fav_product=productCode,
    index_value=stockIndex,
    n_recommendations=10)
    
    top10product_collab_list = productsToRecommend(recommends)
    top10product_content = get_recommendations_content(stockCode)
    top10product_content_list = top10product_content['StockCode'].tolist()
    prod_to_recommend = filter_top10(top10product_collab_list, top10product_content_list)
    if(len(prod_to_recommend) >= 1):
        print('Product Recomendations for ',productCode, '-', german_sale_trans_df_orig[german_sale_trans_df_orig['StockCode'] == productCode][['Description']][-1:]['Description'].iloc[0])
        print('\n')
        printProducts(prod_to_recommend)
    else:
        print('No products to recommend','\n')
        print('Take a look at popular products.....', '\n')
        printProducts(popularGermanItems())

Common stockCode:  []
Product Recomendations for  10125 - MINI FUNKY DESIGN TAPES


22433 - WATERING CAN GREEN DINOSAUR
21884 - CAKES AND BOWS GIFT  TAPE
21883 - STARS GIFT TAPE 
16011 - ANIMAL STICKERS
22708 - WRAP DOLLY GIRL
22610 - PENS ASSORTED FUNNY FACE
22502 - PICNIC BASKET WICKER SMALL
22608 - PENS ASSORTED FUNKY JEWELED 
22609 - PENS ASSORTED SPACEBALL
21882 - SKULLS TAPE


Invalid or empty product code will make the model recommend popular products

In [37]:
#productCode = '10125'
#productCode = '23290'
#productCode = '23292'
#productCode = '21094'
#productCode = '21086'
productCode=''
stockIndex = getStockCodeIndex(productCode)
if(stockIndex == -1):
    print('StockCode not Recognised!!!','\n')
    print('Take a look at popular products.....','\n')
    printProducts(popularGermanItems())
else:
    recommends = make_recommendation_collab(
    model_knn=model_knn,
    data=cust_item_features,
    fav_product=productCode,
    index_value=stockIndex,
    n_recommendations=10)
    
    top10product_collab_list = productsToRecommend(recommends)
    top10product_content = get_recommendations_content(stockCode)
    top10product_content_list = top10product_content['StockCode'].tolist()
    prod_to_recommend = filter_top10(top10product_collab_list, top10product_content_list)
    if(len(prod_to_recommend) >= 1):
        print('Product Recomendations for ',productCode, '-', german_sale_trans_df_orig[german_sale_trans_df_orig['StockCode'] == productCode][['Description']][-1:]['Description'].iloc[0])
        print('\n')
        printProducts(prod_to_recommend)
    else:
        print('No products to recommend','\n')
        print('Take a look at popular products.....', '\n')
        printProducts(popularGermanItems())

StockCode not Recognised!!! 

Take a look at popular products..... 

22326 - ROUND SNACK BOXES SET OF4 WOODLAND 
15036 - ASSORTED COLOURS SILK FAN
20719 - WOODLAND CHARLOTTE BAG
21212 - PACK OF 72 RETROSPOT CAKE CASES
22585 - PACK OF 6 BIRDY GIFT TAGS
22629 - SPACEBOY LUNCH BOX 
22554 - PLASTERS IN TIN WOODLAND ANIMALS
22961 - JAM MAKING SET PRINTED
22423 - REGENCY CAKESTAND 3 TIER
16045 - POPART WOODEN PENCILS ASST


# User based

In [38]:
temp_user_list=df_matrix_user.index.tolist()
cust_user_features = csr_matrix(df_matrix_user.values)

In [39]:
model_knn.fit(cust_user_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

Customer ID 12468 has items which be recommended from more than one customer. The model will recommend items from the 1st nearest neighbour plus evaluates items from the 2nd nearest neighbour if the cosine distance is > 0.75.
The model will filter the top 10 items from the two neighbours and picks the common items plus the remaining items from the 1st nearest neighbour.

In [40]:
customerID = 12468 #Has an item from Customer 2 
#customerID = 12517
customerIDIndex = getCustomerIDIndex(customerID)
if(customerIDIndex == -1):
    print('Customer ID not Recognised!!!','\n')
    print('Take a look at popular products.....','\n')
    printProducts(popularGermanItems())
else:
    recommends = make_recommendation_collab(
    model_knn=model_knn,
    data=cust_user_features,
    fav_product=customerID,
    index_value=customerIDIndex,
    n_recommendations=10)
    
    similarCustomersDict = similarCustomers(recommends)
    print(similarCustomersDict)
    print('\n')
    dictEmpty = bool(similarCustomersDict)
    if(dictEmpty):
        most_similar_customer= list(similarCustomersDict.keys())[0]
        #Get top 10 items for this customer and we will recommend these to our target customer
        
        #Get the list of items (top 10) not purchased by target customer but most frequently purchased by similar customer 
        target_customer_item_list = german_sale_trans_group_df[german_sale_trans_group_df['CustomerID'] == customerID]['StockCode'].tolist()
        most_similar_cust_item_list = german_sale_trans_group_df[german_sale_trans_group_df['CustomerID'] == most_similar_customer]['StockCode'].tolist()

        items_not_purchased = [item for item in most_similar_cust_item_list if item not in target_customer_item_list] 
        
        cust_stock_item_df = german_sale_trans_group_df[(german_sale_trans_group_df['CustomerID'] == most_similar_customer) & (german_sale_trans_group_df['StockCode'].isin(items_not_purchased))].sort_values(by='Quantity', ascending=False)
        qty_mean = cust_stock_item_df['Quantity'].mean()
        
        #Get top 10 stockCodes sold more than the mean qty
        first_cust_top10_stockCodes = cust_stock_item_df[cust_stock_item_df['Quantity'] >= qty_mean]['StockCode'].head(10)#.to_string(index=False)
        print('1st Customer - Top 10 StockCodes')
        print(first_cust_top10_stockCodes.values)
        print('\n')
        #Check the next closest customer to the target customer and evaulated the item basket if the distance > .75
        try:
            if(list(similarCustomersDict)[1]):
                second_closest_customer = list(similarCustomersDict)[1]    
                distance = similarCustomersDict[second_closest_customer]
                if(distance > 0.75):
                    print('2nd Customer - Top 10 StockCodes')
                    second_most_similar_cust_item_list = german_sale_trans_group_df[german_sale_trans_group_df['CustomerID'] == second_closest_customer]['StockCode'].tolist()
                    second_cust_items_not_purchased = [item for item in second_most_similar_cust_item_list if item not in target_customer_item_list] 
                    second_cust_stock_item_df = german_sale_trans_group_df[(german_sale_trans_group_df['CustomerID'] == second_closest_customer) & (german_sale_trans_group_df['StockCode'].isin(second_cust_items_not_purchased))].sort_values(by='Quantity', ascending=False)
                    second_cust_qty_mean = second_cust_stock_item_df['Quantity'].mean()
                    
                    second_cust_top10_stockCodes = second_cust_stock_item_df[second_cust_stock_item_df['Quantity'] > second_cust_qty_mean]['StockCode'].head(10)#.to_string(index=False)
                    print(second_cust_top10_stockCodes.values)
                    print('\n')
                    prod_to_recommend = filter_top10(first_cust_top10_stockCodes, second_cust_top10_stockCodes)
                    if(len(prod_to_recommend) >= 1):
                        print('Product Recomendations for Customer - ',customerID)
                              
                        print('\n')
                        printProducts(prod_to_recommend)
                    else:
                        print('No products to recommend','\n')
                        print('Take a look at popular products.....', '\n')
                        printProducts(popularGermanItems())
        except IndexError:
            print('Invalid')
            
    else:
        print('No products to recommend','\n')
        print('Take a look at popular products.....', '\n')
        printProducts(popularGermanItems())

{13817: 0.8017804874263846, 12518: 0.7915542143349228, 12481: 0.7899492861238996, 12472: 0.7477686722481475, 12712: 0.7427780263723879, 12613: 0.698914685504644, 12474: 0.6798952512896485, 12705: 0.6572407385172263, 13816: 0.6461363521971923, 12645: 0.5578316115615872}


1st Customer - Top 10 StockCodes
['20712' '22333' '22555' '23433' '22551' '23475' '23474' '20981']


2nd Customer - Top 10 StockCodes
['22705' '22708' '22534' '22037' '22551' '22029' '23290' '23084' '16169E'
 '22544']


Common stockCode:  ['22551']
Product Recomendations for Customer -  12468


22551 - PLASTERS IN TIN SPACEBOY
20712 - JUMBO BAG WOODLAND ANIMALS
22333 - RETROSPOT PARTY BAG + STICKER SET
22555 - PLASTERS IN TIN STRONGMAN
23433 - HANGING QUILTED PATCHWORK APPLES
23475 - WOODLAND SMALL PINK FELT HEART
23474 - WOODLAND SMALL BLUE FELT HEART
20981 - 12 PENCILS TALL TUBE WOODLAND


Invalid or Empty Customer ID will recommend Popular StockCodes

In [42]:
#customerID = 12468 #Has an item from Customer 2 
customerID = ''
customerIDIndex = getCustomerIDIndex(customerID)
if(customerIDIndex == -1):
    print('Customer ID not Recognised!!!','\n')
    print('Take a look at popular products.....','\n')
    printProducts(popularGermanItems())
else:
    recommends = make_recommendation_collab(
    model_knn=model_knn,
    data=cust_user_features,
    fav_product=customerID,
    index_value=customerIDIndex,
    n_recommendations=10)
    
    similarCustomersDict = similarCustomers(recommends)
    print(similarCustomersDict)
    print('\n')
    dictEmpty = bool(similarCustomersDict)
    if(dictEmpty):
        most_similar_customer= list(similarCustomersDict.keys())[0]
        #Get top 10 items for this customer and we will recommend these to our target customer
        
        #Get the list of items (top 10) not purchased by target customer but most frequently purchased by similar customer 
        target_customer_item_list = german_sale_trans_group_df[german_sale_trans_group_df['CustomerID'] == customerID]['StockCode'].tolist()
        most_similar_cust_item_list = german_sale_trans_group_df[german_sale_trans_group_df['CustomerID'] == most_similar_customer]['StockCode'].tolist()

        items_not_purchased = [item for item in most_similar_cust_item_list if item not in target_customer_item_list] 
        
        cust_stock_item_df = german_sale_trans_group_df[(german_sale_trans_group_df['CustomerID'] == most_similar_customer) & (german_sale_trans_group_df['StockCode'].isin(items_not_purchased))].sort_values(by='Quantity', ascending=False)
        qty_mean = cust_stock_item_df['Quantity'].mean()
        
        #Get top 10 stockCodes sold more than the mean qty
        first_cust_top10_stockCodes = cust_stock_item_df[cust_stock_item_df['Quantity'] >= qty_mean]['StockCode'].head(10)#.to_string(index=False)
        print('1st Customer - Top 10 StockCodes')
        print(first_cust_top10_stockCodes.values)
        print('\n')
        #Check the next closest customer to the target customer and evaulated the item basket if the distance > .75
        try:
            if(list(similarCustomersDict)[1]):
                second_closest_customer = list(similarCustomersDict)[1]    
                distance = similarCustomersDict[second_closest_customer]
                if(distance > 0.75):
                    print('2nd Customer - Top 10 StockCodes')
                    second_most_similar_cust_item_list = german_sale_trans_group_df[german_sale_trans_group_df['CustomerID'] == second_closest_customer]['StockCode'].tolist()
                    second_cust_items_not_purchased = [item for item in second_most_similar_cust_item_list if item not in target_customer_item_list] 
                    second_cust_stock_item_df = german_sale_trans_group_df[(german_sale_trans_group_df['CustomerID'] == second_closest_customer) & (german_sale_trans_group_df['StockCode'].isin(second_cust_items_not_purchased))].sort_values(by='Quantity', ascending=False)
                    second_cust_qty_mean = second_cust_stock_item_df['Quantity'].mean()
                    
                    second_cust_top10_stockCodes = second_cust_stock_item_df[second_cust_stock_item_df['Quantity'] > second_cust_qty_mean]['StockCode'].head(10)#.to_string(index=False)
                    print(second_cust_top10_stockCodes.values)
                    print('\n')
                    prod_to_recommend = filter_top10(first_cust_top10_stockCodes, second_cust_top10_stockCodes)
                    if(len(prod_to_recommend) >= 1):
                        print('Product Recomendations for Customer - ',customerID)
                              
                        print('\n')
                        printProducts(prod_to_recommend)
                    else:
                        print('No products to recommend','\n')
                        print('Take a look at popular products.....', '\n')
                        printProducts(popularGermanItems())
        except IndexError:
            print('Invalid')
            
    else:
        print('No products to recommend','\n')
        print('Take a look at popular products.....', '\n')
        printProducts(popularGermanItems())

Customer ID not Recognised!!! 

Take a look at popular products..... 

22326 - ROUND SNACK BOXES SET OF4 WOODLAND 
15036 - ASSORTED COLOURS SILK FAN
20719 - WOODLAND CHARLOTTE BAG
21212 - PACK OF 72 RETROSPOT CAKE CASES
22585 - PACK OF 6 BIRDY GIFT TAGS
22629 - SPACEBOY LUNCH BOX 
22554 - PLASTERS IN TIN WOODLAND ANIMALS
22961 - JAM MAKING SET PRINTED
22423 - REGENCY CAKESTAND 3 TIER
16045 - POPART WOODEN PENCILS ASST
