In [131]:
import warnings
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
warnings.filterwarnings("ignore")

In [140]:
# Load the order data (first CSV column becomes the index) and keep only
# rows without missing values.
df = pd.read_csv('new.csv', index_col=0).dropna()

In [141]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,City,State,Country,...,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Processing_Time_days,Returned,Income_Level,Gender
0,CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,AB-100151402,Aaron Bergman,Consumer,Oklahoma City,Oklahoma,United States,...,221.98,2,0.0,62.1544,40.77,High,2,0,High income,0
1,IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,JR-162107,Justin Ritter,Corporate,Wollongong,New South Wales,Australia,...,3709.395,9,0.1,-288.765,923.63,Critical,2,0,High income,0
2,IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,CR-127307,Craig Reiter,Consumer,Brisbane,Queensland,Australia,...,5175.171,9,0.1,919.971,915.49,Medium,1,0,High income,0
3,ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,KM-1637548,Katherine Murray,Home Office,Berlin,Berlin,Germany,...,2892.51,5,0.1,-96.54,910.16,Medium,2,0,High income,1
4,SG-2014-RH9495111-41948,2014-11-05,2014-11-06,Same Day,RH-9495111,Rick Hansen,Consumer,Dakar,Dakar,Senegal,...,2832.96,8,0.0,311.52,903.04,Critical,1,0,Low income,0


In [142]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51213 entries, 0 to 51289
Data columns (total 26 columns):
Order_ID                51213 non-null object
Order_Date              51213 non-null object
Ship_Date               51213 non-null object
Ship_Mode               51213 non-null object
Customer_ID             51213 non-null object
Customer_Name           51213 non-null object
Segment                 51213 non-null object
City                    51213 non-null object
State                   51213 non-null object
Country                 51213 non-null object
Region                  51213 non-null object
Market                  51213 non-null object
Product_ID              51213 non-null object
Category                51213 non-null object
Sub-Category            51213 non-null object
Product_Name            51213 non-null object
Sales                   51213 non-null float64
Quantity                51213 non-null int64
Discount                51213 non-null float64
Profit          

In [101]:
# Build a customer x product purchase matrix from the order lines.
# .assign returns a new frame, so the Count column is never written onto a
# slice of `df`.
purchased = df[['Customer_ID', 'Product_ID']].assign(Count=1)
Customer_ID = list(purchased.Customer_ID.unique())
Product_ID = list(purchased.Product_ID.unique())

data = purchased['Count'].tolist()
# With explicit categories, each ID's code is its position in the matching
# list, so matrix row i / column j correspond to Customer_ID[i] / Product_ID[j].
# pd.Categorical is used because the `astype('category', categories=...)`
# keyword form is deprecated and rejected by current pandas releases.
row = pd.Categorical(purchased.Customer_ID, categories=Customer_ID).codes
col = pd.Categorical(purchased.Product_ID, categories=Product_ID).codes
# csr_matrix sums duplicate (row, col) pairs, so a cell holds the number of
# order lines in which that customer bought that product.
sparse_matrix = csr_matrix((data, (row, col)), shape=(len(Customer_ID), len(Product_ID)))

In [105]:
# Expose the purchase counts as a frame indexed by customer with one column
# per product. DataFrame.sparse.from_spmatrix (pandas >= 0.25) wraps the SciPy
# matrix directly; pd.SparseDataFrame / pd.SparseSeries were removed in
# pandas 1.0, and building one SparseSeries per row looped over every
# customer in Python.
purchased_per_customer = pd.DataFrame.sparse.from_spmatrix(
    sparse_matrix, index=Customer_ID, columns=Product_ID
)

In [147]:
# Demographic data
# Explicit label -> code tables. Exact-match mapping replaces the chained
# substring str.replace calls, and integer codes keep the feature columns
# numeric instead of object-dtype strings.
# NOTE(review): these are arbitrary ordinal codes for nominal categories, so
# they impose an ordering/magnitude on the similarity computation -- confirm
# this is intended (one-hot encoding would avoid it).
SEGMENT_CODES = {'Consumer': 1, 'Corporate': 2, 'Home Office': 3}
MARKET_CODES = {'USCA': 1, 'Asia Pacific': 2, 'Europe': 3, 'Africa': 4, 'LATAM': 5}
INCOME_CODES = {
    'High income': 1,
    'Low income': 2,
    'Upper middle income': 3,
    'Lower middle income': 4,
}

# .assign builds a new frame, so nothing is written onto a slice of `df`.
demog = df[['Customer_ID', 'Gender', 'Segment', 'Market', 'Income_Level']].assign(
    Segment=lambda frame: frame.Segment.map(SEGMENT_CODES),
    Market=lambda frame: frame.Market.map(MARKET_CODES),
    Income_Level=lambda frame: frame.Income_Level.map(INCOME_CODES),
)
# .map yields NaN for any label missing from the tables; fail loudly here
# rather than letting NaN reach the similarity computation.
assert demog[['Segment', 'Market', 'Income_Level']].notna().all().all(), \
    'unmapped Segment / Market / Income_Level label'

# One demographic row per customer (first non-null value of each column).
demog_by_customer = demog.groupby('Customer_ID').first()


In [152]:
# Put demographic and purchase features side by side, keeping only the
# customers present in both frames.
new_df = pd.concat(
    objs=[demog_by_customer, purchased_per_customer],
    axis='columns',
    join='inner',
)

In [181]:
# Pairwise cosine similarity between customers, labelled by customer on
# both axes.
customer_labels = new_df.index
cosine = pd.DataFrame(
    cosine_similarity(new_df),
    index=customer_labels,
    columns=customer_labels,
)

In [236]:
# Similarity of the first customer to every other customer, best match first.
target_scores = cosine.iloc[0]
# The `< 1` filter alone does not reliably remove the customer's own entry:
# floating-point rounding can leave self-similarity just below 1, which would
# make the customer their own nearest neighbour. Drop it by label as well.
similarity = (
    target_scores[target_scores < 1]
    .drop(labels=target_scores.name, errors='ignore')
    .sort_values(ascending=False)
)

In [369]:
# Ten most similar customers and their scores.
similarity.head(10)

BO-11425102    0.902671
HL-5040134     0.894427
RP-1985558     0.885270
KC-16675144    0.868079
PG-18895130    0.866025
JR-57009       0.866025
CG-252060      0.866025
DB-13405130    0.866025
AH-1046527     0.866025
TB-2140058     0.861640
Name: AA-10315102, dtype: float64

In [260]:
# IDs of the ten customers with the highest similarity scores.
top_10 = similarity.head(10).index

In [323]:
# Inspect the single most similar customer and the products they bought.
top_1 = similarity.index[0]
is_top_1 = df.Customer_ID == top_1
purchased_history = df.loc[is_top_1, 'Product_ID'].values
print(top_1, purchased_history)

BO-11425102 ['OFF-SU-4327']


In [412]:
# Pool what the ten most similar customers purchased (unique, in the order
# first encountered).
top_10_purchase_history = []
for user in top_10:
    purchased_history = df.loc[df.Customer_ID == user, 'Product_ID'].values
    for item in purchased_history:
        if item not in top_10_purchase_history:
            top_10_purchase_history.append(item)

# Recommend the neighbours' products that the target customer has NOT bought.
# The earlier version iterated the target's own purchases instead, which
# yielded items the target already owns.
purchased_history_target = df.loc[df.Customer_ID == similarity.name, 'Product_ID'].values
already_bought = set(purchased_history_target)
recommend_items = [
    item for item in top_10_purchase_history if item not in already_bought
]
        
        

In [428]:
def recommender(user, top_n, similarity_matrix=None, orders=None):
    """Recommend products bought by the customers most similar to ``user``.

    Parameters
    ----------
    user : hashable
        Customer ID to recommend for; must be a row label of the similarity
        matrix.
    top_n : int
        Number of most-similar customers whose purchases are pooled.
    similarity_matrix : pandas.DataFrame, optional
        Square customer-by-customer similarity table labelled by customer ID
        on both axes. Defaults to the notebook-level ``cosine``.
    orders : pandas.DataFrame, optional
        Order lines with ``Customer_ID`` and ``Product_ID`` columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    list
        Product IDs bought by the neighbours but not by ``user``, in the
        order they were first encountered.

    Raises
    ------
    ValueError
        If ``user`` is not a row label of the similarity matrix.

    Notes
    -----
    Prints the pooled neighbour purchase history and the target's own
    purchase history before returning.
    """
    if similarity_matrix is None:
        similarity_matrix = cosine
    if orders is None:
        orders = df
    if user not in similarity_matrix.index:
        raise ValueError('{!r} is not in the similarity matrix'.format(user))

    scores = similarity_matrix.loc[user]
    # The `< 1` filter alone does not reliably exclude the customer's own
    # entry: rounding can leave self-similarity just below 1, making the
    # customer their own nearest neighbour. Drop it by label as well.
    similarity = (
        scores[scores < 1]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False)
    )
    neighbours = similarity.index[:top_n]

    # Pool the neighbours' purchases, unique and in first-seen order.
    top_n_purchase_history = []
    seen = set()
    for neighbour in neighbours:
        for item in orders.loc[orders.Customer_ID == neighbour, 'Product_ID'].values:
            if item not in seen:
                seen.add(item)
                top_n_purchase_history.append(item)
    print(top_n_purchase_history)

    purchased_history_target = orders.loc[orders.Customer_ID == user, 'Product_ID'].values
    print(purchased_history_target)

    already_bought = set(purchased_history_target)
    return [item for item in top_n_purchase_history if item not in already_bought]

In [429]:
recommender('BO-11425102', 10)

['OFF-SU-4327', 'OFF-LA-3273', 'OFF-LA-4693', 'OFF-ST-5702', 'TEC-MA-4190', 'OFF-AR-3477', 'FUR-TA-5069', 'OFF-ST-6024', 'OFF-BI-3723', 'OFF-PA-5853']
['OFF-SU-4327']


['OFF-LA-3273',
 'OFF-LA-4693',
 'OFF-ST-5702',
 'TEC-MA-4190',
 'OFF-AR-3477',
 'FUR-TA-5069',
 'OFF-ST-6024',
 'OFF-BI-3723',
 'OFF-PA-5853']