In [78]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")

In [3]:
# Load the customer table and show the distinct user ids it contains.
customers = pd.read_csv("https://raw.githubusercontent.com/knkemree/python/master/Book1.csv")
# Bound to its own name on purpose: `user_id` is assigned further down as the
# column-name string handed to the recommenders, so storing this list under
# `user_id` would clobber that string whenever this cell is re-run out of order.
customer_ids = list(customers["user_id"].unique())
customer_ids

[1100, 6, 10, 8, 12, 1, 685, 1020, 1104, 1102, 1087, 737, 1101, 1103]

In [4]:
# Load the raw purchase log from the remote CSV into a DataFrame; the preview
# cell below shows it carries `user_id` and `product_id` columns.
transactions = pd.read_csv("https://raw.githubusercontent.com/knkemree/python/master/transactions.csv")

In [5]:
# Sanity check on the customer table: (rows, columns), then the first rows.
print(customers.shape)
customers.head()

(801, 1)


Unnamed: 0,user_id
0,1100
1,1100
2,1100
3,1100
4,1100


In [6]:
# Sanity check on the purchase log: (rows, columns), then the first rows.
print(transactions.shape)
transactions.head()

(2023, 2)


Unnamed: 0,user_id,product_id
0,1100.0,fe849f4f-904a-4e77-a0f8-05a7dadde04e
1,1100.0,5df057fc-9f95-4e93-8e84-d8d8b6313786
2,1100.0,beba627b-fad2-4d4f-aa8c-be0938bfa8a5
3,1100.0,3a2ed5a6-a99f-4556-9a3d-f3b2573c246f
4,1100.0,e9135d4c-e363-48c3-ba7b-7a18b3145f0a


In [7]:
# Collapse the purchase log into one row per (user, product) pair with the
# number of times that pair occurs.
#
# `product_id` already holds a single scalar per row, so there is nothing to
# explode: counting group sizes directly yields the same frame as the former
# apply(pd.Series) -> melt -> groupby/agg pipeline, without building one
# Series per row and without aggregating on a column that is also a group key.
data = (
    transactions[['user_id', 'product_id']]
    .dropna()                                   # rows missing either key cannot be counted
    .groupby(['user_id', 'product_id'])
    .size()
    .reset_index(name='purchase_count')
    .rename(columns={'product_id': 'productId'})
    # user_id is read as float (see the preview above); cast back to integers.
    .astype({'user_id': np.int64})
)

In [8]:
# Sanity check on the aggregated table. A bare `data.shape` in the middle of a
# cell is evaluated and discarded, so it is printed explicitly, like the
# previews of `customers` and `transactions`.
print(data.shape)
data.head()

Unnamed: 0,user_id,productId,purchase_count
0,1,10686,1
1,1,13,1
2,1,1560,1
3,1,2092,1
4,1,2226,1


In [9]:
# Persist the aggregated purchase counts next to the notebook. `index=False`
# is not passed, so the DataFrame index is written as the first column.
data.to_csv("pmpp_transaction.csv")

In [10]:
# Show the (user, product) pairs ordered from most to least purchased.
data.sort_values('purchase_count', ascending=False)

Unnamed: 0,user_id,productId,purchase_count
160,737,249,7
153,737,200,6
956,1100,23eb0740-b1da-455b-8105-ef628fd8adb9,6
262,737,422,5
316,737,86,5
...,...,...,...
613,1020,8ef5bf09-9c27-479e-a8f5-9e2d6b30c9a9,1
612,1020,8ef5bf09-2aec-4de2-8340-b5ecf57eefc6,1
611,1020,8ef5bf09-1827-4d5d-ab50-738ec8651201,1
610,1020,8ef5bf08-a727-4372-9108-4bde31ee2631,1


In [11]:
def split_data(data, test_size=.2, random_state=None):
    '''
    Splits dataset into training and test set.

    Args:
        data (pandas.DataFrame): rows to split.
        test_size (float or int): forwarded to ``train_test_split``; the
            default of 0.2 holds out 20% of the rows, as before.
        random_state (int or None): seed forwarded to ``train_test_split``.
            The default ``None`` keeps the previous unseeded behaviour, so the
            split differs between runs; pass an integer to get the same
            split on every run.

    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    # turicreate recommenders consume SFrames, so convert both halves here.
    return tc.SFrame(train), tc.SFrame(test)

In [12]:
# Split the aggregated purchase counts into train/test SFrames.
# NOTE(review): no seed is supplied, so the split -- and every score computed
# from it below -- changes from run to run.
train_data, test_data = split_data(data)

In [13]:
# Preview the held-out rows.
test_data

user_id,productId,purchase_count
1020,9bc3abf8-67a5-4718-8b9b-1 cd88c7b99fa ...,1
1098,7b236b39-576e-4a6d-8611-8 5ba71387ba5 ...,1
1098,d4c3d6f3-bf98-4acc- acc7-0d87a00336aa ...,1
1101,adc4b06b-b15e-4387-9479-f bca2dc85799 ...,1
1103,8ed56fcd-eb54-449b-9ffd-9 572c1de024d ...,1
1101,3921b850-e11d-47c5-b0f7-5 c162ac33f69 ...,1
1101,6537a656-bf8b-41f7-95d8-9 25517a5be67 ...,1
737,2788,1
1020,9737,2
1114,8f03d7ab-012c-48db-a01f-9 f971c7c3695 ...,1


In [14]:
# Preview the training rows.
train_data

user_id,productId,purchase_count
1020,1ebe39c4-72a8-4d8b-a45c-2 145f5d96054 ...,1
737,2889,1
737,8748,1
1107,8ef77473-a065-4394-985e-4 d649b58f711 ...,2
1107,8f1e1161-3754-4694-9e8c-1 50de408482b ...,1
1020,4266,1
1100,e9135d4c-e363-48c3-ba7b-7 a18b3145f0a ...,3
1020,3831,1
1096,9bfa4e1d-299b-47f0-b5ed- ccb9626dbeea ...,1
8,6537a656-bf8b-41f7-95d8-9 25517a5be67 ...,1


In [56]:
# constant variables to define field names include:
user_id = 'user_id'
item_id = 'productId'
users_to_recommend = list(customers[user_id].unique())
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [109]:
# Maps each supported `name` to the turicreate recommender module that builds
# it and the extra keyword arguments its `create` call needs.
_RECOMMENDER_SPECS = {
    'popularity': ('popularity_recommender', {}),
    'cosine': ('item_similarity_recommender', {'similarity_type': 'cosine'}),
    'pearson': ('item_similarity_recommender', {'similarity_type': 'pearson'}),
    'jaccard': ('item_similarity_recommender', {'similarity_type': 'jaccard'}),
    'ranking_factorization': ('ranking_factorization_recommender', {}),
    'factorization': ('factorization_recommender', {}),
}


def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    '''
    Fits one turicreate recommender and prints its recommendations.

    Args:
        train_data (tc.SFrame): interactions to fit on.
        name (str): which recommender to build; one of 'popularity', 'cosine',
            'pearson', 'jaccard', 'ranking_factorization', 'factorization'.
        user_id (str): name of the user column in train_data.
        item_id (str): name of the item column in train_data.
        target (str): name of the column holding the value to model.
        users_to_recommend (list): users to produce recommendations for.
        n_rec (int): number of items recommended per user.
        n_display (int): number of recommendation rows to print.

    Returns
        The fitted turicreate recommender.

    Raises
        ValueError: if name is not one of the supported recommenders.
    '''
    # Reject unknown names up front; the former if/elif chain fell through and
    # failed later with an UnboundLocalError that did not name the bad value.
    if name not in _RECOMMENDER_SPECS:
        raise ValueError(
            'Unknown recommender name {!r}; expected one of {}'.format(
                name, sorted(_RECOMMENDER_SPECS)))

    module_name, extra_kwargs = _RECOMMENDER_SPECS[name]
    create = getattr(tc, module_name).create
    fitted = create(train_data,
                    user_id=user_id,
                    item_id=item_id,
                    target=target,
                    **extra_kwargs)

    recom = fitted.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return fitted

In [86]:
# Fit the popularity recommender on purchase counts and print its
# recommendations for the selected users.
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-------------------------------+--------------------+------+
| user_id |           productId           |       score        | rank |
+---------+-------------------------------+--------------------+------+
|   1100  |               86              |        5.0         |  1   |
|   1100  |              422              |        3.5         |  2   |
|   1100  |              200              |        3.5         |  3   |
|   1100  |              456              |        3.0         |  4   |
|   1100  |              250              |        3.0         |  5   |
|   1100  |              458              |        3.0         |  6   |
|   1100  |              459              |        3.0         |  7   |
|   1100  |               87              |        3.0         |  8   |
|   1100  |              423              |        2.5         |  9   |
|   1100  |              144              | 2.3333333333333335 |  10  |
|    6    |               86              |        5.0         |

In [87]:
# Fit the item-similarity recommender with cosine similarity on purchase
# counts and print its recommendations.
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-------------------------------+----------------------+------+
| user_id |           productId           |        score         | rank |
+---------+-------------------------------+----------------------+------+
|   1100  | 07e19476-ebc2-415c-bbef-88... | 0.14932230927727438  |  1   |
|   1100  | 59227abe-045b-4564-b998-a1... | 0.10780241001736034  |  2   |
|   1100  | f68dea18-76dc-463e-a3a2-27... | 0.10491615082278397  |  3   |
|   1100  | 86cee7f5-5ff1-4cf1-a980-dc... | 0.08381533622741699  |  4   |
|   1100  | ccb778dc-79d0-4aed-b358-1e... |  0.0652653511726495  |  5   |
|   1100  | 368bda2e-b213-4025-a90f-0d... | 0.06426885814377756  |  6   |
|   1100  | 9cadd6eb-777c-4d40-a186-07... | 0.05340827414483735  |  7   |
|   1100  | ca380b7f-37dd-44b0-9e0d-34... |  0.0529780342723384  |  8   |
|   1100  | fe849f4f-904a-4e77-a0f8-05... | 0.051143778092933426 |  9   |
|   1100  | bee36a53-75a7-4d73-8ed1-be... | 0.05098036654067762  |  10  |
|    6    |              102          

In [105]:
# Fit the item-similarity recommender with Pearson similarity on purchase
# counts and print its recommendations. The fitted model is bound to `pea`.
name = 'pearson'
target = 'purchase_count'
pea = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-------------------------------+--------------------+------+
| user_id |           productId           |       score        | rank |
+---------+-------------------------------+--------------------+------+
|   1100  |               86              |        5.0         |  1   |
|   1100  |              422              | 3.4972643102660323 |  2   |
|   1100  |              200              | 3.4972643102660323 |  3   |
|   1100  |              456              |        3.0         |  4   |
|   1100  |              458              |        3.0         |  5   |
|   1100  |              459              |        3.0         |  6   |
|   1100  |              250              | 2.9972643102660323 |  7   |
|   1100  |               87              | 2.9972643102660323 |  8   |
|   1100  |              423              | 2.4972643102660323 |  9   |
|   1100  |              144              | 2.4094863624283764 |  10  |
|    6    |               86              |        5.0         |

In [89]:
# Fit the item-similarity recommender with Jaccard similarity on purchase
# counts and print its recommendations.
name = 'jaccard'
target = 'purchase_count'
jac = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-------------------------------+---------------------+------+
| user_id |           productId           |        score        | rank |
+---------+-------------------------------+---------------------+------+
|   1100  | ca380b7f-37dd-44b0-9e0d-34... | 0.22941918987216373 |  1   |
|   1100  | ccb778dc-79d0-4aed-b358-1e... | 0.21654040163213556 |  2   |
|   1100  | f68dea18-76dc-463e-a3a2-27... | 0.21369047779025455 |  3   |
|   1100  | bee36a53-75a7-4d73-8ed1-be... |  0.208964644056378  |  4   |
|   1100  | 86cee7f5-5ff1-4cf1-a980-dc... | 0.20189393831021857 |  5   |
|   1100  | 3f30ca95-6e6e-4a43-aa76-8f... | 0.18549783302075934 |  6   |
|   1100  | 000051b3-1cbd-47d3-8e00-e2... | 0.18323111714738788 |  7   |
|   1100  | fe53696f-5592-433d-a8e2-48... | 0.18146344206549905 |  8   |
|   1100  | 368bda2e-b213-4025-a90f-0d... | 0.17433260877927145 |  9   |
|   1100  | 07e19476-ebc2-415c-bbef-88... | 0.17159090800718826 |  10  |
|    6    |              4414             |        

In [106]:
# Fit the ranking-factorization recommender on purchase counts and print its
# recommendations.
name = 'ranking_factorization'
target = 'purchase_count'
r_fac = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)


+---------+-------------------------------+---------------------+------+
| user_id |           productId           |        score        | rank |
+---------+-------------------------------+---------------------+------+
|   1100  |              419              |  1.4828988527286924 |  1   |
|   1100  | 3f30ca95-6e6e-4a43-aa76-8f... |  0.8090777610767759 |  2   |
|   1100  |              197              |  0.7274193977344907 |  3   |
|   1100  |              100              | 0.49497010459789403 |  4   |
|   1100  |              423              |  0.4863092516888059 |  5   |
|   1100  | 59227abe-045b-4564-b998-a1... | 0.48256882181057104 |  6   |
|   1100  | fe849f4f-904a-4e77-a0f8-05... |  0.4710294221867002 |  7   |
|   1100  |              418              | 0.46731700649150976 |  8   |
|   1100  | 8dd4935c-1124-40d9-8116-74... |  0.4208735441196836 |  9   |
|   1100  | b9aa48d7-1589-4580-ad0d-5b... |  0.4015895341862119 |  10  |
|    6    | eb379655-d3c3-42bd-95c0-c8... |  1.5803

In [112]:
# Fit the factorization recommender on purchase counts and print its
# recommendations.
name = 'factorization'
target = 'purchase_count'
fac = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)


+---------+-------------------------------+--------------------+------+
| user_id |           productId           |       score        | rank |
+---------+-------------------------------+--------------------+------+
|   1100  |              3585             | 5.529357693098871  |  1   |
|   1100  |              3584             | 5.5149948810566345 |  2   |
|   1100  |               86              | 4.663260719679681  |  3   |
|   1100  |              458              |  4.33412756671795  |  4   |
|   1100  | 8efb75a0-8832-4212-a0f9-54... | 4.304685733221857  |  5   |
|   1100  |             10416             |  4.29645564308056  |  6   |
|   1100  | 8ef77473-a065-4394-985e-4d... | 4.292603514098016  |  7   |
|   1100  | 8ed56fcd-e7c1-40ac-8081-65... | 4.291946074866144  |  8   |
|   1100  | 8ed56fcd-7cf7-48b6-80aa-c7... | 4.288718244932977  |  9   |
|   1100  |              3567             | 4.284606597327081  |  10  |
|    6    |               86              | 4.329146704577295  |

In [113]:
# Evaluate every fitted recommender on the held-out rows. The two lists are
# parallel: names_w_counts[i] labels models_w_counts[i].
# The Pearson model is bound to `pea` in its fitting cell; the previous
# reference to `pear` was never defined and raised NameError on a fresh kernel.
models_w_counts = [popularity, cos, pea, jac, r_fac, fac]
names_w_counts = ['Popularity Model on Purchase Counts', 
                  'Cosine Similarity on Purchase Counts', 
                  'Pearson Similarity on Purchase Counts',
                  'Jaccard Similarity on Purchase Counts',
                  'Ranking Factorization on Purchase Counts',
                  'Factorization on Purchase Counts']
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts

Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    | 0.13043478260869565 | 0.007155797101449276 |
|   2    | 0.13043478260869565 | 0.011241765480895918 |
|   3    | 0.13043478260869568 | 0.02036443628834933  |
|   4    | 0.13043478260869565 | 0.025449840015057405 |
|   5    | 0.12173913043478261 | 0.028547666102013928 |
|   6    | 0.13043478260869565 | 0.035001646903820814 |
|   7    | 0.13043478260869562 | 0.03950475249388293  |
|   8    | 0.13043478260869565 | 0.044590156220590996 |
|   9    |  0.1352657004830918 | 0.056073075475249395 |
|   10   | 0.13478260869565217 | 0.060723696593261815 |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.1208535533775505

Per User RMSE (best)
+---------+--------------

  + 'the next major release. Any passed parameters are ignored.')


+---------+--------------------+-------+
| user_id |        rmse        | count |
+---------+--------------------+-------+
|   1100  | 2.8014887050052817 |   20  |
+---------+--------------------+-------+
[1 rows x 3 columns]


Per Item RMSE (best)
+-----------+-----------------------+-------+
| productId |          rmse         | count |
+-----------+-----------------------+-------+
|   10415   | 0.0019279057752215056 |   1   |
+-----------+-----------------------+-------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+-----------+-------------------+-------+
| productId |        rmse       | count |
+-----------+-------------------+-------+
|    249    | 4.596765360582685 |   2   |
+-----------+-------------------+-------+
[1 rows x 3 columns]

PROGRESS: Evaluate model Cosine Similarity on Purchase Counts

Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------

In [114]:
# Refit the cosine item-similarity recommender on the complete purchase-count
# table (no train/test split), then print the first n_display rows of the
# top-n_rec recommendations for the selected users.
final_model = tc.item_similarity_recommender.create(
    tc.SFrame(data),
    user_id=user_id,
    item_id=item_id,
    target='purchase_count',
    similarity_type='cosine',
)
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+---------+-------------------------------+----------------------+------+
| user_id |           productId           |        score         | rank |
+---------+-------------------------------+----------------------+------+
|   1100  | 8b71484b-b0ba-4f86-87cc-25... | 0.036291509173637215 |  1   |
|   1100  |             10410             | 0.016541809536689937 |  2   |
|   1100  |             10401             | 0.016541809536689937 |  3   |
|   1100  |             10396             | 0.016541809536689937 |  4   |
|   1100  |             10277             | 0.016541809536689937 |  5   |
|   1100  |             10168             | 0.016541809536689937 |  6   |
|   1100  |             10164             | 0.016541809536689937 |  7   |
|   1100  |             10138             | 0.016541809536689937 |  8   |
|   1100  |             10122             | 0.016541809536689937 |  9   |
|   1100  |              101              | 0.016541809536689937 |  10  |
|    6    |               82          

In [115]:
# Convert the recommendations SFrame to a pandas DataFrame for export, then
# show its (rows, columns) and first rows.
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(140, 4)


Unnamed: 0,user_id,productId,score,rank
0,1100,8b71484b-b0ba-4f86-87cc-250c83e921ad,0.036292,1
1,1100,10410,0.016542,2
2,1100,10401,0.016542,3
3,1100,10396,0.016542,4
4,1100,10277,0.016542,5


In [154]:
# Write the final recommendations to a JSON file next to the notebook.
df_rec.to_json("recommended_products.json")