In [81]:
import pandas as pd
import numpy as np
import turicreate as tc


## Reduce The Memory Usage (Kaggle)

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [4]:
print('-' * 80)
print('transactions')
transactions = import_data("transactions.csv")

print('-' * 80)
print('articles')
customers = pd.read_csv("customers.csv",  delimiter=',')

print('-' * 80)
print('customer')
articles = import_data("articles.csv")



--------------------------------------------------------------------------------
transactions
Memory usage of dataframe is 1212.63 MB
Memory usage after optimization is: 436.77 MB
Decreased by 64.0%
--------------------------------------------------------------------------------
articles
--------------------------------------------------------------------------------
customer
Memory usage of dataframe is 20.13 MB
Memory usage after optimization is: 7.25 MB
Decreased by 64.0%


In [121]:
df_inner = pd.merge(transactions, articles, on='article_id', how='inner')
df_inner = pd.merge(df_inner, customers, on='customer_id', how='inner')

# Collobarative Filtering

### Basic model without preprocessing or model parameter selection
### Model is suggested by turicreate automaticaly

In [116]:
# lets do collobarative filtering

# basic model without preprocessing or model parameter selection
# model is suggested by turicreate automaticaly

# read data in SArray format
transactions_data = tc.SFrame.read_csv("transactions.csv")

#split train and test set
training_data, test_data = tc.recommender.util.random_split_by_user(transactions_data, 'customer_id', 'article_id')

#generate the model suggested automaticaly
model = tc.recommender.create(training_data, 'customer_id', 'article_id')

#make recommendations
results = model.recommend()

#evaluate the precision and recall 
eval_rec = model.evaluate(test_data)

#save the model
model.save("recommendations_automatic_suggest.model")

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,int,float,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------





Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.10821382007822686  | 0.03460330997703863 |
|   2    | 0.08083441981747069  | 0.05330868740854747 |
|   3    | 0.06823120382442417  | 0.06864411975832141 |
|   4    | 0.056714471968709254 | 0.07538340420041116 |
|   5    | 0.048761408083441954 | 0.07999686983082592 |
|   6    | 0.04411125597566277  | 0.08343824464845659 |
|   7    | 0.040975973179362996 | 0.08705899084417022 |
|   8    | 0.03715775749674058  | 0.08989331252758982 |
|   9    | 0.035057221497899416 | 0.09359704985641573 |
|   10   | 0.03350717079530637  | 0.09756082049866265 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]



# Preprocessing and  Model Optimization

In [122]:
#let's manipulate the data and model

# delete non frequent users and rare items from the data
# give side informatiton of users and items to the models
# try different models that work better in implicit data sets


big_data = df_inner.copy()

#rare items
a = pd.DataFrame(big_data.groupby(["article_id"]).count().t_dat < 20)
rare_articles = np.array(a[a.t_dat == True].index)
print(rare_articles.shape)

#non frequent users
b = pd.DataFrame(big_data.groupby(["customer_id"]).count().t_dat < 3)
rare_users = np.array(b[b.t_dat == True].index)
print(rare_users.shape)



(31585,)
(258955,)


## Delete non frequent users and rare items from the data

In [123]:
#delete the fon fraquent user and rare items

article_copy = articles.copy()
indexNames = article_copy[article_copy['article_id'].isin(rare_articles)].index
article_copy = article_copy.drop(indexNames)

customer_copy = customers.copy()
indexNames = customer_copy[customer_copy['customer_id'].isin(rare_users)].index
customer_copy = customer_copy.drop(indexNames)


In [124]:
# join the tables with new clean data

df_inner = pd.merge(transactions, article_copy, on='article_id', how='inner')
df_inner = pd.merge(df_inner, customer_copy, on='customer_id', how='inner')



In [125]:
df_inner.shape

(31172239, 32)

## Give side information of users and items to the models

In [77]:
df = pd.DataFrame(dict(customer_id=df_inner["customer_id"], article_id=df_inner["article_id"], price = df_inner["price"], 
                       prod_name = df_inner["prod_name"] , product_type_name = df_inner["product_type_name"], product_group_name = df_inner["product_group_name"] ,
                      graphical_appearance_name = df_inner["graphical_appearance_name"], colour_group_name = df_inner["colour_group_name"],
                      perceived_colour_value_name = df_inner["perceived_colour_value_name"], perceived_colour_master_name = df_inner["perceived_colour_master_name"],
                      department_name = df_inner["department_name"],index_name = df_inner["index_name"] , detail_desc = df_inner["detail_desc"],
                      club_member_status = df_inner["club_member_status"],fashion_news_frequency = df_inner["fashion_news_frequency"],age = df_inner["age"]))

In [83]:
df.to_csv('clean_data.csv') 


## Try different models that work better in implicit data sets

## Item Similarity Based Model

In [102]:
df = tc.SFrame.read_csv("clean_data.csv")

training_data, test_data = tc.recommender.util.random_split_by_user(df, 'customer_id', 'article_id',max_num_users=100000)


m = tc.item_similarity_recommender.create(training_data,
                                    user_id='customer_id',
                                    item_id='article_id')

recommendations = m.recommend(k = 5)


eval_rec = m.evaluate(test_data)

#The program's precision is then 5/8 (true positives / selected elements) 
#while its recall is 5/12 (true positives / relevant elements).

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,int,float,str,str,str,str,str,str,str,str,str,str,str,str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------



Precision and recall summary statistics by cutoff




+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.11775191236188502  | 0.031377962446044655 |
|   2    | 0.09005217678723175  | 0.04628957226884209  |
|   3    |  0.0750110177227938  | 0.05681731492053035  |
|   4    | 0.06497308527717395  | 0.06497319828907787  |
|   5    | 0.057852488431388405 | 0.07149062981367693  |
|   6    | 0.05251322126735273  | 0.07738980318829454  |
|   7    | 0.04837735925421445  | 0.08253251620390485  |
|   8    | 0.044955260175653974 | 0.08722525481498467  |
|   9    |  0.0421034406774338  | 0.09143228903172061  |
|   10   | 0.03968977240532521  | 0.09513760161743244  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]



## Matrix Factorization Based Model

In [114]:


df = tc.SFrame.read_csv("clean_data.csv")

df = df.dropna()

training_data, test_data = tc.recommender.util.random_split_by_user(df, 'customer_id', 'article_id',max_num_users=100000)


m = tc.ranking_factorization_recommender.create(training_data,
                                    user_id='customer_id',
                                    item_id='article_id')

recommendations = m.recommend(k = 10)


eval_rec = m.evaluate(test_data)

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,int,float,str,str,str,str,str,str,str,str,str,str,str,str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------



Precision and recall summary statistics by cutoff




+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0005988299783499932 | 0.0001350143930796732 |
|   2    | 0.0006333778617163371 | 0.0003409416727041624 |
|   3    | 0.0005873140172278768 | 0.0004947529141325089 |
|   4    | 0.0005614031047031195 | 0.0006394712793054712 |
|   5    | 0.0005458565571882689 | 0.0007824673177553706 |
|   6    | 0.0005393308458857283 | 0.0009037225380834232 |
|   7    |  0.00051328283858571  | 0.0010079752347723749 |
|   8    | 0.0005023838039522788 | 0.0011249811725955092 |
|   9    | 0.0004939067770151676 | 0.0011854134160454517 |
|   10   |  0.000481367174904421 |  0.00131110348418433  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]



# Check the Performance with Other Way 

In [126]:
#find last transaction of every customer who made more than 1 transactions

all_customers = customer_copy["customer_id"]

#sort transaction from recent date to old date

sorted_transactions= df_inner.sort_values(['t_dat'], ascending=False)



## Split the Data According to Customers' Last Transaction and Previous Transactions

In [130]:
# record last transactions of every customers

last_transaction = {}
for i in all_customers:
    last_transaction[i] = list()
    
delete_cust = []

for i,v in  sorted_transactions.iterrows():
    
    if v["customer_id"] not in last_transaction.keys():
        pass
    else:
        a = last_transaction[v["customer_id"]]
        if len(a) < 3:
            last_transaction[v["customer_id"]].append([i,v["article_id"], v["product_type_name"],v["colour_group_name"]])
        if not a:
            delete_cust.append(i)
            
last_transaction

{'00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657': [[16544064,
   568601043,
   'Blazer',
   'Dark Green'],
  [16544063, 890498002, 'Jacket', 'Greenish Khaki'],
  [16544061, 859416011, 'Sweater', 'Black']],
 '0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa': [[1768722,
   826211002,
   'Sweater',
   'Greenish Khaki'],
  [1768741, 811835004, 'Bikini top', 'Light Yellow'],
  [1768740, 811835004, 'Bikini top', 'Light Yellow']],
 '000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318': [[17,
   794321007,
   'Jacket',
   'Greyish Beige'],
  [11, 852643003, 'Sweater', 'Greenish Khaki'],
  [12, 750424014, 'Trousers', 'Light Blue']],
 '00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a': [[12493639,
   730683050,
   'Unknown',
   'Black'],
  [12493640, 927530004, 'Dress', 'Black'],
  [12493641, 791587015, 'T-shirt', 'Green']],
 '000064249685c11552da43ef22a5030f35a147f723d5b02ddd9fd22452b1f5a6': [[30101272,
   740962001,
   'Leggings/

In [141]:
delete_cust2 = []

for i in all_customers:
    if last_transaction[i] : 
        delete_cust2.append(last_transaction[i][0][0])

In [143]:
print(df_inner.shape)
df_inner_copy = df_inner.copy()
df_inner_copy = df_inner_copy.drop(delete_cust2)
print(df_inner_copy.shape)

(31172239, 32)
(30068998, 32)


## Run the Relatively Better Model

In [145]:
rec_data = pd.DataFrame(dict(customer_id = df_inner_copy["customer_id"], article_id = df_inner_copy["article_id"]))

rec_data.to_csv('rec_data.csv') 

# read data in SArray format
rec_data = tc.SFrame.read_csv("rec_data.csv")

#split train and test set
training_data, test_data = tc.recommender.util.random_split_by_user(rec_data, 'customer_id', 'article_id')

#generate the model suggested automaticaly
model = tc.recommender.create(training_data, 'customer_id', 'article_id')

#make recommendations
results = model.recommend()

#evaluate the precision and recall 
eval_rec = model.evaluate(test_data)

#save the model
model.save("recommendations_automatic_suggest_last.model")


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------





Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    | 0.11138310893512852 | 0.030403776418522828 |
|   2    | 0.08812729498164017 | 0.04382235209784236  |
|   3    | 0.07466340269277842 | 0.05517706993029129  |
|   4    | 0.06609547123623012 |  0.0625757703571584  |
|   5    |  0.0587515299877601 | 0.06686129559581348  |
|   6    | 0.05405956752345982 |  0.0746492995613915  |
|   7    | 0.04913446406714462 | 0.07875560535201423  |
|   8    | 0.04482864137086902 | 0.08162412599709791  |
|   9    | 0.04229566163470689 |  0.0856151873839871  |
|   10   | 0.03953488372093021 | 0.08826119498715503  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]



## Converting Implicit Data to Explicit (Just a Trial)

In [148]:
rec_data_exp = pd.DataFrame(dict(customer_id = df_inner_copy["customer_id"], article_id = df_inner_copy["article_id"], rating = [5]*len( df_inner_copy["article_id"]) ))

rec_data_exp.to_csv('rec_data_explicit.csv') 

# read data in SArray format
rec_data_exp = tc.SFrame.read_csv("rec_data_explicit.csv")

#split train and test set
training_data_exp, test_data_exp = tc.recommender.util.random_split_by_user(rec_data_exp, 'customer_id', 'article_id',max_num_users=200000)

#generate the model suggested automaticaly
model_exp = tc.recommender.create(training_data_exp,user_id='customer_id',
                                    item_id='article_id',target='rating')

#make recommendations
results_exp = model_exp.recommend(k=5)

#evaluate the precision and recall 
eval_rec_exp = model_exp.evaluate(test_data_exp)

#save the model
model.save("recommendations_automatic_suggest_last_exp.model")



------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.004601245762815621 |  0.001341387903804375 |
|   2    |  0.00446969418847972  | 0.0025799898545261677 |
|   3    |  0.004380973359276585 |  0.003731576032976013 |
|   4    | 0.0043274349278608045 |  0.004895169104419165 |
|   5    |  0.004240243768126655 |  0.005857060254933924 |
|   6    |  0.004180076769011683 |  0.006828283713885988 |
|   7    |  0.004163323246255812 |  0.007825066825303227 |
|   8    |  0.00411328120219784  |  0.008742497999397227 |
|   9    |  0.004055323725648926 |  0.009630279891422098 |
|   10   |  0.003983259297331339 |  0.010413498806835545 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.2765524305681815

Per User RMSE (best)
+-------------------------------+-----------

In [234]:
res = results.to_dataframe()


In [235]:
res


Unnamed: 0,customer_id,article_id,score,rank
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,699080001,0.014233,1
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,727808002,0.014166,2
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,458543001,0.010471,3
3,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,609719001,0.009801,4
4,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,852643004,0.009309,5
...,...,...,...,...
11029065,45a78e9b050490ff3a368eb8b1a5c2a02899518d54f04a...,563313006,0.031250,6
11029066,45a78e9b050490ff3a368eb8b1a5c2a02899518d54f04a...,680391007,0.030303,7
11029067,45a78e9b050490ff3a368eb8b1a5c2a02899518d54f04a...,671809010,0.026316,8
11029068,45a78e9b050490ff3a368eb8b1a5c2a02899518d54f04a...,550827002,0.025641,9


## Return Each Recommended Item's Category

In [171]:
# generate a hash table that store the each product's type
type_of = {}
for i,v in articles.iterrows():
    if v["article_id"] not in type_of.keys():
        type_of[v["article_id"]]= v["product_type_name"]
    else:
        pass



## Find most Recommended Category(Mode)

In [236]:
res['type'] = res.apply(lambda row : type_of[row[1]], axis=1)

In [239]:
mode = lambda x: x.mode() if len(x) > 2 else np.array(x)

In [240]:
las = res.groupby('customer_id')['type'].agg(mode).reset_index()

In [251]:
las

Unnamed: 0,customer_id,type
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,Trousers
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,Swimwear bottom
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,Bikini top
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,Underwear body
4,000064249685c11552da43ef22a5030f35a147f723d5b0...,Vest top
...,...,...
1102902,ffffa28cd7ab5d1cbbbfe7b582b1c419270cc0539f3dae...,Trousers
1102903,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,Bikini top
1102904,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,Trousers
1102905,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,Trousers


In [252]:

count = 0

for i,v in las.iterrows():
    if last_transaction[v["customer_id"]][0][2] in v["type"]:
        count +=  1
        
print(count/len(las))

0.2418118662770297


## Find Recommended Categories

In [258]:
sett = lambda x: set(x)

In [259]:
lass = res.groupby('customer_id')['type'].agg(sett).reset_index()

In [260]:
lass

Unnamed: 0,customer_id,type
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"{Jacket, Vest top, Sweater, Skirt, Dress, Trou..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"{Vest top, Bikini top, Swimwear bottom}"
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"{Sweater, Bikini top, Swimwear bottom}"
3,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"{Bikini top, Blouse, Underwear body, T-shirt, ..."
4,000064249685c11552da43ef22a5030f35a147f723d5b0...,"{Bra, Vest top, Leggings/Tights}"
...,...,...
1102902,ffffa28cd7ab5d1cbbbfe7b582b1c419270cc0539f3dae...,{Trousers}
1102903,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,"{Bra, Bikini top, Swimwear bottom, Underwear b..."
1102904,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,"{Trousers, Vest top, Bikini top, Shirt}"
1102905,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,"{Vest top, Trousers, T-shirt, Leggings/Tights}"


In [265]:
count2 = 0

for i,v in lass.iterrows():
    if last_transaction[v["customer_id"]][0][2] in v["type"]:
        count2 +=  1
        
print(count2/len(las))

0.485196847966329
