In [1]:
import pandas as pd
import numpy as np
import catboost

def scorer(y_true, y_pred, num_users=1079572):
    '''
        `y_true` and `y_pred` are dictionaries of type {user: items_list}
        
        `num_users` is the number of users in training set. 
        The scorer expects predictions for exactly `ceil(num_users*0.05)` users
        
        For private and public leaderboard evaluation:
            - for the track one scorer `num_users` is equal to 1079572
            - for the track two `num_users=100000`
    '''
    
    num_users_5p = np.ceil(0.05 * num_users)
    
    # Check everything is correct
    assert type(y_true) == type(y_pred) == dict, 'Need `y_pred` and `y_true` to be dictionaries.'
    assert len(y_pred) == num_users_5p, 'Found predictions for %d users, instead of %d.' % (len(y_pred), num_users_5p)
    assert np.all([len(x) == 5 for x in y_pred.values()]), 'Please, submit exactly 5 items per user.'
    
    # Compute score
    score = 0
    for user, items_pred in y_pred.items():
        items_true = y_true.get(user, [])
        score += len(set(items_true) & set(items_pred)) > 0

    return score / float(len(y_pred)) * 10000.0

In [2]:
train = pd.read_csv("train.csv")
train['interest'] = 1

train=train.drop_duplicates(['user_id','id3'])
test = train[train['date']>47]
train = train[train['date']<=47]

In [3]:
users = train[train['date']>=40]['user_id'].value_counts()
len(users)

465719

In [4]:
users = users.index.tolist()[0:53979]

In [5]:
train = train[train['user_id'].isin(users)]
test =  test[test['user_id'].isin(users)]

matrix = train.pivot(index='user_id', columns='id3', values='interest').fillna(0)

In [6]:
test

Unnamed: 0,id3,user_id,id2,date,id1,interest
43522747,860,543948,103,48,8,1
43522751,152,259718,21,48,7,1
43522776,741,918079,87,48,7,1
43522797,162,160288,51,48,9,1
43522814,744,1099279,22,48,7,1
43522831,162,259718,51,48,9,1
43522891,554,545261,22,48,7,1
43522899,597,169133,21,48,7,1
43522908,377,982473,56,48,9,1
43522961,652,379475,64,48,1,1


In [7]:
users_test = test.groupby('user_id')['id3'].apply(list)
users_test = users_test.to_dict()

In [8]:
users_test

{27: [876, 277, 451, 481, 920, 20, 545, 775, 853, 125],
 147: [586],
 151: [927, 586],
 194: [134, 685, 787, 6],
 216: [48, 679, 699, 700, 329],
 256: [554, 279],
 312: [700, 283, 299, 718],
 341: [429],
 390: [250, 901],
 404: [506, 680, 477, 241, 136, 704, 145],
 411: [329,
  415,
  284,
  295,
  310,
  223,
  586,
  662,
  610,
  109,
  392,
  860,
  530,
  41,
  595,
  714,
  151,
  281,
  612,
  725,
  136,
  843],
 423: [611, 882, 257, 896, 563],
 528: [861, 619],
 538: [635],
 574: [41, 329, 875, 680],
 587: [460, 60, 865, 700, 685, 711],
 632: [137, 278, 318, 809, 875, 567, 506],
 650: [29, 755, 348, 137, 32, 13],
 682: [716],
 690: [340,
  237,
  140,
  222,
  753,
  611,
  775,
  716,
  111,
  908,
  705,
  79,
  145,
  322,
  20,
  800,
  872,
  161,
  289,
  666,
  377,
  66,
  29,
  737,
  634,
  685,
  725,
  509,
  841,
  863,
  310,
  134,
  283,
  749,
  320,
  271,
  897,
  77,
  51,
  604,
  58,
  843,
  142,
  564],
 723: [356],
 744: [13, 800],
 757: [185, 774, 610

# CV

In [9]:
matrix

id3,0,1,2,3,4,5,6,7,8,9,...,921,922,923,924,925,926,927,928,929,930
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
205,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
matrix.as_matrix()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [11]:
users_ids = list(matrix.index)
users_items_pivot_matrix = matrix.as_matrix()

In [12]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, ignore_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.ignore_df = ignore_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=5):
        items_to_ignore = self.ignore_df[user_id]
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['id3'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)
        temp = np.array(recommendations_df.values.tolist())[:,0]
        return temp

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

In [None]:
for NUMBER_OF_FACTORS_MF in range(2,7):
    #Performs matrix factorization of the original user item matrix
    U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
    sigma = np.diag(sigma)

    users_ids = list(matrix.index)
    users_items_pivot_matrix = matrix.as_matrix()

    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
    cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = matrix.columns, index=users_ids).transpose()

    cols = matrix.columns
    bt = matrix.apply(lambda x: x > 0)
    bt = bt.apply(lambda x: list(cols[x.values]), axis=1)

    cf_recommender_model = CFRecommender(cf_preds_df,bt)

    result = pd.DataFrame(matrix.index)
    result['pred'] = result['user_id'].apply(cf_recommender_model.recommend_items)
    result = result.values.tolist()

    items = {}
    for line in result:
        line[1] = list(line[1])
        key, value = line[0], line[1]
        items[key] = value

    print(NUMBER_OF_FACTORS_MF, 'SCORE', scorer(users_test, items))

2 SCORE 1450.0083365753349
3 SCORE 1500.0277885844496
4 SCORE 1501.6951036514201
5 SCORE 1484.0956668333981
