In [3]:
import pandas as pd
import numpy as np
import catboost

In [4]:
def scorer(y_true, y_pred, num_users=1079572):
    '''
        `y_true` and `y_pred` are dictionaries of type {user: items_list}
        
        `num_users` is the number of users in training set. 
        The scorer expects predictions for exactly `ceil(num_users*0.05)` users
        
        For private and public leaderboard evaluation:
            - for the track one scorer `num_users` is equal to 1079572
            - for the track two `num_users=100000`
    '''
    
    num_users_5p = np.ceil(0.05 * num_users)
    
    # Check everything is correct
    assert type(y_true) == type(y_pred) == dict, 'Need `y_pred` and `y_true` to be dictionaries.'
    assert len(y_pred) == num_users_5p, 'Found predictions for %d users, instead of %d.' % (len(y_pred), num_users_5p)
    assert np.all([len(x) == 5 for x in y_pred.values()]), 'Please, submit exactly 5 items per user.'
    
    # Compute score
    score = 0
    for user, items_pred in y_pred.items():
        items_true = y_true.get(user, [])
        score += len(set(items_true) & set(items_pred)) > 0

    return score / float(len(y_pred)) * 10000.0


In [5]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,id3,user_id,id2,date,id1
0,714,464300,34,1,4
1,714,915655,34,1,4
2,316,262696,42,1,2
3,52,354280,4,1,10
4,581,218912,14,1,10
5,590,1029729,63,1,9
6,279,14946,49,1,4
7,783,14946,58,1,6
8,613,638732,3,1,6
9,254,1072612,37,1,6


In [31]:
users = pd.read_csv("users.csv")
users

Unnamed: 0,user_id,id3,count,cat1,top_subcategory
0,241117,"[130, 670, 126, 382, 489, 716, 324, 486, 834, ...",19041,16,2
1,351837,"[902, 839, 51, 53, 763, 177, 255, 743, 767, 37...",17898,16,47
2,733408,"[160, 353, 862, 717, 105, 442, 147, 617, 707, ...",16963,16,47
3,1009457,"[308, 617, 577, 148, 60, 217, 715, 693, 128, 6...",16001,16,47
4,484840,"[760, 221, 777, 796, 348, 673, 479, 505, 223, ...",14016,16,92
5,125350,"[478, 916, 316, 549, 369, 114, 324, 861, 94, 7...",13304,16,92
6,247312,"[69, 411, 685, 255, 138, 72, 783, 647, 750, 66...",12843,16,2
7,558541,"[766, 860, 685, 485, 14, 458, 146, 150, 596, 2...",12818,16,2
8,785655,"[483, 656, 763, 920, 916, 795, 760, 704, 576, ...",12648,16,92
9,719043,"[650, 217, 200, 131, 671, 492, 800, 719, 119, ...",12097,16,92


In [7]:
users = users['user_id'][0:53979].values.tolist()

In [8]:
df = df[df['user_id'].isin(users)]
df

Unnamed: 0,id3,user_id,id2,date,id1
0,714,464300,34,1,4
3,52,354280,4,1,10
5,590,1029729,63,1,9
6,279,14946,49,1,4
7,783,14946,58,1,6
8,613,638732,3,1,6
9,254,1072612,37,1,6
10,397,327337,46,1,10
11,703,861305,16,1,9
12,324,1072292,39,1,12


In [9]:
df=df.drop_duplicates(['user_id','id3'])

In [10]:
df['interest'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
matrix = df.pivot(index='user_id', columns='id3', values='interest').fillna(0)

In [12]:
#matrix['user_id'] = matrix.index
matrix

id3,0,1,2,3,4,5,6,7,8,9,...,921,922,923,924,925,926,927,928,929,930
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
93,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
121,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
140,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
252,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
266,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [13]:
users_ids = list(matrix.index)
users_ids[:10]

[11, 27, 53, 93, 121, 140, 216, 239, 252, 266]

In [14]:
users_items_pivot_matrix = matrix.as_matrix()
users_items_pivot_matrix

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  1.,  0., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [16]:
U.shape

(53979, 15)

In [17]:
Vt.shape

(15, 919)

In [18]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [19]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ -2.23555886e-02,   2.24512944e-03,   1.19537321e-04, ...,
          9.86254856e-02,   2.43016302e-02,   2.49547628e-03],
       [  2.04698906e-01,   2.08298780e-01,   3.57883249e-04, ...,
          1.07217625e-01,   1.49955888e-01,   5.72911516e-02],
       [  6.10829876e-02,   1.12573511e-02,   4.91116519e-05, ...,
          1.45541984e-02,   9.71631552e-03,   2.19879212e-02],
       ..., 
       [  1.08628568e-01,   3.94348088e-02,  -5.02783414e-05, ...,
          6.64663467e-01,   3.79343909e-02,  -3.39668243e-02],
       [  3.01884226e-01,   9.68478237e-01,   5.14858232e-04, ...,
          1.07441866e+00,   1.07576161e+00,   1.20087607e-02],
       [  6.07399314e-01,   2.33255261e-02,   1.96207807e-04, ...,
         -3.27978529e-03,   4.03428287e-02,  -8.36255803e-03]])

In [20]:
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = matrix.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,11,27,53,93,121,140,216,239,252,266,...,1179300,1179311,1179337,1179338,1179365,1179384,1179385,1179422,1179427,1179507
id3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.022356,0.204699,0.061083,0.616705,0.559267,-0.004507,0.291274,0.022291,0.082079,-0.042036,...,0.00832,0.142547,0.557763,0.365021,0.01961,0.0381,0.151399,0.108629,0.301884,0.607399
1,0.002245,0.208299,0.011257,0.244826,1.116257,0.030124,0.175236,0.064769,0.080568,-0.014285,...,-0.019801,0.131972,1.153755,0.270568,0.032038,0.031943,1.021344,0.039435,0.968478,0.023326
2,0.00012,0.000358,4.9e-05,-0.000277,0.000504,-5.3e-05,0.000393,-5.1e-05,-9.9e-05,1.6e-05,...,7e-06,0.000331,0.000536,0.0005,0.000105,0.000376,-1.8e-05,-5e-05,0.000515,0.000196
3,0.124673,0.13059,0.897583,0.755859,0.821677,0.882137,-0.117296,1.017214,0.999385,0.907999,...,-0.041592,0.069724,0.870316,0.030071,0.71164,-0.017477,0.038756,0.525658,0.936035,0.101988
4,0.013087,-0.060766,-0.019849,0.659862,-0.009645,0.006145,0.041584,0.005934,0.031856,-0.005736,...,-0.029482,0.096554,-0.006639,0.018474,-0.033689,0.00863,0.033964,0.011658,0.009298,0.031456
5,-0.00148,0.002085,0.003284,0.000232,0.038358,0.002362,0.001438,0.003486,0.005524,-0.00069,...,0.000659,0.000138,0.039792,0.0031,0.000218,0.001227,0.002055,0.001139,0.034086,0.001484
6,0.122584,0.190254,0.778333,0.222309,0.865351,0.882019,-0.039943,0.968162,0.44456,0.561401,...,0.079581,-0.020327,0.869431,0.144502,0.503629,0.011041,0.059521,0.1636,1.07074,0.586283
7,0.013325,-0.015441,-0.015161,0.550994,0.002164,-0.028436,-0.005249,-0.014166,0.025356,0.003838,...,-0.019139,0.043226,0.00184,0.056042,0.006865,-0.009451,0.005394,-0.015808,0.022512,0.025757
8,0.046009,0.072265,-0.00218,0.691008,0.027711,0.012392,0.05102,0.042476,0.735942,0.162272,...,0.000407,0.023734,0.017551,-0.028494,-0.060133,0.004834,0.01577,0.770996,0.038266,0.04494
9,0.030749,0.133322,0.880243,0.697044,0.037494,0.902489,-0.041093,1.008778,1.023633,0.783202,...,-0.016678,0.03796,0.034455,-0.066861,0.527618,0.01103,0.017674,0.639381,0.112078,0.056345


In [47]:
cols = matrix.columns
bt = matrix.apply(lambda x: x > 0)
bt = bt.apply(lambda x: list(cols[x.values]), axis=1)

[14,
 41,
 42,
 45,
 51,
 58,
 69,
 77,
 92,
 95,
 98,
 111,
 114,
 116,
 120,
 122,
 134,
 136,
 137,
 145,
 152,
 162,
 196,
 204,
 215,
 219,
 224,
 236,
 244,
 255,
 269,
 271,
 276,
 280,
 281,
 292,
 310,
 320,
 329,
 340,
 350,
 353,
 367,
 375,
 377,
 385,
 398,
 415,
 444,
 449,
 456,
 458,
 463,
 468,
 479,
 497,
 502,
 509,
 530,
 540,
 545,
 546,
 564,
 566,
 569,
 572,
 581,
 587,
 589,
 595,
 597,
 598,
 601,
 604,
 609,
 610,
 612,
 618,
 626,
 639,
 640,
 648,
 662,
 673,
 674,
 685,
 697,
 698,
 700,
 704,
 705,
 718,
 724,
 725,
 735,
 749,
 788,
 796,
 798,
 799,
 800,
 813,
 822,
 840,
 875,
 903,
 908]

In [50]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, ignore_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.ignore_df = ignore_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=5):
        items_to_ignore = self.ignore_df[user_id]
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['id3'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)
        temp = np.array(recommendations_df.values.tolist())[:,0]
        return temp
    
cf_recommender_model = CFRecommender(cf_preds_df,bt)

In [51]:
result = pd.DataFrame(matrix.index)
result['pred'] = result['user_id'].apply(cf_recommender_model.recommend_items)
result

Unnamed: 0,user_id,pred
0,11,"[586.0, 741.0, 714.0, 669.0, 594.0]"
1,27,"[142.0, 528.0, 113.0, 610.0, 704.0]"
2,53,"[329.0, 714.0, 715.0, 495.0, 215.0]"
3,93,"[768.0, 137.0, 822.0, 875.0, 468.0]"
4,121,"[438.0, 103.0, 459.0, 322.0, 833.0]"
5,140,"[56.0, 726.0, 253.0, 833.0, 635.0]"
6,216,"[69.0, 129.0, 721.0, 618.0, 599.0]"
7,239,"[725.0, 908.0, 642.0, 872.0, 113.0]"
8,252,"[374.0, 352.0, 429.0, 647.0, 509.0]"
9,266,"[412.0, 551.0, 285.0, 180.0, 397.0]"


In [52]:
result[['id3_1','id3_2','id3_3','id3_4','id3_5']] = pd.DataFrame(result.pred.values.tolist(), index= result.index, dtype = int)
result = result.drop(['pred'], axis=1)
result

Unnamed: 0,user_id,id3_1,id3_2,id3_3,id3_4,id3_5
0,11,586,741,714,669,594
1,27,142,528,113,610,704
2,53,329,714,715,495,215
3,93,768,137,822,875,468
4,121,438,103,459,322,833
5,140,56,726,253,833,635
6,216,69,129,721,618,599
7,239,725,908,642,872,113
8,252,374,352,429,647,509
9,266,412,551,285,180,397


In [53]:
result.to_csv("answer3.csv", index = False)