In [1]:
import pandas as pd
import numpy as np
import catboost

train = pd.read_csv("train.csv")
train['interest'] = 1

train=train.drop_duplicates(['user_id','id3'])
#users = train[train['date']>=50]['user_id'].value_counts()
users = train[train['date']>=47]['user_id'].value_counts()
users = users.index.tolist()[0:53979]

train = train[train['user_id'].isin(users)]
matrix = train.pivot(index='user_id', columns='id3', values='interest').fillna(0)

users_ids = list(matrix.index)
users_items_pivot_matrix = matrix.as_matrix()

In [2]:
len(users)

53979

In [3]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, ignore_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.ignore_df = ignore_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=5):
        items_to_ignore = self.ignore_df[user_id]
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['id3'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)
        temp = np.array(recommendations_df.values.tolist())[:,0]
        return temp

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

In [5]:
NUMBER_OF_FACTORS_MF = 4

U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
sigma = np.diag(sigma)

users_ids = list(matrix.index)
users_items_pivot_matrix = matrix.as_matrix()

all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = matrix.columns, index=users_ids).transpose()

cols = matrix.columns
bt = matrix.apply(lambda x: x > 0)
bt = bt.apply(lambda x: list(cols[x.values]), axis=1)

cf_recommender_model = CFRecommender(cf_preds_df,bt)

result = pd.DataFrame(matrix.index)
result['pred'] = result['user_id'].apply(cf_recommender_model.recommend_items)

In [6]:
result

Unnamed: 0,user_id,pred
0,27,"[69.0, 610.0, 142.0, 113.0, 247.0]"
1,30,"[822.0, 41.0, 586.0, 456.0, 377.0]"
2,45,"[581.0, 51.0, 800.0, 456.0, 377.0]"
3,51,"[581.0, 51.0, 310.0, 377.0, 329.0]"
4,66,"[134.0, 204.0, 800.0, 136.0, 51.0]"
5,67,"[581.0, 51.0, 41.0, 329.0, 800.0]"
6,130,"[134.0, 113.0, 204.0, 13.0, 506.0]"
7,142,"[872.0, 69.0, 727.0, 134.0, 619.0]"
8,143,"[415.0, 586.0, 142.0, 619.0, 872.0]"
9,148,"[581.0, 875.0, 69.0, 51.0, 142.0]"


In [7]:
result[['id3_1','id3_2','id3_3','id3_4','id3_5']] = pd.DataFrame(result.pred.values.tolist(), index= result.index)
result = result.drop(['pred'], axis=1)
result

Unnamed: 0,user_id,id3_1,id3_2,id3_3,id3_4,id3_5
0,27,69.0,610.0,142.0,113.0,247.0
1,30,822.0,41.0,586.0,456.0,377.0
2,45,581.0,51.0,800.0,456.0,377.0
3,51,581.0,51.0,310.0,377.0,329.0
4,66,134.0,204.0,800.0,136.0,51.0
5,67,581.0,51.0,41.0,329.0,800.0
6,130,134.0,113.0,204.0,13.0,506.0
7,142,872.0,69.0,727.0,134.0,619.0
8,143,415.0,586.0,142.0,619.0,872.0
9,148,581.0,875.0,69.0,51.0,142.0


In [8]:
result.to_csv("answer_4.csv", index = False)