In [1]:
from sklearn.metrics import ndcg_score, dcg_score
import numpy as np

# Toy single-query example: `true` is passed as y_true and `relevance`
# as y_score, each shaped (1, 5).
true = np.array([3, 2, 1, 0, 0]).reshape(1, -1)
relevance = np.array([3, 2, 0, 0, 1]).reshape(1, -1)

# NDCG of the ordering induced by `relevance`, judged against `true`.
print(ndcg_score(y_true=true, y_score=relevance))

0.980840401274087


In [2]:
# Second toy example: the score values need not be on the same scale as
# the values in `true`.
true = np.array([2, 4, 1, 1, 1]).reshape(1, -1)
relevance = np.array([2, 5, 2, 3, 1]).reshape(1, -1)

print(ndcg_score(y_true=true, y_score=relevance))

0.9748317848747132


In [3]:
import pandas as pd

In [4]:
# Read the ratings table and the movie catalogue from the local data folder.
ratings, movies = (
    pd.read_csv(csv_path)
    for csv_path in ('data/dst-3.0_mathml_14_5_rating.csv', 'data/movie.csv')
)

In [5]:
# Left join: every rating row is kept, with the movie columns attached
# wherever movieId matches.
df = ratings.merge(movies, on='movieId', how='left')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
# Number of rating rows per title, largest count first.
(
    df['title']
    .value_counts()
    .sort_values(ascending=False)
)

Pulp Fiction (1994)                                                              67310
Forrest Gump (1994)                                                              66172
Shawshank Redemption, The (1994)                                                 63366
Silence of the Lambs, The (1991)                                                 63299
Jurassic Park (1993)                                                             59715
                                                                                 ...  
Play Motel (1979)                                                                    1
Whity (1971)                                                                         1
I Am Taraneh, I Am Fifteen Years Old (Man, taraneh, panzdah sal daram) (2002)        1
Great Directors (2009)                                                               1
Série noire (1979)                                                                   1
Name: title, Length: 26729, dtype: int64

In [7]:
# Mean rating per title (index: title, single column: rating).
average_rating_df = df.groupby('title')[['rating']].mean()

# Highest mean first; equal means are ordered by title, also descending.
sorted_average_ratings = average_rating_df.sort_values(
    by=['rating', 'title'],
    ascending=False,
)

print(sorted_average_ratings.head(10))

                                                   rating
title                                                    
Yonkers Joe (2008)                                    5.0
Year Zero: The Silent Death of Cambodia (1979)        5.0
Who Killed Vincent Chin? (1987)                       5.0
When I Walk (2013)                                    5.0
Welcome to Australia (1999)                           5.0
Victor and the Secret of Crocodile Mansion (2012)     5.0
Turkish Dance, Ella Lola (1898)                       5.0
This Thing With Sarah (2013)                          5.0
The great match (2007)                                5.0
The Wrecking Crew (2008)                              5.0


In [8]:
# Rating count per title; a title is kept when it has more than 50 ratings.
movie_popularity = df['title'].value_counts()
popular_movies = movie_popularity.loc[movie_popularity.gt(50)].index

print(len(popular_movies))

10472


In [9]:
# Rating count per title, recomputed here so the cell does not depend on
# the previous one having been run.
movie_popularity = df['title'].value_counts()
popular_movies = movie_popularity.loc[movie_popularity.gt(50)].index

# Average only over titles with more than 50 ratings, so titles with a
# handful of ratings cannot take the top of the ranking.
df2 = df.loc[df['title'].isin(popular_movies)]
average_rating_df2 = df2.groupby('title')[['rating']].mean()

sorted_average_ratings = average_rating_df2.sort_values(
    by=['rating', 'title'],
    ascending=False,
)

print(sorted_average_ratings.head(10))

                                                 rating
title                                                  
Shawshank Redemption, The (1994)               4.446990
Godfather, The (1972)                          4.364732
Usual Suspects, The (1995)                     4.334372
Schindler's List (1993)                        4.310175
Godfather: Part II, The (1974)                 4.275641
Seven Samurai (Shichinin no samurai) (1954)    4.274180
Rear Window (1954)                             4.271334
Band of Brothers (2001)                        4.263182
Casablanca (1942)                              4.258327
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)  4.256935


In [10]:

# Load the article catalogue and keep only rows whose eventType is
# 'CONTENT SHARED'.
articles_df = (
    pd.read_csv('data/shared_articles.csv')
    .loc[lambda frame: frame['eventType'] == 'CONTENT SHARED']
)
articles_df.shape

(3047, 13)

In [11]:
# Raw user/article interaction log.
interactions_df = pd.read_csv(filepath_or_buffer='data/users_interactions.csv')

In [12]:
# Reload the interaction log so this cell always starts from the raw file.
interactions_df = pd.read_csv('data/users_interactions.csv')

# Treat identifiers as strings in both frames.
interactions_df['personId'] = interactions_df['personId'].astype(str)
interactions_df['contentId'] = interactions_df['contentId'].astype(str)
articles_df['contentId'] = articles_df['contentId'].astype(str)

# Weight given to each kind of interaction event.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}

# Direct dictionary lookup: an event type missing from the table raises
# KeyError rather than silently becoming NaN.
interactions_df['eventStrength'] = [
    event_type_strength[event_kind]
    for event_kind in interactions_df['eventType']
]
interactions_df['eventStrength'].mean()

1.2362885828078327

In [13]:
# Collapse the log to one row per (personId, contentId) pair, then count
# how many distinct articles each user touched.
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId']).first()
    .reset_index()
    .groupby('personId')
    .size()
)

# One-column frame (personId) of users with at least 5 distinct articles.
users_with_enough_interactions_df = (
    users_interactions_count_df
    .loc[users_interactions_count_df.ge(5)]
    .reset_index()
    .loc[:, ['personId']]
)

print(len(users_with_enough_interactions_df))

1140


In [14]:
# Keep only the interactions of users that passed the activity filter.
# Series.isin against the explicit personId column replaces
# np.in1d(Series, DataFrame): np.in1d is deprecated since NumPy 2.0, and the
# old call only worked because the one-column frame was implicitly
# flattened into an array of ids.
interactions_from_selected_users_df = interactions_df.loc[
    interactions_df['personId'].isin(users_with_enough_interactions_df['personId'])
]
print(interactions_from_selected_users_df.shape)

(69868, 9)


In [15]:
import math

In [16]:
def smooth_user_preference(x):
    """Return the base-2 logarithm of ``1 + x``.

    Used to dampen large summed interaction strengths: 0 maps to 0, 1 maps
    to 1, and growth is logarithmic afterwards.

    ``math.log2`` is used instead of ``math.log(1 + x, 2)`` because the
    two-argument form divides two natural logarithms and can be off by one
    ulp (for example on exact powers of two).

    Raises:
        ValueError: if ``1 + x`` is not strictly positive.
    """
    return math.log2(1 + x)
    
# Sum eventStrength per (personId, contentId) pair, smooth it with
# smooth_user_preference, and keep the pair as the frame's index.
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
    .map(smooth_user_preference)
    .reset_index()
    .set_index(['personId', 'contentId'])
)

# Latest timestamp per pair; the assignment aligns on the
# (personId, contentId) index built above.
interactions_full_df['last_timestamp'] = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['timestamp']
    .max()
)

# Back to flat columns for the steps that follow.
interactions_full_df = interactions_full_df.reset_index()
interactions_full_df['last_timestamp'].mean()

1470605340.0403006

In [17]:
from sklearn.model_selection import train_test_split

# Time-based split: pairs whose last_timestamp is strictly below split_ts
# go to train, pairs at or above it go to test.
split_ts = 1475519545

interactions_train_df = interactions_full_df.loc[
    lambda frame: frame['last_timestamp'] < split_ts
].copy()
interactions_test_df = interactions_full_df.loc[
    lambda frame: frame['last_timestamp'] >= split_ts
].copy()

print(interactions_train_df.shape[0])

29325


In [18]:
# Per-user list of article ids from the training part, indexed by personId.
final_df = (
    interactions_train_df.reset_index()
    .groupby('personId')['contentId'].agg(lambda x: list(x))
    .reset_index()
    .rename(columns={'contentId': 'true_train'})
    .set_index('personId')
)

# Per-user list of article ids from the test part; the assignment aligns on
# personId, so users absent from the test part receive a missing value.
final_df['true_test'] = (
    interactions_test_df.reset_index()
    .groupby('personId')['contentId'].agg(lambda x: list(x))
)

# Replace the missing values with empty lists. The check is "is this a
# list?" rather than `x is np.NaN`: the np.NaN alias was removed in
# NumPy 2.0, and identity comparison against NaN only works when the value
# happens to be the exact np.nan singleton.
final_df['true_test'] = [
    x if isinstance(x, list) else [] for x in final_df['true_test']
]
final_df.head()

Unnamed: 0_level_0,true_train,true_test
personId,Unnamed: 1_level_1,Unnamed: 2_level_1
-1007001694607905623,"[-5065077552540450930, -793729620925729327]","[-6623581327558800021, 1469580151036142903, 72..."
-1032019229384696495,"[-1006791494035379303, -1039912738963181810, -...","[-1415040208471067980, -2555801390963402198, -..."
-108842214936804958,"[-1196068832249300490, -133139342397538859, -1...","[-2780168264183400543, -3060116862184714437, -..."
-1130272294246983140,"[-1150591229250318592, -1196068832249300490, -...","[-1606980109000976010, -1663441888197894674, -..."
-1160159014793528221,"[-133139342397538859, -387651900461462767, 377...",[-3462051751080362224]


In [19]:
# Article ids ordered by their total eventStrength in the training part,
# strongest first, as a NumPy array.
popular = (
    interactions_train_df
    .groupby('contentId')['eventStrength']
    .sum()
    .reset_index()
    .sort_values(by='eventStrength', ascending=False)
    .loc[:, 'contentId']
    .values
)
popular[0]

'-6783772548752091658'

In [20]:
top_k = 10

# For every user, recommend the first top_k entries of `popular` that are
# not already in that user's training list. np.isin(..., invert=True)
# replaces `~np.in1d(...)`, which is deprecated since NumPy 2.0.
final_df['popular'] = final_df['true_train'].apply(
    lambda seen: popular[np.isin(popular, seen, invert=True)][:top_k]
)
def calc_precision(column, df=None, k=10):
    """Mean per-user hit ratio of the recommendations stored in ``column``.

    For each row the score is::

        |set(true_test) & set(row[column])| / min(len(true_test) + 0.001, k)

    and the function returns the mean of that score over all rows.

    The ``+ 0.001`` keeps the denominator non-zero, so a user with an empty
    ``true_test`` scores 0 and still counts in the mean. For users with
    fewer than ``k`` test items it also makes the denominator slightly
    larger than the item count.

    Args:
        column: name of the column holding each user's recommended ids.
        df: frame with a ``true_test`` column and ``column``. Defaults to
            the notebook-level ``final_df``, so ``calc_precision('popular')``
            behaves as before.
        k: cap on the denominator (the recommendation list length). The
            default of 10 matches the previously hard-coded value; pass
            ``k=top_k`` if the list length is changed.

    Returns:
        The mean score over the rows of the frame.
    """
    frame = final_df if df is None else df
    cap = float(k)

    def _row_score(row):
        # Number of recommended ids that appear in the user's test items.
        hits = len(set(row['true_test']).intersection(row[column]))
        return hits / min(len(row['true_test']) + 0.001, cap)

    return frame.apply(_row_score, axis=1).mean()
# Score the popularity baseline stored in the 'popular' column.
calc_precision(column='popular')

0.006454207722621084