# Recommendation Systems

In [1]:
import pandas as pd
import scipy
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
movies = pd.read_csv('data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings = pd.read_csv('data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
ratings.drop('timestamp', axis = 1, inplace = True)
movies.drop('genres', axis = 1, inplace = True)

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [7]:
df = pd.merge(ratings, movies, on = 'movieId')

In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [9]:
#item based
item_df = pd.pivot_table(df, index='title', columns = 'userId', values = 'rating')
user_df = pd.pivot_table(df, index = 'userId', columns = 'title', values = 'rating')

In [11]:
item_df.fillna(0).head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
item_df.shape

(9719, 610)

In [13]:
sparse_items = scipy.sparse.csr_matrix(item_df.fillna(0))
sparse_users = scipy.sparse.csr_matrix(user_df.fillna(0))

In [14]:
print(sparse_items)

  (0, 609)	4.0
  (1, 331)	4.0
  (2, 331)	3.5
  (2, 376)	3.5
  (3, 344)	5.0
  (4, 112)	3.0
  (4, 344)	5.0
  (5, 20)	1.5
  (6, 11)	5.0
  (6, 18)	2.0
  (6, 90)	2.0
  (6, 94)	3.0
  (6, 171)	4.0
  (6, 216)	4.0
  (6, 287)	3.0
  (6, 293)	1.0
  (6, 306)	3.5
  (6, 376)	3.5
  (6, 413)	3.0
  (6, 473)	1.0
  (6, 476)	3.5
  (6, 519)	4.0
  (6, 554)	5.0
  (6, 560)	4.5
  (6, 598)	2.0
  :	:
  (9717, 26)	5.0
  (9717, 41)	5.0
  (9717, 56)	2.0
  (9717, 67)	4.0
  (9717, 87)	3.5
  (9717, 140)	3.5
  (9717, 197)	2.0
  (9717, 214)	2.5
  (9717, 216)	2.0
  (9717, 220)	3.5
  (9717, 238)	3.0
  (9717, 281)	4.0
  (9717, 293)	4.0
  (9717, 306)	2.5
  (9717, 312)	1.0
  (9717, 413)	3.0
  (9717, 420)	3.0
  (9717, 447)	3.0
  (9717, 473)	3.0
  (9717, 476)	3.5
  (9717, 554)	3.0
  (9717, 560)	4.0
  (9717, 596)	3.0
  (9717, 598)	2.5
  (9718, 526)	1.0


In [29]:
recommender = pairwise_distances(sparse_items, metric = 'cosine')
u_recommender = pairwise_distances(sparse_users)

In [30]:
recommender

array([[0.        , 1.        , 1.        , ..., 0.67267316, 1.        ,
        1.        ],
       [1.        , 0.        , 0.29289322, ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.29289322, 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.67267316, 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        0.        ]])

In [31]:
rdf = pd.DataFrame(recommender, columns = item_df.index, index = item_df.index)
udf = pd.DataFrame(u_recommender, columns = user_df.index, index = user_df.index)

In [32]:
#udf.head()

In [33]:
rdf.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.858347,1.0,...,1.0,0.657945,0.456695,0.292893,1.0,1.0,0.860569,0.672673,1.0,1.0
'Hellboy': The Seeds of Creation (2004),1.0,0.0,0.292893,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Round Midnight (1986),1.0,0.292893,0.0,1.0,1.0,1.0,0.823223,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Salem's Lot (2004),1.0,1.0,1.0,0.0,0.142507,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
'Til There Was You (1997),1.0,1.0,1.0,0.142507,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
search = 'Die Hard'

In [35]:
movies.loc[movies['title'].str.contains(search), 'title']

138     Die Hard: With a Vengeance (1995)
793                       Die Hard (1988)
1053                    Die Hard 2 (1990)
6518         Live Free or Die Hard (2007)
8103       Good Day to Die Hard, A (2013)
Name: title, dtype: object

In [36]:
item_df.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,


In [37]:
item_df.loc['Live Free or Die Hard (2007)', :].mean()

3.40625

In [38]:
item_df.T['Live Free or Die Hard (2007)'].count()

32

In [39]:
def recommender(title, n):
    return pd.DataFrame(rdf[title].sort_values()[1:n])

In [40]:
recommender('Live Free or Die Hard (2007)', 5)

Unnamed: 0_level_0,Live Free or Die Hard (2007)
title,Unnamed: 1_level_1
Transformers (2007),0.464709
King Kong (2005),0.472095
Phone Booth (2002),0.489301
Indiana Jones and the Kingdom of the Crystal Skull (2008),0.492294


In [41]:
import numpy as np

In [48]:
recommender(rdf.columns[np.random.randint(0, rdf.shape[1])], 7)

Unnamed: 0_level_0,Bill & Ted's Excellent Adventure (1989)
title,Unnamed: 1_level_1
Bill & Ted's Bogus Journey (1991),0.391369
Wayne's World (1992),0.505636
"Goonies, The (1985)",0.513995
Unbreakable (2000),0.53478
Willow (1988),0.534802
Ferris Bueller's Day Off (1986),0.542534


In [51]:
rdf.loc[:, 'Die Hard: With a Vengeance (1995)'].nsmallest(10)

title
Die Hard: With a Vengeance (1995)    0.000000
True Lies (1994)                     0.341742
Speed (1994)                         0.365913
Cliffhanger (1993)                   0.377932
Ace Ventura: Pet Detective (1994)    0.381457
GoldenEye (1995)                     0.384206
Clear and Present Danger (1994)      0.399911
Fugitive, The (1993)                 0.408297
Batman (1989)                        0.413284
Outbreak (1995)                      0.423053
Name: Die Hard: With a Vengeance (1995), dtype: float64

In [49]:
def recommender_getter(title, n = 5):
        avg = item_df.loc[title, :].mean()
        num_ratings = item_df.T[title].count()
        recs = rdf[title].nsmallest(n)[1:n]
        print(f'{title} had {num_ratings} ratings \nwith an average rating of {avg: .4f}')
        return recs

In [50]:
recommender_getter('Grumpier Old Men (1995)', 10)

Grumpier Old Men (1995) had 52 ratings 
with an average rating of  3.2596


title
Grumpy Old Men (1993)                  0.552013
Striptease (1996)                      0.553216
Nutty Professor, The (1996)            0.554918
Twister (1996)                         0.563753
Father of the Bride Part II (1995)     0.582198
Broken Arrow (1996)                    0.589867
Bio-Dome (1996)                        0.592596
Truth About Cats & Dogs, The (1996)    0.596917
Sabrina (1995)                         0.597169
Name: Grumpier Old Men (1995), dtype: float64

In [52]:
from surprise import SVD, KNNBasic

In [53]:
from surprise import Dataset

In [55]:
data = Dataset.load_builtin('ml-100k')

In [72]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fb72a34ea90>

In [56]:
algo = SVD()

In [57]:
trainset = data.build_full_trainset()

In [58]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb72a34e4d0>

In [63]:
algo.predict(111, 201, r_ui = 4)

Prediction(uid=111, iid=201, r_ui=4, est=3.52986, details={'was_impossible': False})

In [60]:
knn = KNNBasic()

In [61]:
knn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fb749340810>

In [71]:
knn.predict(0,201)

Prediction(uid=0, iid=201, r_ui=None, est=3.52986, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

In [68]:
trainset.all_users()

range(0, 943)

##### Topic Models

In [108]:
from sklearn.decomposition import LatentDirichletAllocation, NMF

In [74]:
import pandas as pd

In [75]:
tweets = pd.read_csv('data/tweets.csv', index_col = 0)

In [77]:
tweets.head()

Unnamed: 0,retweets,tweets,user
0,15122,"As of today, there are 14 weeks until the midt...",HillaryClinton
1,2152,Congrats to @IndivisibleTeam volunteers around...,HillaryClinton
2,401,RT @latinovictoryus: 🙌🏼 This is a humanitarian...,HillaryClinton
3,1538,"Give them all a follow, a RT, or an email sign...",HillaryClinton
4,1619,After Justice Kennedy announced his retirement...,HillaryClinton


In [78]:
tweets.groupby('user').size()

user
HillaryClinton    3237
_yiannopoulos     3231
iamcardib         3087
thrashermag       3209
dtype: int64

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [89]:
cvect = CountVectorizer(stop_words = 'english', max_features = 4000, ngram_range = (1, 3))

In [90]:
Xvect = cvect.fit_transform(tweets['tweets'])

In [91]:
lda = LatentDirichletAllocation(n_components = 3)

In [92]:
lda.fit(Xvect)

LatentDirichletAllocation(n_components=3)

In [93]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [94]:
print_top_words(lda, cvect.get_feature_names(), 15)

Topic #0: https http today great video new thank day time amp episode big years year just
Topic #1: rt cardi https iamcardib like donald donald trump trump um yea amp um yea bartier bartier cardi offset
Topic #2: https hillary trump president people just make amp rt country vote america don news potus



In [95]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [96]:
pyLDAvis.sklearn.prepare(lda, Xvect, cvect)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [109]:
import nltk

In [98]:
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

In [99]:
emma

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [100]:
type(emma)

nltk.corpus.reader.util.StreamBackedCorpusView

In [103]:
np.array(emma).shape

(192427,)

In [105]:
emma = cvect.fit_transform(np.array(emma))

In [106]:
lda.fit_transform(emma)

array([[0.33333333, 0.33333333, 0.33333333],
       [0.16666786, 0.66666396, 0.16666818],
       [0.33333333, 0.33333333, 0.33333333],
       ...,
       [0.16689657, 0.16691434, 0.66618909],
       [0.33333333, 0.33333333, 0.33333333],
       [0.33333333, 0.33333333, 0.33333333]])

In [107]:
pyLDAvis.sklearn.prepare(lda, emma, cvect)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
