In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast 
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
links_small = pd.read_csv('links_small.csv')
md = pd.read_csv('movies_metadata.csv')
ratings = pd.read_csv('ratings_small.csv')


In [3]:
#credits.head()
#credits.iloc[0:3]
#credits['cast'].iloc[0:3]
#credits.iloc[:,0:2]

In [4]:
#credits.columns

In [5]:
#credits.shape

In [6]:
#credits.info()

In [7]:
#keywords.head()

In [8]:
#keywords.columns

In [9]:
#keywords.shape

In [10]:
#keywords.info()

In [11]:
#links_small.head()

In [12]:
#links_small.columns

In [13]:
#links_small.shape

In [14]:
#links_small.info()

In [15]:
#md.iloc[0:3].transpose()

In [16]:
#md.columns

In [17]:
#md.shape

In [18]:
#md.info()

In [19]:
#ratings.head()

In [20]:
#ratings.columns

In [21]:
#ratings.shape

In [22]:
#ratings.info()

In [23]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i[
    'name'] for i in x] if isinstance(x, list) else [])

In [24]:
# this is V
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')

# this is R
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')

# this is C
C = vote_averages.mean()
#C

In [25]:
m = vote_counts.quantile(0.95)
#m

In [26]:
# Pre-processing step for getting year from date by splliting it using '-'

md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [27]:
qualified = md[(md['vote_count'] >= m) & 
               (md['vote_count'].notnull()) & 
               (md['vote_average'].notnull())][['title', 
                                                'year', 
                                                'vote_count', 
                                                'vote_average', 
                                                'popularity', 
                                                'genres']]

qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
#qualified.shape

In [28]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [29]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [30]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [31]:
#qualified.head(15)

In [32]:
'''
>>> s
     a   b
one  1.  2.
two  3.  4.

>>> s.stack()
one a    1
    b    2
two a    3
    b    4
'''
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)
gen_md.head(3).transpose()

Unnamed: 0,0,0.1,0.2
adult,False,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ...","{'id': 10194, 'name': 'Toy Story Collection', ..."
budget,30000000,30000000,30000000
homepage,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story,http://toystory.disney.com/toy-story
id,862,862,862
imdb_id,tt0114709,tt0114709,tt0114709
original_language,en,en,en
original_title,Toy Story,Toy Story,Toy Story
overview,"Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ...","Led by Woody, Andy's toys live happily in his ..."
popularity,21.9469,21.9469,21.9469


In [33]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & 
                   (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: 
                        (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C),
                        axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [34]:
#build_chart('Romance').head(15)

In [35]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [36]:
## Pre-processing step

def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [37]:
md['id'] = md['id'].apply(convert_int)
md[md['id'].isnull()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[Carousel Productions, Vision View Entertainme...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...","[{'iso_3166_1': 'US', 'name': 'United States o...",,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,,,,,,,,,,NaT
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[Odyssey Media, Pulser Productions, Rogue Stat...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT


In [38]:
md = md.drop([19730, 29503, 35587])

In [39]:
md['id'] = md['id'].astype('int')

In [40]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [41]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [42]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [43]:
tfidf_matrix.shape

(9099, 268124)

In [44]:
# http://scikit-learn.org/stable/modules/metrics.html#linear-kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [45]:
cosine_sim[0]
#cosine_sim.shape

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [46]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
#indices.head(2)

In [47]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [48]:
get_recommendations('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [49]:
#get_recommendations('The Dark Knight').head(10)

In [50]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [51]:
md.shape

(45463, 25)

In [52]:

md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [53]:
smd = md[md['id'].isin(links_small)]
smd.shape

# smd = md[md['id'].isin(links_small['tmdbId'])]
# smd.shape

(9219, 28)

In [54]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [55]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [56]:

smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [57]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [58]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [59]:
s = s[s > 1]

In [60]:
# Just an example
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [61]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [62]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [63]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [64]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [65]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [66]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [67]:
get_recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

In [68]:
#get_recommendations('Inception').head(10)

In [69]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & 
                       (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [70]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.917588
8613,Interstellar,11187,8,2014,7.897107
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
1134,Batman Returns,1706,6,1992,5.846862
132,Batman Forever,1529,5,1995,5.054144
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.013943
1260,Batman & Robin,1447,4,1997,4.287233


In [71]:
# surprise reader API to read the dataset
reader = Reader()

In [72]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [73]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8974
MAE:  0.6936
------------
Fold 2
RMSE: 0.8940
MAE:  0.6900
------------
Fold 3
RMSE: 0.8979
MAE:  0.6876
------------
Fold 4
RMSE: 0.8920
MAE:  0.6868
------------
Fold 5
RMSE: 0.8999
MAE:  0.6918
------------
------------
Mean RMSE: 0.8962
Mean MAE : 0.6900
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8974090685817337,
                             0.8940176458464397,
                             0.8978551347383917,
                             0.8920463578307506,
                             0.8998521102931768],
                            'mae': [0.6935640073140322,
                             0.6899531851448408,
                             0.6876303682297572,
                             0.6868174845695717,
                             0.6918273200785025]})

In [74]:
trainset = data.build_full_trainset()
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x41a7b95160>

In [75]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [76]:
svd.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.571752854409204, details={'was_impossible': False})

In [77]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [78]:
id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [79]:
indices_map = id_map.set_index('id')

In [80]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'release_date', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(20)

In [81]:
hybrid(1, 'Avatar').head(10)

Unnamed: 0,title,vote_count,vote_average,release_date,id,est
522,Terminator 2: Judgment Day,4274.0,7.7,1991-07-01,280,3.359777
974,Aliens,3282.0,7.7,1986-07-18,679,3.153075
1011,The Terminator,4208.0,7.4,1984-10-26,218,3.039971
8401,Star Trek Into Darkness,4479.0,7.4,2013-05-05,54138,2.975997
2014,Fantastic Planet,140.0,7.6,1973-05-01,16306,2.913018
922,The Abyss,822.0,7.1,1989-08-09,2756,2.835316
4017,Hawk the Slayer,13.0,4.5,1980-08-27,25628,2.763096
1621,Darby O'Gill and the Little People,35.0,6.7,1959-06-29,18887,2.75876
8658,X-Men: Days of Future Past,6155.0,7.5,2014-05-15,127585,2.720457
1668,Return from Witch Mountain,38.0,5.6,1978-03-10,14822,2.706468


In [82]:
#hybrid(1, 'The Dark Knight')

In [83]:
#hybrid(1, 'Memento').head(10)

In [84]:
#hybrid(223, 'The Ladies Man').head(10)

In [85]:
#hybrid(1, 'The Ladies Man').head(10)

In [86]:
#nnaa = hybrid(1, 'The Ladies Man')
#nnaa.index

In [87]:
import tweepy
from textblob import TextBlob
consumer_key = '2AH9B3OwhvvQzxmgOWN24KL1l'
consumer_secret = 'Kf58P0jnLbT2oPcQTFgHgBf5587X5E7XlSDxZxaojWuqxxqfOY'
access_token = '2591316295-1P63E5F3QPi0kHimW76WSdlfyEu9aUsgVmHIFk1'
access_token_secret = 'R8nJTcneZCmEnwz0XAyCV59OAvEO8qFY0TkzrZx3elhbx'
auth = tweepy.OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_token,access_token_secret)
api = tweepy.API(auth)
#public_tweets = api.search('#The Ladies Man')
#print(analysis.sentiment)

def frecom(userId, title):
    nnaa = hybrid(userId, title)
    for xx in nnaa.title:
        print("for "+str(xx)+"  ",end=" ")
        sm ="movie " + str(xx)
        public_tweets = api.search(sm)
        #print(public_tweets)
        #print(analysis.sentiment.polarity)
        x=0.0
        t=0
        for tweet in public_tweets:
            t+=1
            #print(tweet.text)
            analysis = TextBlob(tweet.text)
            x+=analysis.sentiment.polarity
            #print(analysis.sentiment)
        if(t>0):
            x=x/t
        print(x)
        rw = nnaa[nnaa['title']==xx].index.values.astype(int)[0]
        nnaa.set_value(rw, 'polarity', x)
    nnaa=nnaa.sort_values('polarity', ascending=False)
    return nnaa.head(10)    

In [88]:
#nnaa=nnaa.sort_values('polarity', ascending=False)
#nnaa.head(10)
frecom(1, 'The Ladies Man')

for Female Trouble   0.017291537827252113
for Mean Girls   -0.008008838383838379
for The Slumber Party Massacre   0.0
for Love & Other Drugs   0.27703373015873023
for Private Parts   0.15777777777777777
for Hot Tub Time Machine   0.24388888888888888
for Bad Girls   -0.07914682539682537
for Gun Shy   0.0
for American Pie Presents: Band Camp   0.125
for Calendar Girls   -0.03333333333333333
for Birthday Girl   0.36666666666666664
for House Party   0.10729166666666666
for National Lampoon’s Van Wilder   0.5
for Casanova   0.0446969696969697
for H.O.T.S.   0.19265873015873017
for Citizen's Band   0.0
for Hot Dog... The Movie   0.16964285714285715
for The Suburbans   0.0
for Sirens   0.029999999999999995
for Bordello of Blood   0.425


Unnamed: 0,title,vote_count,vote_average,release_date,id,est,polarity
4078,National Lampoon’s Van Wilder,867.0,5.9,2002-03-29,11452,2.58717,0.5
689,Bordello of Blood,63.0,5.0,1996-08-16,9431,2.413013,0.425
3961,Birthday Girl,104.0,6.1,2001-09-06,2084,2.660558,0.366667
7756,Love & Other Drugs,1268.0,6.6,2010-11-22,43347,2.773892,0.277034
7576,Hot Tub Time Machine,911.0,5.9,2010-03-26,23048,2.729186,0.243889
3057,H.O.T.S.,9.0,5.1,1979-05-01,59181,2.501545,0.192659
4780,Hot Dog... The Movie,24.0,4.4,1984-01-13,21989,2.458931,0.169643
1210,Private Parts,106.0,6.7,1997-03-07,9403,2.753126,0.157778
6334,American Pie Presents: Band Camp,553.0,5.3,2005-10-30,8274,2.67656,0.125
3034,House Party,52.0,6.1,1990-03-09,16094,2.609554,0.107292


In [89]:
#frecom(223, 'The Ladies Man')

In [90]:
#frecom(23,'Batman Begins')