In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
##from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv(r"C:\Users\Vivian\OneDrive\桌面\movies_metadata.csv\movies_metadata.csv", encoding = "ISO-8859-1")
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
# fillna('[]')
# apply(literal_eval) turn lsit or dict in string form into object
# [i['name' for i in x] if isinstance(x, list) else []]
#  加入['name'進 genres資訊的 dictionary
df['genres'] = df['genres'].fillna('[]')
df['genres'] = df['genres'].apply(literal_eval)
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
# 轉換成pd.date_time object, error以 coerce處理
# 以'-'分開來，只取年分
df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
df['year'] = pd.to_datetime(df['year'], errors='coerce')

In [5]:
# notnull()丟去vote_count中n/a的項目
# 取乾淨的資料轉換成整數
# ;對vote_avg做一樣的事情
# 取全局平均(C)
# m 門檻篩去vote_counts(0.95)的數量
vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
# 門檻篩去vote_counts(0.95)的數量
m = vote_counts.quantile(0.95)

In [6]:
# 條件：vote_counts >=m && vote_count/avg not_null
# 取title, year, vote_count, vote_avg, popularity, genres
# <The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().> try '&'
qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [7]:
# weighted_rating (wr)
# apply像是def函數，以row為單位傳入
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [8]:
qualified = qualified.sort_values('wr', ascending=False).head(1000)

In [9]:
# pd.Series將不同的種類genres建立不同的column，stack壓縮起來
# .reset_index(level=1, drop=True)更乾淨
# s替代genres項目
s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_df = df.drop('genres', axis=1).join(s)

# 將link_small 檔案中的tagline, overview, description 合併於sdf中
## 以下要以NLP做cosine_similarity

In [10]:
links_small = pd.read_csv(r"C:\Users\Vivian\OneDrive\桌面\movies_metadata.csv\links_small.csv", encoding = "ISO-8859-1")
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
links_small

0          862
1         8844
2        15602
3        31357
4        11862
         ...  
9120    402672
9121    315011
9122    391698
9123    137608
9124    410803
Name: tmdbId, Length: 9112, dtype: int32

In [11]:
# 這三rows錯誤，丟棄
df.loc[[19730, 29503, 35587]]
df = df.drop([19730, 29503, 35587])

In [12]:
# 將id整數化
# links_small中放的是tmbd的id，找出同時在兩邊的項目，留在sdf (link_small + df)
df['id'] = df['id'].astype('int')
sdf = df[df['id'].isin(links_small)]
sdf.shape

(9099, 25)

In [13]:
# overview 和 tagline 加起來做分析
sdf['tagline'] = sdf['tagline'].fillna('')
sdf['description'] = sdf['overview'] + sdf['tagline']
sdf['description'] = sdf['description'].fillna('')

In [14]:
# ngram代表幾個詞 ex.unigram, bigram, trigram, 4-grams
# stop-word 有 "the," "is," "and," "of," and "in."
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(sdf['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680459, 0.        , ..., 0.        , 0.00289435,
       0.        ])

In [15]:
# reset index
# smd.index為key, sdf['title']為value
# indices準備便於以後找movie_title
sdf = sdf.reset_index()
titles = sdf['title']
indices = pd.Series(sdf.index, index=sdf['title'])

In [16]:
# i[0]是為了取sim_scores中的電影名稱
# 取前5
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    title_to_info = list(titles.iloc[movie_indices].values)
    return sdf[sdf['title'].isin(title_to_info)][['title', 'genres', 'year', 'description']]
get_recommendations('The Godfather').head(10)

Unnamed: 0,title,genres,year,description
29,Shanghai Triad,"[Drama, Crime]",1995-01-01,A provincial boy related to a Shanghai crime f...
227,The Jerky Boys,"[Comedy, Crime]",1995-01-01,When two unemployed telephone pranksters decid...
618,Thinner,"[Horror, Thriller]",1996-01-01,"A fat Lawyer finds himself growing ""Thinner"" w..."
973,The Godfather: Part II,"[Drama, Crime]",1974-01-01,In the continuing saga of the Corleone crime f...
1582,The Godfather: Part III,"[Crime, Drama, Thriller]",1990-01-01,In the midst of trying to legitimize his busin...
2159,Summer of Sam,"[Thriller, Drama, Crime, Romance]",1999-01-01,"Spike Lee's take on the ""Son of Sam"" murders i..."
2192,The Color Purple,[Drama],1985-01-01,An epic tale spanning forty years in the life ...
2412,American Movie,[Documentary],1999-01-01,AMERICAN MOVIE is the story of filmmaker Mark ...
3288,Jaws: The Revenge,"[Adventure, Thriller]",1987-01-01,"After another deadly shark attack, Ellen Brody..."
3509,Made,"[Action, Comedy, Thriller]",2001-01-01,Two aspiring boxers lifelong friends get invol...


In [17]:
get_recommendations('The Dark Knight').head(10)

Unnamed: 0,title,genres,year,description
132,Batman Forever,"[Action, Crime, Fantasy]",1995-01-01,The Dark Knight of Gotham City confronts a das...
524,Batman,"[Fantasy, Action]",1989-01-01,The Dark Knight of Gotham City begins his war ...
1113,Batman Returns,"[Action, Fantasy]",1992-01-01,"Having defeated the Joker, Batman now faces th..."
1135,Night Falls on Manhattan,"[Drama, Crime]",1996-01-01,A newly elected District attorney finds himsel...
1240,Batman & Robin,"[Action, Crime, Fantasy]",1997-01-01,Along with crime-fighting partner Robin and ne...
1652,The Shaggy D.A.,"[Comedy, Family]",1976-01-01,"Wilby Daniels, a successful lawyer running for..."
2579,Batman: Mask of the Phantasm,"[Action, Adventure, Animation, Family]",1993-01-01,An old flame of Bruce Wayne's strolls into tow...
2696,JFK,"[Drama, Thriller, History]",1991-01-01,New Orleans District Attorney Jim Garrison dis...
2893,Flying Tigers,"[Action, Drama, History]",1942-01-01,Jim Gordon commands a unit of the famed Flying...
3537,Criminal Law,"[Drama, Thriller, Romance]",1988-01-01,A rising young attorney successfully defends a...


# To furthur imrove by casts and keywords
# combining 'credits.csv' and 'keywords.csv'
## this is another NLP

In [18]:
credits = pd.read_csv(r"C:\Users\Vivian\OneDrive\桌面\movies_metadata.csv\credits.csv")
keywords = pd.read_csv(r"C:\Users\Vivian\OneDrive\桌面\movies_metadata.csv\keywords.csv")

In [19]:
# 以id為標準，將同時存在三份{df, credits, keywords}資料的電影整合
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
df['id'] = df['id'].astype('int')
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')
sdf = df[df['id'].isin(links_small)]
sdf.shape

(9219, 28)

In [20]:
sdf['cast'] = sdf['cast'].apply(literal_eval)
sdf['crew'] = sdf['crew'].apply(literal_eval)
sdf['keywords'] = sdf['keywords'].apply(literal_eval)
sdf['cast_size'] = sdf['cast'].apply(lambda x: len(x))
sdf['crew_size'] = sdf['crew'].apply(lambda x: len(x))

In [21]:
# 將director從crew中拉出來，特別列一個columns
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
sdf['director'] = sdf['crew'].apply(get_director)

In [22]:
# 如果電影有名字，並且cast中有名字，取名字
# 並且如果cast人數大於三，取前三位
sdf['cast'] = sdf['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
sdf['cast'] = sdf['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
sdf['keywords'] = sdf['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
sdf['cast'] = sdf['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
sdf['director'] = sdf['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
sdf['director'] = sdf['director'].apply(lambda x: [x,x, x])
s = sdf.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [23]:
# 
# add ['keywords', 'cast', 'director', 'genres'] to improved recc
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words
sdf['keywords'] = sdf['keywords'].apply(filter_keywords)
sdf['keywords'] = sdf['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
sdf['keywords'] = sdf['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
sdf['soup'] = sdf['keywords'] + sdf['cast'] + sdf['director'] + sdf['genres']
sdf['soup'] = sdf['soup'].apply(lambda x: ' '.join(x))

In [24]:
sdf['soup'] = sdf['keywords'] + sdf['cast'] + sdf['director'] + sdf['genres']
sdf['soup'] = sdf['soup'].apply(lambda x: ' '.join(x))

In [25]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(sdf['soup'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
sdf = sdf.reset_index()
titles = sdf['title']
indices = pd.Series(sdf.index, index=sdf['title'])

In [26]:
def improved_recommendations(title):
    # with new cosine recccc(with keywords, credit.csv and genres) to analyze
    # choose top 26 in it
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    # this is for qualified
    # sort by vote_count choose the top 0.6 movies
    movies = sdf.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    
    # recommended qualified, cosine_similarity by keywords , credits , genres
    # this is qualified parts
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [27]:
improved_recommendations('Toy Story')


Unnamed: 0,title,vote_count,vote_average,year,wr
3833,"Monsters, Inc.",6150,7,2001-01-01,6.884308
7629,Toy Story 3,4710,7,2010-01-01,6.851922
2522,Toy Story 2,3914,7,1999-01-01,6.824813
8595,The Lego Movie,3127,7,2014-01-01,6.786095
6496,Cars,3991,6,2006-01-01,5.92594
1883,A Bug's Life,2379,6,1998-01-01,5.8835
7404,Cloudy with a Chance of Meatballs,1799,6,2009-01-01,5.85324
1832,Antz,1320,6,1998-01-01,5.813161
3016,Chicken Run,1190,6,2000-01-01,5.798205
7914,Cars 2,2088,5,2011-01-01,5.042143


In [28]:
improved_recommendations('Mean Girls')


Unnamed: 0,title,vote_count,vote_average,year,wr
1547,The Breakfast Club,2189,7,1985-01-01,6.709602
390,Dazed and Confused,588,7,1993-01-01,6.254682
8883,The DUFF,1372,6,2015-01-01,5.818541
3712,The Princess Diaries,1063,6,2001-01-01,5.781086
4763,Freaky Friday,919,6,2003-01-01,5.757786
6277,Just Like Heaven,595,6,2005-01-01,5.681521
6959,The Spiderwick Chronicles,593,6,2008-01-01,5.680901
7494,American Pie Presents: The Book of Love,454,5,2009-01-01,5.11969
7332,Ghosts of Girlfriends Past,716,5,2009-01-01,5.092422
7905,Mr. Popper's Penguins,775,5,2011-01-01,5.087912
