In [36]:
import re
import math
import json
import numpy as np
import pandas as pd
import pickle
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import lyricsgenius
genius_TOKEN = 'sD0C3epnJdfOQQK4eIC45dHl-Qv7DipToGpuj1n4WeuG5_LDP1HKn31w5Cn1lOux'
genius = lyricsgenius.Genius(genius_TOKEN)
genius.verbose = False
genius.remove_section_headers = True

In [3]:
movies = pd.read_csv('merged_data.csv')

In [4]:
def listify(df):
    genres = []
    languages = []
    countries = []
    for x,y,z in zip(df['Genres'],df['Languages'],df['Countries']):
        g = re.findall(': \"(.*?)\"', x)
        l = re.findall(': \"(.*?)\"', y)
        c = re.findall(': \"(.*?)\"', z)
        genres.append(g)
        languages.append(l)
        countries.append(c)
    df['Genres'] = genres
    df['Languages'] = languages
    df['Countries'] = countries

In [5]:
listify(movies)

In [6]:
vectorizer = TfidfVectorizer(max_features=50000, max_df=0.8, min_df=20, norm='l2')
tokenizer = vectorizer.build_tokenizer()

In [7]:
movies['toks'] = [tokenizer(summary) for summary in movies['Summary']]

In [8]:
#is_english = ['English Language' in m for m in movies['Languages']]
#movies = movies[is_english]

In [9]:
'''years = []
for x in movies['ReleaseDate']:
    x = str(x)[:4]
    if x != 'nan':
        years.append(int(x))
    else:
        years.append(3000)
movies['ReleaseDate'] = years'''

"years = []\nfor x in movies['ReleaseDate']:\n    x = str(x)[:4]\n    if x != 'nan':\n        years.append(int(x))\n    else:\n        years.append(3000)\nmovies['ReleaseDate'] = years"

In [10]:
#movies = movies[movies['ReleaseDate']>=1960]  

In [11]:
#movies = movies.reset_index()
#movies = movies.drop(['index'], axis=1)

In [12]:
movies.head()

Unnamed: 0,WikiID,Title,ReleaseDate,Runtime,Languages,Countries,Genres,Summary,compound,pos,neg,neu,toks
0,975900,Ghosts of Mars,2001-08-24,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th...",-0.9913,0.065,0.15,0.786,"[Set, in, the, second, half, of, the, 22nd, ce..."
1,9363483,White Of The Eye,1987,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...,-0.9985,0.078,0.198,0.724,"[series, of, murders, of, rich, young, women, ..."
2,261236,A Woman in Flames,1983,106.0,[German Language],[Germany],[Drama],"Eva, an upper class housewife, becomes frustra...",0.9604,0.1,0.065,0.835,"[Eva, an, upper, class, housewife, becomes, fr..."
3,18998739,The Sorcerer's Apprentice,2002,86.0,[English Language],[South Africa],"[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns...",-0.8885,0.092,0.131,0.776,"[Every, hundred, years, the, evil, Morgana, re..."
4,6631279,Little city,1997-04-04,93.0,[English Language],[United States of America],"[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a...",-0.7097,0.089,0.108,0.804,"[Adam, San, Francisco, based, artist, who, wor..."


In [13]:
num_movies = len(movies)
num_movies

42204

In [14]:
def build_inverted_index(movies):
    word_set = []
    for t in movies['toks']:
        word_set+=t
    word_set = set(word_set)
    word_dict = {w: [] for w in word_set}
    for i in range(len(movies)):
        for w in set(movies['toks'][i]):
            if w in movies['toks'][i]:
                word_dict[w].append((i,movies['toks'][i].count(w)))
    return word_dict

In [15]:
#took long time to run, only needs to be done once
#inv_idx = build_inverted_index(movies)
#np.save('inv_idx.npy', inv_idx) 

In [16]:
inv_idx = np.load('inv_idx.npy',allow_pickle='TRUE').item()

In [37]:
with open('inv_idx.pkl', 'wb') as f:
    pickle.dump(inv_idx, f, pickle.HIGHEST_PROTOCOL)

In [39]:
with open('inv_idx.pkl', 'rb') as f:
     d = pickle.load(f)

In [32]:
with open('inv_idx.txt', 'w') as file:
    json.dump(inv_idx, file)

In [33]:
with open('inv_idx.txt', 'r') as file:
    new_d = json.load(file)

In [17]:
def compute_idf(inv_idx, n_docs, min_df=20, max_df_ratio=0.8):
    idf = {x: math.log2(n_docs/(1+len(inv_idx[x]))) for x in inv_idx 
           if len(inv_idx[x])>=min_df and len(inv_idx[x])/n_docs<=max_df_ratio}
    return idf

In [18]:
idf = compute_idf(inv_idx,num_movies)

In [19]:
def compute_doc_norms(index, idf, n_docs):
    norms_sq = np.zeros(n_docs)
    for t in idf:
        for (doc,cnt) in index[t]:
            norms_sq[doc] += (cnt*idf[t])**2
    return np.sqrt(norms_sq)

In [20]:
#norms = compute_doc_norms(inv_idx, idf, num_movies)
#np.savetxt('norms.csv', norms, delimiter=',')

In [21]:
norms = np.loadtxt('norms.csv', delimiter=',')

In [22]:
def cleanjson(result):
    title = result[result.find("Title")+8:result.find("Year")-3]
    plot = result[result.find("Plot")+7:result.find("Language")-3]
    try:
        review_imdb = float(result[result.find(
        '"Internet Movie Database","Value":"')+35:result.find('Source":"Rotten Tomatoes"')-8])
    except:
        review_imdb = -1
    try:
        review_rotten = float(result[result.find(
        'Source":"Rotten Tomatoes","Value":')+35: result.find('},{"Source":"Metacritic"')-2])
    except:
        review_rotten = -1
    return [title, plot, review_imdb, review_rotten]

def response(result):
    text = result[result.find('"Response":' )+12:]
    return text.find('True') >-1

In [23]:
omdb_TOKEN = 'ce887dbd'
def find_movie(movie):
    title = ""
    plot = ""
    query = "http://www.omdbapi.com/?apikey=" + omdb_TOKEN + "&t=" + movie
    params = {"r": "json", "plot": "full"}
    result = requests.get(query, params)
    if response(result.text):
        json = cleanjson(result.text)
        plot = json[1]
        title = json[0]
        review_imdb = json[2]
        review_rotten = json[3]
    else:
        return "ERROR"
    return (title, plot, review_imdb, review_rotten)

In [24]:
#id_to_index = {wikiid:i for i, wikiid in enumerate(movies['WikiID'])}
#title_to_id = {t:i for t, i in zip(movies['Title'], movies['WikiID'])}
#id_to_title = {v:k for k,v in title_to_id.items()}
#title_to_index = {t:i for i, t in enumerate(movies['Title'])}
#index_to_title = {v:k for k,v in title_to_index.items()}
#movie_titles = [t for t in movies['Title']]

In [25]:
def index_search(user_mov):
    movie = find_movie(user_mov)
    query = movie[1]
    #query = ''
    #if movie[0] in movie_titles:
    #    query = movies['Summary'][title_to_index[movie[0]]]
    #else:
    #    query = movie[1]
    scores = np.zeros(len(norms))
    docs = [i for i in range(len(norms))]
    q = query.lower()
    q_tokens = tokenizer(q)
    q_norm_sq = 0
    for t in set(q_tokens):
        if t in idf:
            q_norm_sq += (q_tokens.count(t)*idf[t])**2
            for (doc,cnt) in inv_idx[t]:
                scores[doc] += (q_tokens.count(t)*cnt*idf[t]**2)/norms[doc]
    q_norm = math.sqrt(q_norm_sq)
    new_scores = [score/q_norm for score in scores]
    result = sorted(tuple(zip(new_scores, docs)),reverse=True)
    return result[:10]

In [26]:
results = index_search('Memento')

In [27]:
results

[(0.16056939582947824, 35155),
 (0.15718873462135707, 13330),
 (0.15580892217745576, 37607),
 (0.1485379138540403, 2487),
 (0.14042652859711635, 34573),
 (0.13924014073325505, 17758),
 (0.1351215565899785, 15566),
 (0.1330696150046359, 3533),
 (0.13249725484059963, 41117),
 (0.12548895093423626, 2898)]

In [28]:
def get_10(movie,results):
    ten = []
    for sim, ind in results[:10]:
        if movies['Title'][ind] != movie:
            s = movies['Title'][ind]+' (score: '+str(sim)+')'
            ten.append(s)
    return ten

In [29]:
get_10('Memento',results)

['Frauds (score: 0.16056939582947824)',
 'Duel in the Jungle (score: 0.15718873462135707)',
 'The Fire Raisers (score: 0.15580892217745576)',
 'Cover Up (score: 0.1485379138540403)',
 "Killer's Carnival (score: 0.14042652859711635)",
 'Perdido por perdido (score: 0.13924014073325505)',
 'The Gentleman from Nowhere (score: 0.1351215565899785)',
 'Spring Fever (score: 0.1330696150046359)',
 'Adulterous Wife: Dizzy (score: 0.13249725484059963)',
 'Zombie Girl: The Movie (score: 0.12548895093423626)']

In [112]:
movies['Summary'][35155]

'The film tells the story of insurance investigator Roland Copping and how he interferes with and manipulates the lives of others with outrageous games and gimmicks. Eventually he becomes involved in an escalating vendetta with a couple who make an unusual insurance claim.http://www.imdb.com/title/tt0106963/{{Cite web}}'

In [31]:
doc_by_vocab = vectorizer.fit_transform([s for s in movies['Summary']]).toarray()

In [33]:
def get_summary(movie, data):
    wiki_id = title_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    summary = data['Summary'][ind]
    return summary

In [40]:
m=find_movie('Memento')
m[1]

'Memento chronicles two separate stories of Leonard, an ex-insurance investigator who can no longer build new memories, as he attempts to find the murderer of his wife, which is the last thing he remembers. One story line moves forward in time while the other tells the story backwards revealing more each time.'

In [41]:
get_summary('Memento', movies)

"Undercover San Francisco narcotics cops Sean Kane  and Dave Pierce  head into a dark alley to meet up with an informant by the name of Tony Montoya  who promises to break their big investigation wide open by providing the name of the oriental drug ringleader. Minutes later, Pierce is dead after having been shot, hit by a car, and burned. Kane gets into trouble with his boss, Captain Stevens , for sending one of the killers flying out a third story window to his death in full public view right after the incident. Rather than face discipline, and told to keep his distance by his superiors, Kane now decides to quit the force, and sets out to exact vengeance. Kane is not the only one who is angry; Dave’s girlfriend, reporter Linda Chan , is too, and she vows to bring the drug gang down herself by way of investigative reporting and public exposure. However, when Linda uncovers the secret that Kane and Pierce never found, she, too, is killed. Now Kane is not just angry; he is furious, and s

In [42]:
a = vectorizer.transform(['hello this is a sentence about a woman who fell in love with a prince'])
print(a)
print((a.toarray()[0]))

  (0, 13190)	0.22791152965841327
  (0, 10649)	0.44582976615182407
  (0, 9162)	0.42875427438895203
  (0, 7209)	0.2135558199038125
  (0, 5731)	0.5826518761999901
  (0, 4721)	0.4247029896301851
[0. 0. 0. ... 0. 0. 0.]


In [43]:
len(vectorizer.get_feature_names())

13328

In [44]:
def get_sim(user_mov, other_mov):
    j = doc_by_vocab[title_to_index[other_mov]]
    result = find_movie(user_mov)
    if result[0] in movie_titles:
        i = doc_by_vocab[title_to_index[result[0]]]
    else:
        i = vectorizer.transform([result[1]]).toarray()[0]
    cosim = np.dot(i,j) / (np.linalg.norm(i)*np.linalg.norm(j))
    return cosim

In [45]:
'Parasite' in movie_titles

False

In [46]:
get_sim('Parasite','The Prestige')

0.00903916323571109

In [47]:
def build_movie_sims_cos(n_mov):
    movie_sims = np.ones((n_mov,n_mov))
    for i in range(n_mov):
        for j in range(i):
            sim = get_sim(index_to_title[i],index_to_title[j])
            movie_sims[i][j] = sim
            movie_sims[j][i] = sim
    return movie_sims

In [48]:
build_movie_sims_cos(num_movies)

KeyError: 2

In [45]:
def compare_sim(user_mov):
    sims = np.zeros(num_movies)
    for movie in movie_titles[:100]:
        ind = title_to_index[movie]
        sims[ind] = get_sim(user_mov, movie)
    return sims

In [46]:
compare_sim('Memento')

array([0.01187609, 0.0151769 , 0.01193732, ..., 0.        , 0.        ,
       0.        ])

In [48]:
tokenizer('hello i am a kat doing this stupid fucking project')

['hello', 'am', 'kat', 'doing', 'this', 'stupid', 'fucking', 'project']