In [1]:
import re
import csv
import math
import json
import numpy as np
import pandas as pd
import pickle
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import lyricsgenius

analyzer = SentimentIntensityAnalyzer()
vectorizer = TfidfVectorizer(max_features=50000, max_df=0.8, min_df=20, norm='l2')
tokenizer = vectorizer.build_tokenizer()

#omdb_TOKEN = 'ce887dbd'
omdb_TOKEN = '4292bf53' #I paid $1 to have higher daily limit of API calls
genius_TOKEN = 'sD0C3epnJdfOQQK4eIC45dHl-Qv7DipToGpuj1n4WeuG5_LDP1HKn31w5Cn1lOux'
genius = lyricsgenius.Genius(genius_TOKEN)
genius.verbose = False
genius.remove_section_headers = True

In [41]:
movies = pd.read_csv('merged_data.csv')

In [42]:
def listify(df):
    genres = []
    languages = []
    countries = []
    for x,y,z in zip(df['Genres'],df['Languages'],df['Countries']):
        g = re.findall(': \"(.*?)\"', x)
        l = re.findall(': \"(.*?)\"', y)
        c = re.findall(': \"(.*?)\"', z)
        genres.append(g)
        languages.append(l)
        countries.append(c)
    #df['Genres'] = genres
    df['Languages'] = languages
    df['Countries'] = countries

In [43]:
listify(movies)

In [44]:
is_english = ['English Language' in m for m in movies['Languages']]

In [46]:
movies = movies[is_english]

In [47]:
movies = movies.reset_index()
movies = movies.drop(['index'], axis=1)

In [48]:
imdb250 = pd.read_csv('imdb250.csv')
imdb250 = imdb250[['Title','imdbRating']]

In [49]:
title_to_index = {t:i for i, t in enumerate(movies['Title'])}

In [50]:
ratings = np.zeros(len(movies))
for mov,rate in tuple(zip(imdb250['Title'],imdb250['imdbRating'])):
    if mov in title_to_index:
        ind = title_to_index[mov]
        ratings[ind] = rate

In [51]:
movies['Rating'] = ratings

In [53]:
movies.to_csv (r'merged_data2.csv', index = False, header=True)

In [54]:
movies['toks'] = [tokenizer(summary) for summary in movies['Summary']]

In [7]:
'''years = []
for x in movies['ReleaseDate']:
    x = str(x)[:4]
    if x != 'nan':
        years.append(int(x))
    else:
        years.append(3000)
movies['ReleaseDate'] = years'''

#movies = movies[movies['ReleaseDate']>=1960]
#movies = movies.reset_index()
#movies = movies.drop(['index'], axis=1)

"years = []\nfor x in movies['ReleaseDate']:\n    x = str(x)[:4]\n    if x != 'nan':\n        years.append(int(x))\n    else:\n        years.append(3000)\nmovies['ReleaseDate'] = years"

In [56]:
movies.head()

Unnamed: 0,WikiID,Title,ReleaseDate,Runtime,Languages,Countries,Genres,Summary,compound,pos,neg,neu,Rating,toks
0,975900,Ghosts of Mars,2001-08-24,98.0,[English Language],[United States of America],"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th...",-0.9913,0.065,0.15,0.786,0.0,"[Set, in, the, second, half, of, the, 22nd, ce..."
1,9363483,White Of The Eye,1987,110.0,[English Language],[United Kingdom],"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...,-0.9985,0.078,0.198,0.724,0.0,"[series, of, murders, of, rich, young, women, ..."
2,18998739,The Sorcerer's Apprentice,2002,86.0,[English Language],[South Africa],"{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns...",-0.8885,0.092,0.131,0.776,0.0,"[Every, hundred, years, the, evil, Morgana, re..."
3,6631279,Little city,1997-04-04,93.0,[English Language],[United States of America],"{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a...",-0.7097,0.089,0.108,0.804,0.0,"[Adam, San, Francisco, based, artist, who, wor..."
4,171005,Henry V,1989-11-08,137.0,[English Language],[United Kingdom],"{""/m/04xvh5"": ""Costume drama"", ""/m/082gq"": ""Wa...",{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...,0.3182,0.065,0.032,0.903,0.0,"[Plot, dateAct, 1Act, 2Act, 3Act, 4Act, Finall..."


In [55]:
num_movies = len(movies)
num_movies

24775

In [75]:
def build_ivdidx_df(movies):
    word_set = []
    for t in movies['toks']:
        word_set+=t
    word_set = set(word_set)
    word_dict = {w: [[],[]] for w in word_set}
    for i in range(len(movies)):
        tokens = movies['toks'][i]
        for w in set(tokens):
            cw = tokens.count(w)
            word_dict[w][0].append(i)
            word_dict[w][1].append(cw)
    df = pd.DataFrame.from_dict(word_dict, orient='index')
    return df

In [76]:
def build_inverted_index(movies):
    word_set = []
    for t in movies['toks']:
        word_set+=t
    word_set = set(word_set)
    word_dict = {w: [] for w in word_set}
    for i in range(len(movies)):
        for w in set(movies['toks'][i]):
            if w in movies['toks'][i]:
                word_dict[w].append((i,movies['toks'][i].count(w)))
    return word_dict

In [77]:
#took long time to run, only needs to be done once
#inv_idx = build_inverted_index(movies)
#np.save('inv_idx.npy', inv_idx)

In [57]:
inv_idx = np.load('inv_idx.npy',allow_pickle='TRUE').item()

In [58]:
idf = {x: math.log2(num_movies/(1+len(inv_idx[x]))) for x in inv_idx if len(inv_idx[x])>=20 and len(inv_idx[x])/num_movies<=0.8}

In [59]:
def compute_doc_norms(index, idf, n_docs):
    norms_sq = np.zeros(n_docs)
    for t in idf:
        for (doc,cnt) in index[t]:
            norms_sq[doc] += (cnt*idf[t])**2
    return np.sqrt(norms_sq)

In [60]:
#norms = compute_doc_norms(inv_idx, idf, num_movies)
#np.savetxt('norms.csv', norms, delimiter=',')

In [61]:
norms = np.loadtxt('norms.csv', delimiter=',')

In [62]:
def cleanjson(result):
    title = result[result.find("Title")+8:result.find("Year")-3]
    plot = result[result.find("Plot")+7:result.find("Language")-3]
    try:
        review_imdb = float(result[result.find(
        '"Internet Movie Database","Value":"')+35:result.find('Source":"Rotten Tomatoes"')-8])
    except:
        review_imdb = 0
    try:
        review_rotten = float(result[result.find(
        'Source":"Rotten Tomatoes","Value":')+35: result.find('},{"Source":"Metacritic"')-2])
    except:
        review_rotten = 0
    return [title, plot, review_imdb, review_rotten]

def response(result):
    text = result[result.find('"Response":' )+12:]
    return text.find('True') >-1

def find_movie(movie):
    title = ""
    plot = ""
    query = "http://www.omdbapi.com/?apikey=" + omdb_TOKEN + "&t=" + movie
    params = {"r": "json", "plot": "full"}
    result = requests.get(query, params)
    if response(result.text):
        json = cleanjson(result.text)
        plot = json[1]
        title = json[0]
        review_imdb = json[2]
        review_rotten = json[3]
    else:
        return 'ERROR'
    return (title, plot, review_imdb, review_rotten)

In [63]:
#def get_sent_dist(p1, n1):
def get_sent_dist(comp):
    dists = []
    for c in movies['compound']:
        dists.append(abs(float(c)-comp))
    #for p2,n2 in tuple(zip(movies['pos'], movies['neg'])):
    #    dists.append(math.sqrt((p2 - p1)**2 + (n2 - n1)**2))
    dists = max(dists)*np.ones(len(dists))-dists
    return dists

In [64]:
x = genius.search_song('happier','bastille')
s = analyzer.polarity_scores(x.lyrics)
print(s)
pos = s['pos']
neg = s['neg']
comp = s['compound']
#dists1 = np.asarray(get_sent_dist(pos,neg))
dists1 = np.asarray(get_sent_dist(comp))
max(dists1)

{'neg': 0.036, 'neu': 0.768, 'pos': 0.196, 'compound': 0.9966}


1.9965000000000002

In [65]:
x = genius.search_song('warning','eminem')
s = analyzer.polarity_scores(x.lyrics)
print(s)
pos = s['pos']
neg = s['neg']
comp = s['compound']
#dists2 = np.asarray(get_sent_dist(pos,neg))
dists2 = np.asarray(get_sent_dist(comp))
max(dists2)

{'neg': 0.127, 'neu': 0.788, 'pos': 0.085, 'compound': -0.9868}


1.9867

In [29]:
q = "Dom Cobb is a skilled thief, the absolute best in the dangerous art of extraction, stealing valuable secrets from deep within the subconscious during the dream state, when the mind is at its most vulnerable. Cobb's rare ability has made him a coveted player in this treacherous new world of corporate espionage, but it has also made him an international fugitive and cost him everything he has ever loved. Now Cobb is being offered a chance at redemption. One last job could give him his life back but only if he can accomplish the impossible - inception. Instead of the perfect heist, Cobb and his team of specialists have to pull off the reverse: their task is not to steal an idea but to plant one. If they succeed, it could be the perfect crime. But no amount of careful planning or expertise can prepare the team for the dangerous enemy that seems to predict their every move. An enemy that only Cobb could have seen coming."

In [39]:
def get_scores(query,dists):
    scores = np.zeros(len(norms))
    docs = [i for i in range(len(norms))]
    q = query.lower()
    q_tokens = tokenizer(q)
    q_norm_sq = 0
    for t in set(q_tokens):
        if t in idf:
            q_norm_sq += (q_tokens.count(t)*idf[t])**2
            for (doc,cnt) in inv_idx[t]:
                scores[doc] += (q_tokens.count(t)*cnt*idf[t]**2)/norms[doc]
    q_norm = math.sqrt(q_norm_sq)
    scores = np.asarray([score/q_norm for score in scores])
    dists = np.asarray(dists)
    total_scores = (2*scores+.1*dists)
    #total_scores = (2.5*scores+1*dists)
    result = sorted(tuple(zip(total_scores, docs)),reverse=True)
    return result[:6]

In [50]:
m = find_movie('Memento')
m[1]

'R'

In [40]:
res1 = get_scores(q,dists1)
res2 = get_scores(q,dists2)

In [41]:
res1[:15]

[(0.47000657721327344, 34741),
 (0.40915635146129875, 31082),
 (0.3978284875412698, 3389),
 (0.3685231797157522, 35988),
 (0.367194166507382, 15134),
 (0.36618799397390317, 8473)]

In [42]:
res2[:15]

[(0.44853776594019956, 36267),
 (0.4239304336302482, 28374),
 (0.40701707225395467, 14441),
 (0.398165340861654, 4034),
 (0.39652776314841087, 21965),
 (0.3903999763151464, 23285)]

In [46]:
j = 4034
print(movies['Title'][j])
print(movies['Summary'][j])
print(movies['pos'][j])
print(movies['neg'][j])
print(movies['compound'][j])
print(dists1[j])
print(dists2[j])

The Thief and the Cobbler
 The film opens with a narrator describing a prosperous city called the Golden City, which is ruled by the sleepy King Nod and protected by three golden balls positioned atop its tallest minaret. According to a prophecy, the city would fall to a race of warlike, one-eyed monsters referred to as "one-eyes" should the balls be removed, and could only be saved by "the simplest soul with the smallest and simplest of things". Living in the city are a humble and good-hearted cobbler named Tack and a nameless, unsuccessful yet persistent thief. Both characters are mute and have no dialogue. When the thief tries his luck in Tack's house, the two scuffle and stumble onto the street, causing Tack's tacks to fall out onto the street while Zigzag, King Nod's Grand Vizier, walks through it. Zigzag steps on one of the tacks and orders Tack to be arrested while the thief escapes. Tack is brought before King Nod and his daughter, Princess Yum-Yum, who takes an instant liking 

In [57]:
def print_ten(results):
    ten = []
    i = 1
    for (score,ind) in results:
        if movies['Title'][ind] != movie:
            ten.append(str(i)+'.')
            ten.append(movies['Title'][ind])
            ten.append('Score: '+str(score))
            i+=1
    return ten

In [1]:
import csv
import math
import json
import numpy as np
import pandas as pd
import pickle
import requests
import lyricsgenius
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
genius_TOKEN = 'sD0C3epnJdfOQQK4eIC45dHl-Qv7DipToGpuj1n4WeuG5_LDP1HKn31w5Cn1lOux'
genius = lyricsgenius.Genius(genius_TOKEN)
genius.verbose = False
genius.remove_section_headers = True
omdb_TOKEN = 'ce887dbd'
analyzer = SentimentIntensityAnalyzer()
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, max_df=0.8, min_df=20, norm='l2')
tokenizer = vectorizer.build_tokenizer()

movies = pd.read_csv('merged_data.csv')
num_movies = len(movies)
norms = np.loadtxt('norms.csv', delimiter=',')
inv_idx = pd.read_csv('inv_idx.csv')
inv_idx.columns = ['word','docs','counts']
z = tuple(zip(inv_idx['word'],inv_idx['docs']))
idf = {a: math.log2(num_movies/(1+len(b))) for (a,b) in z if len(b)>=20 and len(b)/num_movies<=0.8}
word_to_index = {word:i for i, word in enumerate(inv_idx['word'])}
docs = [d.strip('[]').split(', ') for d in inv_idx['docs']]
inv_idx['docs'] = docs
counts = [c.strip('[]').split(', ') for c in inv_idx['counts']]
inv_idx['counts'] = counts

In [7]:
def find_music(artist, song=''):
    if song != '':
        result = genius.search_song(song, artist)
    else:
        result = genius.search_artist(artist, max_songs=3)
    return result

def find_movie(movie):
    title = ""
    plot = ""
    query = "http://www.omdbapi.com/?apikey=" + omdb_TOKEN + "&t=" + movie
    params = {"r": "json", "plot": "full"}
    result = requests.get(query, params)
    if response(result.text):
        json = cleanjson(result.text)
        plot = json[1]
        title = json[0]
        review_imdb = json[2]
        review_rotten = json[3]
    else:
        return "ERROR"
    return (title, plot, review_imdb, review_rotten)

def cleanjson(result):
    title = result[result.find("Title")+8:result.find("Year")-3]
    plot = result[result.find("Plot")+7:result.find("Language")-3]
    try:
        review_imdb = float(result[result.find(
        '"Internet Movie Database","Value":"')+35:result.find('Source":"Rotten Tomatoes"')-8])
    except:
        review_imdb = -1
    try:
        review_rotten = float(result[result.find(
        'Source":"Rotten Tomatoes","Value":')+35: result.find('},{"Source":"Metacritic"')-2])
    except:
        review_rotten = -1
    return [title, plot, review_imdb, review_rotten]

def response(result):
    text = result[result.find('"Response":')+12:]
    return text.find('True') > -1


In [8]:
def index_search(query):
    scores = np.zeros(len(norms))
    #docs = [i for i in range(len(norms))]
    q = query.lower()
    q_tokens = tokenizer(q)
    q_norm_sq = 0
    for t in set(q_tokens):
        if t in idf:
            ind = word_to_index[t]
            q_norm_sq += (q_tokens.count(t)*idf[t])**2
            for (doc,cnt) in tuple(zip(inv_idx['docs'][ind],inv_idx['counts'][ind])):
                #print(doc,cnt)
                doc = int(doc)
                cnt = int(cnt)
                scores[doc] += (q_tokens.count(t)*cnt*idf[t]**2)/norms[doc]
    q_norm = math.sqrt(q_norm_sq)
    new_scores = [score/q_norm for score in scores]
    #pos = [x for x in movies['pos']]
    #neg = [x for x in movies['neg']]
    #result = sorted(tuple(zip(new_scores,docs)),reverse=True)
    result = new_scores
    return result

In [9]:
def get_sent_dist(p1, n1):
    dist = []
    for p2,n2 in tuple(zip(movies['pos'], movies['neg'])):
        dist.append(math.sqrt((p2 - p1)**2 + (n2 - n1)**2))
    return dist

In [43]:
dists = get_sent_dist(pos,neg)

In [44]:
sum(dists)/len(dists)

0.0850561106504346

In [53]:
def get_10(movie,dists,cosims):
    docs = [i for i in range(len(norms))]
    scores = [10*float(c) - 5*float(d) for c,d in zip(cosims,dists)]
    results = sorted(tuple(zip(scores,docs)),reverse=True)
    ten = []
    i = 1
    for (score,ind) in results[:10]:
        if movies['Title'][ind] != movie:
            ten.append(str(i)+'.')
            ten.append(movies['Title'][ind])
            ten.append('Score: '+str(score))
            i+=1
    return ten

In [54]:
movie_result = find_movie('Memento')

In [55]:
cosims = index_search(movie_result[1])

In [56]:
max(cosims)

0.10714571971754802

In [57]:
get_10('Memento',dists,cosims)

['1.',
 'Adulterous Wife: Dizzy',
 'Score: 0.7238764082659017',
 '2.',
 'Spring Fever',
 'Score: 0.6902394545606649',
 '3.',
 'Desperately Seeking Brandi',
 'Score: 0.6808735904564845',
 '4.',
 'Duel in the Jungle',
 'Score: 0.6664263341539075',
 '5.',
 'Dunia Baru: The Movie',
 'Score: 0.6340529737971887',
 '6.',
 'Alias Jesse James',
 'Score: 0.5293770888334492',
 '7.',
 'Frauds',
 'Score: 0.5034846207761867',
 '8.',
 'Speedtrap',
 'Score: 0.472347576417701',
 '9.',
 'Komaram Bheem',
 'Score: 0.4677012323835123',
 '10.',
 "Mother Krause's Journey to Happiness",
 'Score: 0.4454675881395035']

In [58]:
find_movie('Adulterous Wife: Dizzy')[1]

'R'

In [35]:
import re
import csv
import math
import json
import numpy as np
import pandas as pd
import pickle
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
import lyricsgenius
genius_TOKEN = 'sD0C3epnJdfOQQK4eIC45dHl-Qv7DipToGpuj1n4WeuG5_LDP1HKn31w5Cn1lOux'
genius = lyricsgenius.Genius(genius_TOKEN)
genius.verbose = False
genius.remove_section_headers = True

In [37]:
movies = pd.read_csv('merged_data.csv')

In [38]:
def listify(df):
    genres = []
    languages = []
    countries = []
    for x,y,z in zip(df['Genres'],df['Languages'],df['Countries']):
        g = re.findall(': \"(.*?)\"', x)
        l = re.findall(': \"(.*?)\"', y)
        c = re.findall(': \"(.*?)\"', z)
        genres.append(g)
        languages.append(l)
        countries.append(c)
    df['Genres'] = genres
    df['Languages'] = languages
    df['Countries'] = countries
    return df

In [39]:
#movies = listify(movies)

In [55]:
'''years = []
for x in movies['ReleaseDate']:
    x = str(x)[:4]
    if x != 'nan':
        years.append(int(x))
    else:
        years.append(3000)
movies['ReleaseDate'] = years'''

In [44]:
#movies = movies[movies['ReleaseDate']>=1960]  

In [40]:
vectorizer = TfidfVectorizer(max_features=50000, max_df=0.8, min_df=20, norm='l2')
tokenizer = vectorizer.build_tokenizer()

In [41]:
movies['toks'] = [tokenizer(summary) for summary in movies['Summary']]

In [56]:
is_english = ['English Language' in m for m in movies['Languages']]
movies = movies[is_english]

In [58]:
movies = movies.reset_index()
movies = movies.drop(['index'], axis=1)

In [59]:
movies.head()

Unnamed: 0,WikiID,Title,ReleaseDate,Runtime,Languages,Countries,Genres,Summary,compound,pos,neg,neu,toks
0,975900,Ghosts of Mars,2001,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th...",-0.9913,0.065,0.15,0.786,"[Set, in, the, second, half, of, the, 22nd, ce..."
1,9363483,White Of The Eye,1987,110.0,[English Language],[United Kingdom],"[Thriller, Erotic thriller, Psychological thri...",A series of murders of rich young women throug...,-0.9985,0.078,0.198,0.724,"[series, of, murders, of, rich, young, women, ..."
2,18998739,The Sorcerer's Apprentice,2002,86.0,[English Language],[South Africa],"[Family Film, Fantasy, Adventure, World cinema]","Every hundred years, the evil Morgana returns...",-0.8885,0.092,0.131,0.776,"[Every, hundred, years, the, evil, Morgana, re..."
3,6631279,Little city,1997,93.0,[English Language],[United States of America],"[Romantic comedy, Ensemble Film, Comedy-drama,...","Adam, a San Francisco-based artist who works a...",-0.7097,0.089,0.108,0.804,"[Adam, San, Francisco, based, artist, who, wor..."
4,171005,Henry V,1989,137.0,[English Language],[United Kingdom],"[Costume drama, War film, Epic, Period piece, ...",{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...,0.3182,0.065,0.032,0.903,"[Plot, dateAct, 1Act, 2Act, 3Act, 4Act, Finall..."


In [60]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [61]:
s = genius.search_song('XO','eden')
x = analyzer.polarity_scores(s.lyrics)
x1 = x['pos']
y1 = x['neg']

In [62]:
dist = []
for x2,y2 in tuple(zip(movies['pos'], movies['neg'])):
    dist.append(math.sqrt((x1 - x2)**2 + (y1 - y2)**2))

In [63]:
num_movies = len(movies)
num_movies

24775

In [65]:
def build_inverted_index(movies):
    word_set = []
    for t in movies['toks']:
        word_set+=t
    word_set = set(word_set)
    word_dict = {w: [] for w in word_set}
    for i in range(len(movies)):
        for w in set(movies['toks'][i]):
            if w in movies['toks'][i]:
                word_dict[w].append((i,movies['toks'][i].count(w)))
    return word_dict

In [66]:
#took long time to run, only needs to be done once
#idx = build_inverted_index(movies)

In [99]:
norms = np.loadtxt('norms.csv', delimiter=',')

In [114]:
inv_idx = pd.read_csv('inv_idx.csv')

In [115]:
inv_idx.columns = ['word','docs','counts']
inv_idx.head()

Unnamed: 0,word,docs,counts
0,Bacolod,[22137],[1]
1,cities,"[363, 711, 843, 1029, 1037, 1348, 1532, 1765, ...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,sympathising,[11878],[1]
3,Obliged,"[4784, 12949, 13936, 15556]","[1, 1, 1, 1]"
4,Nyla,"[4234, 15129, 25097, 39512]","[28, 1, 2, 2]"


In [116]:
docs = [d.strip('[]').split(', ') for d in inv_idx['docs']]
inv_idx['docs'] = docs

In [117]:
counts = [c.strip('[]').split(', ') for c in inv_idx['counts']]
inv_idx['counts'] = counts

In [119]:
type(inv_idx['docs'][72726])

str

In [96]:
word_to_index = {word:i for i, word in enumerate(inv_idx['word'])}

In [94]:
z = tuple(zip(inv_idx['word'],inv_idx['docs']))
idf = {a: math.log2(num_movies/(1+len(b))) for (a,b) in z if len(b)>=20 and len(b)/num_movies<=0.8}

In [120]:
def compute_doc_norms(index, idf, n_docs):
    norms_sq = np.zeros(n_docs)
    for t in idf:
        for (doc,cnt) in index[t]:
            norms_sq[doc] += (cnt*idf[t])**2
    return np.sqrt(norms_sq)

In [121]:
#norms = compute_doc_norms(inv_idx, idf, num_movies)
#np.savetxt('norms.csv', norms, delimiter=',')

In [122]:
def cleanjson(result):
    title = result[result.find("Title")+8:result.find("Year")-3]
    plot = result[result.find("Plot")+7:result.find("Language")-3]
    try:
        review_imdb = float(result[result.find(
        '"Internet Movie Database","Value":"')+35:result.find('Source":"Rotten Tomatoes"')-8])
    except:
        review_imdb = -1
    try:
        review_rotten = float(result[result.find(
        'Source":"Rotten Tomatoes","Value":')+35: result.find('},{"Source":"Metacritic"')-2])
    except:
        review_rotten = -1
    return [title, plot, review_imdb, review_rotten]

def response(result):
    text = result[result.find('"Response":' )+12:]
    return text.find('True') >-1

In [123]:
omdb_TOKEN = 'ce887dbd'
def find_movie(movie):
    title = ""
    plot = ""
    query = "http://www.omdbapi.com/?apikey=" + omdb_TOKEN + "&t=" + movie
    params = {"r": "json", "plot": "full"}
    result = requests.get(query, params)
    if response(result.text):
        json = cleanjson(result.text)
        plot = json[1]
        title = json[0]
        review_imdb = json[2]
        review_rotten = json[3]
    else:
        return "ERROR"
    return (title, plot, review_imdb, review_rotten)

In [124]:
def index_search(query):
    scores = np.zeros(len(norms))
    #docs = [i for i in range(len(norms))]
    q = query.lower()
    q_tokens = tokenizer(q)
    q_norm_sq = 0
    for t in set(q_tokens):
        if t in idf:
            ind = word_to_index[t]
            q_norm_sq += (q_tokens.count(t)*idf[t])**2
            for (doc,cnt) in tuple(zip(inv_idx['docs'][ind],inv_idx['counts'][ind])):
                #print(doc,cnt)
                doc = int(doc)
                cnt = int(cnt)
                scores[doc] += (q_tokens.count(t)*cnt*idf[t]**2)/norms[doc]
    q_norm = math.sqrt(q_norm_sq)
    new_scores = [score/q_norm for score in scores]
    #pos = [x for x in movies['pos']]
    #neg = [x for x in movies['neg']]
    #result = sorted(tuple(zip(new_scores,docs)),reverse=True)
    result = new_scores
    return result

In [125]:
def get_sent_dist(p1, n1):
    dist = []
    for p2,n2 in tuple(zip(movies['pos'], movies['neg'])):
        dist.append(math.sqrt((p2 - p1)**2 + (n2 - n1)**2))
    return dist

def get_10(movie,dists,cosims):
    docs = [i for i in range(len(norms))]
    scores = [1*float(c) - 100*float(d) for c,d in zip(dists,cosims)]
    results = sorted(tuple(zip(scores,docs)),reverse=True)
    ten = []
    i = 1
    for (score,ind) in results[:10]:
        if movies['Title'][ind] != movie:
            ten.append(str(i)+'.')
            ten.append(movies['Title'][ind])
            ten.append('Score: '+str(score))
            i+=1
    return ten

In [126]:
results = index_search('Memento')

ValueError: invalid literal for int() with base 10: '['

In [32]:
results

[(0.16056939582947827, 35155),
 (0.15718873462135707, 13330),
 (0.15580892217745576, 37607),
 (0.1485379138540403, 2487),
 (0.14042652859711635, 34573),
 (0.13924014073325508, 17758),
 (0.13512155658997846, 15566),
 (0.1330696150046359, 3533),
 (0.13249725484059963, 41117),
 (0.12548895093423626, 2898)]

In [28]:
def get_10(movie,results):
    ten = []
    for sim, ind in results[:10]:
        if movies['Title'][ind] != movie:
            s = movies['Title'][ind]+' (score: '+str(sim)+')'
            ten.append(s)
    return ten

In [29]:
get_10('Memento',results)

['Frauds (score: 0.16056939582947824)',
 'Duel in the Jungle (score: 0.15718873462135707)',
 'The Fire Raisers (score: 0.15580892217745576)',
 'Cover Up (score: 0.1485379138540403)',
 "Killer's Carnival (score: 0.14042652859711635)",
 'Perdido por perdido (score: 0.13924014073325505)',
 'The Gentleman from Nowhere (score: 0.1351215565899785)',
 'Spring Fever (score: 0.1330696150046359)',
 'Adulterous Wife: Dizzy (score: 0.13249725484059963)',
 'Zombie Girl: The Movie (score: 0.12548895093423626)']

In [1]:
movies['Title'][35155]

NameError: name 'movies' is not defined

In [24]:
#id_to_index = {wikiid:i for i, wikiid in enumerate(movies['WikiID'])}
#title_to_id = {t:i for t, i in zip(movies['Title'], movies['WikiID'])}
#id_to_title = {v:k for k,v in title_to_id.items()}
#title_to_index = {t:i for i, t in enumerate(movies['Title'])}
#index_to_title = {v:k for k,v in title_to_index.items()}
#movie_titles = [t for t in movies['Title']]

In [31]:
doc_by_vocab = vectorizer.fit_transform([s for s in movies['Summary']]).toarray()

In [33]:
def get_summary(movie, data):
    wiki_id = title_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    summary = data['Summary'][ind]
    return summary

In [44]:
def get_sim(user_mov, other_mov):
    j = doc_by_vocab[title_to_index[other_mov]]
    result = find_movie(user_mov)
    if result[0] in movie_titles:
        i = doc_by_vocab[title_to_index[result[0]]]
    else:
        i = vectorizer.transform([result[1]]).toarray()[0]
    cosim = np.dot(i,j) / (np.linalg.norm(i)*np.linalg.norm(j))
    return cosim