In [34]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests

In [15]:
merged = pd.read_csv('merged_kmeans.csv', index_col = 'Title')

In [16]:
movie_data = pd.read_csv('merged_data.csv') 
movie_data = movie_data[['Title','WikiID','Genres', 'Summary','pos','neg', 'compound', 'neu']]

movie_to_id = {}
for t, title in enumerate(movie_data['Title']):
    wiki = movie_data['WikiID'][t]
    movie_to_id[title] = wiki

id_to_index = {}
for i, wikiid in enumerate(movie_data['WikiID']):
    id_to_index[wikiid] = i

def get_genres(movie):
    wiki_id = movie_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    genre = movie_data['Genres'][ind]
    return genre

def get_summary(movie):
    wiki_id = movie_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    summary = movie_data['Summary'][ind]
    return summary

def get_pos(movie):
    wiki_id = movie_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    pos = movie_data['pos'][ind]
    return pos

def get_neg(movie):
    wiki_id = movie_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    neg = movie_data['neg'][ind]
    return neg

def get_compound(movie):
    wiki_id = movie_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    comp = movie_data['compound'][ind]
    return comp

def get_neu(movie):
    wiki_id = movie_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    neu = movie_data['neu'][ind]
    return neu
    

In [17]:
movie_to_id['Ghosts of Mars']

975900

In [5]:
 def get_sim(mov1, mov2):
#  need to fix so that the cosine similarity is calculated between the user input movie and the top250
    vectorizer = TfidfVectorizer()
    sum1 = get_summary(mov1)
    sum2 = get_summary(mov2)
    doc = [sum1, sum2]
    tfidf = vectorizer.fit_transform(doc)
    return cosine_similarity(tfidf)[0][1]

In [6]:
get_sim('Ghosts of Mars', 'White Of The Eye')

0.4805725814816514

In [7]:
def compare_sim(mov1):
    cosine_sim = {}
    for title in movie_data['Title']:
        if title == mov1:
            value = 1
        else:
            value = get_sim(mov1, title)
        cosine_sim[title] = value
    return cosine_sim

In [8]:
compare_sim('Ghosts of Mars')

{'Ghosts of Mars': 1,
 'White Of The Eye': 0.4805725814816514,
 'A Woman in Flames': 0.5313727096641037,
 "The Sorcerer's Apprentice": 0.5218574261467509,
 'Little city': 0.35943418338138083,
 'Henry V': 0.6870457242669498,
 'Aaah Belinda': 0.41471452268139547,
 'The Mechanical Monsters': 0.7071802928991574,
 'Mary Poppins': 0.6833357642187593,
 'Die Fahne von Kriwoj Rog': 0.6522142523130374,
 'White on Rice': 0.17887899647107064,
 'Anbu Thozhi': 0.37423853118220257,
 'Baby Boy': 0.4069349562181145,
 'Bindiya Chamkegi': 0.2903529151337281,
 'Vandanam': 0.577643332712523,
 'Anokha Rishta': 0.5169684360604695,
 'Karayilekku Oru Kadal Dooram': 0.34162848296056675,
 'Siam Sunset': 0.3524951755960928,
 'Kausthubham': 0.38941583696443516,
 'Archie: To Riverdale and Back Again': 0.46390340914697537,
 'Troops': 0.5652188190142127,
 'Daddy and Them': 0.09683215394819661,
 'The Gods Must Be Crazy': 0.6672093901223372,
 'Rudo y Cursi': 0.5847254577328114,
 'Kinjite: Forbidden Subjects': 0.4752797

In [9]:
def compare_sum(movie):
    wiki_id1 = movie_to_id.get(movie)
    arr = np.zeros(len(movie_data))
    for wiki_id2 in movie_data['WikiID']:
        ind = id_to_index.get(wiki_id2)
        movie2 = movie_data['Title'][ind]
        if wiki_id2 == wiki_id1:
            value = 1
        else:
            value = get_sim(movie, movie2)
        arr[ind] = value
    return arr

In [11]:
compare_sum('Ghosts of Mars')

array([1.        , 0.48057258, 0.53137271, ..., 0.46422584, 0.61018311,
       0.42643514])

In [21]:
def get_top10(movie):
    top_movie = {}
    arr = compare_sum(movie)
    wiki_id1 = movie_to_id.get(movie)
    top_indices = np.argpartition(-arr, 11)
    list_of_ind = top_indices[:11]
    for ind in list_of_ind:
        title = movie_data['Title'][ind]
        wiki_id2 = movie_data['WikiID'][ind]
        if title != movie:
            top_movie[wiki_id2] = arr[ind]
    return top_movie
        
    

In [22]:
get_top10('Ghosts of Mars')

646009
Alive
180606
The Animatrix
442668
History of the World, Part I
1443093
John Carpenter's The Fog
11665498
REC
3833665
Rodan
410036
Dante's Peak
975900
Ghosts of Mars
3060756
Babel
16921881
Alive
920062
Special Bulletin


{646009: 0.7559182385684845,
 180606: 0.7970804333608187,
 442668: 0.7572619597789535,
 1443093: 0.7669777467682996,
 11665498: 0.7526978466519044,
 3833665: 0.7805833072157894,
 410036: 0.780196166996253,
 3060756: 0.7604826680624014,
 16921881: 0.7559182385684845,
 920062: 0.7716598043174381}

In [31]:
def sorted_top10(movie):
    top10 = get_top10(movie)
    top10 = sorted(top10.items(), key=lambda x: x[1], reverse=True)
    sorted10 = []
    for info in top10: 
        wiki = info[0]
        ind = id_to_index.get(wiki)
        title = movie_data['Title'][ind]
        sorted10.append(title)
    return sorted10
        

In [32]:
sorted_top10('Ghosts of Mars')

646009
Alive
180606
The Animatrix
442668
History of the World, Part I
1443093
John Carpenter's The Fog
11665498
REC
3833665
Rodan
410036
Dante's Peak
975900
Ghosts of Mars
3060756
Babel
16921881
Alive
920062
Special Bulletin


['The Animatrix',
 'Rodan',
 "Dante's Peak",
 'Special Bulletin',
 "John Carpenter's The Fog",
 'Babel',
 'History of the World, Part I',
 'Alive',
 'Alive',
 'REC']

In [3]:
def cleanjson(result):
    title = result[result.find("Title")+8:result.find("Year")-3]
    plot = result[result.find("Plot")+7:result.find("Language")-3]
    review_imdb = float(result[result.find('"Internet Movie Database","Value":"')+35:result.find('Source":"Rotten Tomatoes"')-8])
    review_rotten = float(result[result.find('Source":"Rotten Tomatoes","Value":')+35: result.find('},{"Source":"Metacritic"')-2])
    return [title,plot,review_imdb,review_rotten]

def response(result):
    text = result[result.find('"Response":' )+12:]
    return text.find('True') >-1
        



In [4]:
omdb_TOKEN = 'ce887dbd'
def find_movie(movie):
    output = []
    #find the movie plot summary using IMDB API
    query = "http://www.omdbapi.com/?apikey=" + omdb_TOKEN + "&t=" + movie
    #query1 = "http://www.omdbapi.com/?apikey=" + token + "&s=" + movie
    params = {"r": "json", "plot": "full"}
    result = requests.get(query, params)
    if response(result.text):
        json = cleanjson(result.text)
        plot = json[1]
        title = json[0]
        review_imdb = json[2]
        review_rotten = json[3]
        output.append('Plot of the movie entered : ' + str(plot))
    else:
        output.append("We did not find the movie you searched for. Did you spell it correctly?")




    return output

In [5]:
x = find_movie('Inception')
print(x)

["Plot of the movie entered : Dom Cobb is a skilled thief, the absolute best in the dangerous art of extraction, stealing valuable secrets from deep within the subconscious during the dream state, when the mind is at its most vulnerable. Cobb's rare ability has made him a coveted player in this treacherous new world of corporate espionage, but it has also made him an international fugitive and cost him everything he has ever loved. Now Cobb is being offered a chance at redemption. One last job could give him his life back but only if he can accomplish the impossible - inception. Instead of the perfect heist, Cobb and his team of specialists have to pull off the reverse: their task is not to steal an idea but to plant one. If they succeed, it could be the perfect crime. But no amount of careful planning or expertise can prepare the team for the dangerous enemy that seems to predict their every move. An enemy that only Cobb could have seen coming."]


In [22]:
def create_mov_to_id(data):
    movie_to_id = {}
    for t, title in enumerate(data['Title']):
        wiki = data['WikiID'][t]
        movie_to_id[title] = wiki
    return movie_to_id

def create_id_to_ind(data):
    id_to_index = {}
    for i, wikiid in enumerate(data['WikiID']):
        id_to_index[wikiid] = i
    return id_to_index

def get_summary(movie, data, movie_to_id, id_to_index):
    wiki_id = movie_to_id.get(movie)
    ind = id_to_index.get(wiki_id)
    summary = data['Summary'][ind]
    return summary

def sorted_top10(movie, data, movie_to_id, id_to_index):
    top10 = get_top10(movie, data, movie_to_id, id_to_index)
    top10 = sorted(top10.items(), key=lambda x: x[1], reverse=True)
    sorted10 = []
    for info in top10:
        wiki = info[0]
        ind = id_to_index.get(wiki)
        title = data['Title'][ind]
        sorted10.append(title)
    return sorted10

def get_top10(movie, data, movie_to_id, id_to_index):
    top_movie = {}
    name = movie[0]
    arr = compare_sim(movie, data, movie_to_id, id_to_index)
    wiki_id1 = -1
    if name in movie_to_id:
        wiki_id1 = movie_to_id.get(name)
    top_indices = np.argpartition(-arr, 11)
    list_of_ind = top_indices[:11]
    for ind in list_of_ind:
        title = data['Title'][ind]
        wiki_id2 = data['WikiID'][ind]
        if title != name:
            top_movie[wiki_id2] = arr[ind]
    return top_movie

def compare_sim(movie, data, movie_to_id, id_to_index):
    name = movie[0]
    plot = movie[1]
    wiki_id1 = movie_to_id.get(name)
    arr = np.zeros(len(data))
    for wiki_id2 in data['WikiID']:
        ind = id_to_index.get(wiki_id2)
        movie2 = data['Title'][ind]
        if wiki_id2 == wiki_id1:
            value = 1
        else:
            plot2 = get_summary(movie2, data, movie_to_id, id_to_index)
            value = get_sim(plot, plot2, data, movie_to_id, id_to_index)
        arr[ind] = value
    return arr

def get_sim(plot1, plot2, data, movie_to_id, id_to_index):
    doc = [plot1, plot2]
    tfidf = vectorizer.fit_transform(doc)
    return cosine_similarity(tfidf)[0][1]

In [23]:
def get_movie_cluster(label, movies):
    has_label = movies['k=5']==label
    filtered_movies = movies[has_label]
    return filtered_movies

In [26]:
filtered = get_movie_cluster(3, merged)
filtered = filtered.reset_index()
filtered = filtered.drop(['Unnamed: 0'], axis=1)

In [27]:
filtered.head()

Unnamed: 0,Title,WikiID,Genres,Summary,pos,neg,compound,neu,k=5
0,White on Rice,21926710,"['Romantic comedy', 'Romance Film', 'Comedy', ...",Jimmy ([[Hiroshi Watanabe loves dinosaurs and...,0.21,0.093,0.926,0.697,3
1,Anbu Thozhi,20604092,['Romance Film'],The film opens with a simpleton from a small v...,0.151,0.064,0.9796,0.785,3
2,Anokha Rishta,29528534,"['World cinema', 'Musical', 'Drama', 'Romantic...","Mary, an orphan lives in an orphanage run by n...",0.187,0.055,0.9987,0.758,3
3,Chandra Mukhi,29198000,"['Romance Film', 'Drama']","Chandra Mukhi , a princess of a heavenly kingd...",0.141,0.0,0.6597,0.859,3
4,Chasing Ghosts: Beyond the Arcade,12371532,"['Culture & Society', 'Biography', 'Documentary']","In the 1980s, video games were synonymous with...",0.146,0.0,0.9477,0.854,3


In [30]:
mov_id = create_mov_to_id(filtered)

In [31]:
id_ind = create_id_to_ind(filtered)

In [35]:
vectorizer = TfidfVectorizer()

In [36]:
sorted_top10('Inception', filtered, mov_id, id_ind)

['Hope Ranch',
 'Hibakusha',
 'Post Grad',
 'Read It and Weep',
 'Del rancho a la televisión',
 'Karyam nissaram',
 'Maaro',
 'Bug',
 'Americano',
 'Cherry Chérie',
 'Akkare Ninnoru Maran']