In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import json
import nltk
import string
import urllib
from pprint import pprint
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.wsd import lesk
#from nltk.parse import CoreNLPParser
from nltk.corpus import stopwords
#from nltk.parse.corenlp import CoreNLPDependencyParser

In [None]:
stop_words = stopwords.words('english') + list(string.punctuation)
#print(stop_words)
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [4]:
def Tokenization(sentence):
    tokens = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]
    return tokens

In [5]:
# Obtains hypernyms
def WordNetHypernyms(sentence, word_tokens):
    # Creates dictionary for hypernyms
    hypernyms_list = []
    
    # Populates the above dictionaries according to the word senses associated with them
    for token in word_tokens:
        # Extracts best sense for each word using LESK
        best_sense = lesk(sentence, token)
        # token_synset = wn.synset(token)
        # hypernyms_list.append(token_synset.hypernyms()[0].lemmas()[0].name())
        
        if best_sense is not None:
            # Obtains Hypernyms
            if best_sense.hypernyms() != []:
                hypernyms_list.append(best_sense.hypernyms()[0].lemmas()[0].name())
            
    return hypernyms_list

In [6]:
def NLP_Pipeline(sentence):
    word_tokens = Tokenization(sentence)
    hypernyms = WordNetHypernyms(sentence, word_tokens)
    return hypernyms

In [7]:
# sample_text = "A soldier-of-fortune steals some Russian nerve gas from Afghanistan, and brings it to the U.S. to be analyzed. A greedy millionaire rancher finds out about it and sets out to steal it."
# print(sample_text)
# print(NLP_Pipeline(sample_text))

A soldier-of-fortune steals some Russian nerve gas from Afghanistan, and brings it to the U.S. to be analyzed. A greedy millionaire rancher finds out about it and sets out to steal it.
['gain', 'native', 'brace', 'fossil_fuel', 'change', 'rich_person', 'farmer', 'pronounce', 'representation', 'gain']


In [7]:
movie_list = ['2012', 'A Beautiful Mind', 'Amadeus', 'Avatar', 'Clash of the Titans', 'Les Miserables', 'Star Wars', 'The Expendables', 'The Godfather', 'The Matrix Revolutions']

In [8]:
counts_dict = {}
movies_dict = {}

In [13]:
for i in movie_list:
    folder_name = 'Data/' + i
    review_content = ''
    hypernyms_list = []
    for num in range(1, 100):
        #print('read file: ' + str(num))
        file_name = folder_name + "/" + str(num) + ".txt"
        with open(file_name) as f:
            file_contents = f.read()
            review_content += file_contents
    hypernyms_list = NLP_Pipeline(review_content)
    movies_dict[i] = hypernyms_list
    for hypernym in hypernyms_list:
        #print(hypernym)
        counts_dict[hypernym] = counts_dict.get(hypernym, 0) + 1


In [14]:
counts_dict_sorted = sorted(counts_dict, key=counts_dict.get, reverse=True)
top_100_counts = counts_dict_sorted[0:100]
print(len(top_100_counts))

100


In [16]:
movies_concepts_array = np.zeros(shape=(len(movies_dict), len(top_100_counts)))
row_number = -1
for movie in movies_dict:
    row_number = row_number + 1
    col_number = -1
    concepts_list = movies_dict[movie]
    for concept in top_100_counts:
        col_number = col_number + 1
        if concept in concepts_list:
            movies_concepts_array[row_number][col_number] = 1

In [17]:
from scipy import spatial

def cosine_similarity(movie_num_1, movie_num_2):
    result = 1- spatial.distance.cosine(movies_concepts_array[movie_num_1], movies_concepts_array[movie_num_2])
    return result

In [19]:
ratings_matrix = np.zeros(shape=(len(movies_dict), 100))

for i in range(len(movie_list)):
    file_name = 'Data/' + movie_list[i] + '/rating.txt'
    with open(file_name) as f:
        file_contents = f.read()
        file_contents = file_contents.split(' = ')
        file_contents = file_contents[1].split(' , ')
        for j in range(100):
            if j == 0:
                temp = file_contents[j].split(' ')
                ratings_matrix[i][j] = temp[1]
            elif j == 99:
                temp = file_contents[j].split(' ')
                ratings_matrix[i][j] = temp[0]
            else:
                ratings_matrix[i][j] = file_contents[j]

In [20]:
def ratings_prediction(movie_name, user_number):
    if movie_name not in movie_list:
        return None
    if user_number > len(ratings_matrix[0]):
        return None
    
    for i in range(len(movie_list)):
        if movie_name == movie_list[i]:
            movie_number = i

    pred_rating = 0

    for i in range(len(movie_list)):
        if movie_list[i] == movie_name:
            continue
        else:
            #print(str(cosine_similarity(movie_number, i)) + "\t" + str(ratings_matrix[i][user_number - 1]))
            pred_rating = pred_rating + cosine_similarity(movie_number, i) * ratings_matrix[i][user_number - 1]
    
    return float(pred_rating/ (len(movie_list)-1))

In [25]:
# VERY HIGH SIMILARITY VALUE
movie_id_1 = 1              # A Beautiful Mind
movie_id_2 = 7              # The Expendables
print(cosine_similarity(1, 7))

0.9794334923752579


In [None]:
ratings_prediction('2012', 5)