In [1]:
############## MAKING ALL NECESSARY IMPORTS #################

In [2]:
import warnings
warnings.filterwarnings("ignore")
import csv
import math
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)
import json
import nltk
import string
import urllib
from pprint import pprint
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.wsd import lesk
#from nltk.parse import CoreNLPParser
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split
#from nltk.parse.corenlp import CoreNLPDependencyParser

In [4]:
################ NLP PIPELINE ######################

In [5]:
stop_words = stopwords.words('english') + list(string.punctuation)
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [6]:
def Tokenization(sentence):
    tokens = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]
    return tokens

In [7]:
# Obtains hypernyms
def WordNetHypernyms(sentence, word_tokens):
    # Creates dictionary for hypernyms
    hypernyms_list = []
    
    # Populates the above dictionaries according to the word senses associated with them
    for token in word_tokens:
        # Extracts best sense for each word using LESK
        best_sense = lesk(sentence, token)
        
        if best_sense is not None:
            # Obtains Hypernyms
            if best_sense.hypernyms() != []:
                hypernyms_list.append(best_sense.hypernyms()[0].lemmas()[0].name())
            
    return hypernyms_list

In [8]:
def NLP_Pipeline(sentence):
    word_tokens = Tokenization(sentence)
    hypernyms = WordNetHypernyms(sentence, word_tokens)
    return hypernyms

In [None]:
# sample_text = "A soldier-of-fortune steals some Russian nerve gas from Afghanistan, and brings it to the U.S. to be analyzed. A greedy millionaire rancher finds out about it and sets out to steal it."
# sample_hypernyms = ['combatant', 'country', 'fuel', 'change', 'rich_person', 'farmer', 'feeling', 'gain']
# print('Sample Text: ' + sample_text)
# print('Extracted hypernyms: combatant, country, fuel, change, rich_person, farmer, feeling, gain')
#print(NLP_Pipeline(sample_text))

In [None]:
############## MOVIE MATCHING BETWEEN DATASETS #############################

In [4]:
df_movies_summaries = pd.read_csv(r'Data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
df_movies_summaries.columns = ['Wikipedia ID', 'Freebase ID', 'Movie name', 'Release date', 'BO Revenue', 'Runtime', 'Language', 'Countires', 'Genre']

In [5]:
with open('Data/ml-1m/movies.dat') as dat_file, open('Data/ml-1m/movies.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file)

    for line in dat_file:
        row = [field.strip() for field in line.split('::')]
        csv_writer.writerow(row)

df_movies_ratings = pd.read_csv('Data/ml-1m/movies.csv', sep=',', encoding='latin-1')
df_movies_ratings.columns = ['movie_id', 'title', 'genre']

In [6]:
df_movies_ratings['title'] = df_movies_ratings['title'].str.lower()
df_movies_summaries['Movie name'] = df_movies_summaries['Movie name'].str.lower()

In [7]:
movie_ratings_dict = {}
for ix in df_movies_ratings.index:
    movie_id = df_movies_ratings.loc[ix]['movie_id']
    movie_name = df_movies_ratings.loc[ix]['title']
    temp_list = movie_name.split('(')
    movie_name_new = temp_list[0]
    movie_name_new = movie_name_new.replace(' ', '')
    movie_ratings_dict[movie_name_new] = movie_id

In [8]:
movie_id_ratings_summary = {}
for ix in df_movies_summaries.index:
    movie_name = df_movies_summaries.loc[ix]['Movie name']
    movie_id_summary = df_movies_summaries.loc[ix]['Wikipedia ID']
    if movie_name in movie_ratings_dict:
        df_movies_summaries.loc[[ix],['Ratings present']] = 'Y'
        movie_id_rating = movie_ratings_dict[movie_name]
        df_movies_summaries.loc[[ix],['Movie ID Ratings']] = movie_id_rating
        movie_id_ratings_summary[movie_id_rating] = movie_id_summary

In [9]:
df_movies_summaries.to_csv('Output/df_movies_summaries.csv')
with open('Output/movie-id-ratings-summary.txt', 'w') as f:
    print(movie_id_ratings_summary, file=f)


In [None]:
print(df_movies_summaries.nunique())
print(len(movie_id_ratings_summary))

In [None]:
################### GENERATING HYPERNYMS FOR MOVIES FOR WHICH WE HAVE RATINGS DATA ################################

In [17]:
filename = "Data/MovieSummaries/plot_summaries.txt"
counts_dict = {}
movies_dict = {}

with open(filename, encoding="utf8") as file:
    for line in file:
        hypernyms_list = []
        values = line.split("\t")
        synopsis = values[1]
        movieID = values[0]
        if movieID in str(movie_id_ratings_summary.values()):
            hypernyms_list = NLP_Pipeline(synopsis)
            movies_dict[movieID] = hypernyms_list
            for hypernym in hypernyms_list:
                counts_dict[hypernym] = counts_dict.get(hypernym, 0) + 1 

In [77]:
#print(len(movies_dict))
with open('Output/movies-dict.txt', 'w', encoding='utf8') as f:
    print(movies_dict, file=f)

578


In [19]:
counts_dict_sorted = sorted(counts_dict, key=counts_dict.get, reverse=True)
top_1000_counts = counts_dict_sorted[0:1000]
print(len(top_100_counts))

In [None]:
# print(top_1000_counts)
with open('Output/top-1000-counts.txt', 'w') as f:
    print(top_1000_counts, file=f)

In [None]:
############## GENERATING THE MOVIE CONCEPTS ARRAY FOR COLLABORATIVE FILTERING #######################

In [None]:
movies_concepts_array = np.zeros(shape=(len(movies_dict), len(top_1000_counts)))
row_number = -1
for movie in movies_dict:
    row_number = row_number + 1
    col_number = -1
    concepts_list = movies_dict[movie]
    for concept in top_1000_counts:
        col_number = col_number + 1
        if concept in concepts_list:
            movies_concepts_array[row_number][col_number] = 1

# print(movies_concepts_array)

In [22]:
with open('Output/movies-concepts-array.txt', 'w') as f:
    print(movies_concepts_array, file=f)

In [None]:
################### GETTING THE RATINGS DATA #################################

In [18]:
users_dict = dict()
with open('Data/ml-1m/ratings.dat', encoding='utf8') as dat_file, open('Data/ml-1m/ratings.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file)

    for line in dat_file:
        row = [field.strip() for field in line.split('::')]
        csv_writer.writerow(row)

df_ratings = pd.read_csv('Data/ml-1m/ratings.csv', sep=',', encoding='latin-1')
df_ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

In [22]:
print(len(df_ratings))

1000208


In [287]:
# FOR DEBUGGING
# df_ratings = df_ratings[0:40000]
# print(len(df_ratings))

In [None]:
########################### GENERATING A USER RATING MATRIX FOR REVIEWS OF MOVIES THAT HAVE SUMMARY INFORMATION ######################

In [23]:
entries = []
ratings_new_list = []
for ix in df_ratings.index:
    movie_id = df_ratings.loc[ix]['movie_id']
    if float(movie_id) in movie_id_ratings_summary:
        df_ratings.loc[[ix],['Summary Present']] = 'Y'
        df_ratings.loc[[ix],['Movie Summary ID']] = int(movie_id_ratings_summary[float(movie_id)])    

In [24]:
df_ratings = df_ratings[(df_ratings['Summary Present'] == 'Y')]

In [28]:
df_ratings, df_ratings_test = train_test_split(df_ratings, test_size=0.2)

In [19]:
df_ratings = pd.read_csv('Output/df_ratings.csv')
df_ratings_test = pd.read_csv('Output/df_ratings_test.csv')

In [20]:
df_ratings_grouped = df_ratings.groupby('user_id')

In [21]:
print(len(df_ratings_grouped))

6015


In [22]:
ratings_matrix = np.zeros(shape=(len(df_ratings_grouped), len(movies_dict)))
key = 0
row = -1
existing_users = []
for ix in df_ratings.index:
    user_id = df_ratings.loc[ix]['user_id']
    if user_id not in existing_users:
        existing_users.append(user_id)
        row = row + 1
        users_dict[user_id] = row

    movie_id = df_ratings.loc[ix]['movie_id']
    ratings_val = df_ratings.loc[ix]['rating']

    if float(movie_id) in movie_id_ratings_summary:
        movie_id_summary = movie_id_ratings_summary[movie_id]
        if str(movie_id_summary) in movies_dict:
            col = list(movies_dict).index(str(movie_id_summary))
            ratings_matrix[row][col] = ratings_val

In [34]:
with open('Output/users-dict.txt', 'w') as f:
    print(users_dict, file=f)

In [35]:
with open('Output/ratings-matrix.txt', 'w') as f:
    print(ratings_matrix, file=f)

In [109]:
# FOR TESTING
# ratings_matrix = np.zeros(shape=(len(df_ratings_grouped), len(movies_dict)))
# f = open('ratings-matrix.txt', 'r')
# row_num = 0
# col_num = 0
# for line in f:
#     for i in range(len(line)):
#         if line[i] == '0' or line[i] == '1' or line[i] == '2' or line[i] == '3' or line[i] == '4' or line[i] == '5':
#             ratings_matrix[row_num][col_num] = int(line[i])
#             col_num = col_num + 1
#         if line[i] == ']':
#             row_num = row_num + 1
#             col_num = 0 

In [23]:
from numpy import dot
from numpy.linalg import norm

def cosine_sim(movie_1, movie_2):
    if movie_1 not in movies_dict or movie_2 not in movies_dict:
        return 0
    movie_num_1 = list(movies_dict).index(movie_1)
    movie_num_2 = list(movies_dict).index(movie_2)
    a = movies_concepts_array[movie_num_1]
    b = movies_concepts_array[movie_num_2]
    if norm(a) == 0 or norm(b) == 0:
        return 0
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [3]:
def ratings_prediction(user_id, movie_id):
    movie_id = str(movie_id)
    movie_flag = False

    # if movie_id not in movies_dict.keys():
    #     return 0
    for key, value in movies_dict.items():
        if movie_id == key:
            movie_flag = True
    
    if not movie_flag:
        return 0
    if user_id not in users_dict:
        return 0
    
    pred_rating = 0
    count = 0

    user_ix = list(users_dict).index(user_id)

    for key, value in movies_dict.items():
        if key == movie_id:
            continue
        else:
            movie_ix = list(movies_dict).index(key)
            movie_sim = cosine_sim(key, movie_id)
            if movie_sim > 0:
                if ratings_matrix[user_ix][movie_ix] != 0:
                    count = count + movie_sim
                    pred_rating = pred_rating + movie_sim * ratings_matrix[user_ix][movie_ix]
    
    if count != 0:
        return pred_rating / count
    else:
        return 0

In [None]:
pred_ratings = []
count = 0
for ix in df_ratings_test.index:
    movie_id = df_ratings_test.loc[ix]['Movie Summary ID']
    movie_id = str(int(movie_id))
    user_id = df_ratings_test.loc[ix]['user_id']
    pred_rating = ratings_prediction(user_id, movie_id)
    pred_ratings.append(pred_rating)
    count = count + 1

In [174]:
df_ratings_test['Predicted Ratings'] = pred_ratings

In [34]:
actual_rating_list = []
pred_rating_list = []
for ix in df_ratings_test.index:
    actual_rating = df_ratings_test.loc[ix]['rating']
    pred_rating = df_ratings_test.loc[ix]['Predicted Ratings']
    if pred_rating != 0:
        actual_rating_list.append(actual_rating)
        pred_rating_list.append(pred_rating)

In [177]:
df_ratings.to_csv('Output/df_ratings.csv')
df_ratings_test.to_csv('Output/df_ratings_test.csv')

In [37]:
correct = 0
total = 0
for i in range(len(actual_rating_list)):
    if actual_rating_list[i] == round(pred_rating_list[i]):
        correct = correct + 1
    total = total + 1

In [38]:
#Accuracy
print(correct/total)

0.30647723526061804
