# Import statements 

In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from pandas.compat import lmap, map, u
from pylab import rcParams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
%reload_ext pycodestyle_magic
%pycodestyle_on
#  %pycodestyle_off

#  Local drive location of the dataframe
MAIN_FOLDER = 'c:/Users/champ/Python_proj/'
MWML_FOLDER = 'made_with_ml_repo/temp/moviebuddy/meta_data/'
FILE = "filtered.csv"
tfidf_np_matrix = ''

# Load from 'local' OR from 'gdrive'
drive = 'local'

if drive == 'gdrive':
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

# Function definitions

In [2]:
def prep_db(file, file_type):
    '''
    This module reads and prepares the db : cleans,
    deletes the columns with 'Unnamed' values,
    fills NaN values
    input :
    file # the movieset file
    file type # csv file type
    output :
    df # read movie set dataframe
    '''
    if file_type == 'csv':
        df = pd.read_csv(file, sep=';', dtype=str)  # Read CSV File
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df = df.fillna('')  # clean the data - get rid of NaN
    df = df.astype(str)  # change the default data type to string
    return(df)


def display_all(df):
    '''
    This function displays all possible columns of the movie database
    '''
    with pd.option_context("display.max_rows", 1000,
                           "display.max_columns", 1000):
        return(df)


def recommendations(titles, tfidf_np_matrix=tfidf_np_matrix):
    idx = []
    recommended_movies = []

    for title in titles:
        idx.append(indices[indices == title].index[0])
    print(idx)
    user_pref_vector = tfidf_np_matrix[idx].mean(axis=0)
    print(user_pref_vector)
    cosine_sim = cosine_similarity(tfidf_matrix,
                                   np.atleast_2d(user_pref_vector))
    df_cosine_sim = pd.DataFrame(cosine_sim, columns=['sim_score'])
    df_cosine_sim = df_cosine_sim.sort_values(by='sim_score', ascending=False)
    print(df_cosine_sim.head(20))
    top_10_indexes = list(df_cosine_sim.iloc[0:(10+len(titles))].index)
    print(top_10_indexes)
    for n, i in enumerate(top_10_indexes):
        if n < len(titles):
            continue
        recommended_movies.append([movies_subset.title.iloc[i],
                                   movies_subset.genres.iloc[i],
                                   movies_subset.keywords.iloc[i],
                                   movies_subset.popularity.iloc[i],
                                   movies_subset.average_vote.iloc[i],
                                   movies_subset.num_votes.iloc[i],
                                   df_cosine_sim.sim_score.iloc[n]])

    return pd.DataFrame(recommended_movies, columns=['movie_title',
                                                     'genres',
                                                     'keywords',
                                                     'popularity',
                                                     'average_vote',
                                                     'num_votes',
                                                     'cosine_score'])

# Main functions

In [3]:
# Load movies
if drive == 'gdrive':
    movies = pd.read_csv("/content/drive/My Drive/" +
                         "MovieRecommender/filtered.csv",
                         header=0, sep=';')
else:
    movies = prep_db(MAIN_FOLDER + MWML_FOLDER + FILE, 'csv')

movies = movies.replace({np.nan: None})
movies.head()
# display_all(movies.tail().T)
# display_all(movies.describe(include='all').T)

'''
Observations:
    Empty values in cell. Specifically,
    Some movies do not have taglines, keywords, date collection,
    revenue, budget, production companies.
Values in genre, keywords columns are stored as one string,
    needs to be split and stored as lists. (#Data cleaning1)
'''

print(display(movies.dtypes))
print(movies.shape)

id                      object
title                   object
tagline                 object
description             object
genres                  object
keywords                object
date                    object
collection              object
runtime                 object
revenue                 object
budget                  object
director                object
cast                    object
production_companies    object
production_countries    object
popularity              object
average_vote            object
num_votes               object
language                object
imdb_id                 object
poster_url              object
dtype: object

None
(45416, 21)


In [4]:
'''
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
'''
movies_subset = movies.dropna(subset=['keywords'])
movies_subset = movies_subset.reset_index(drop=True)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_subset.keywords)
tfidf_np_matrix = tfidf_matrix.toarray()
print(tfidf_np_matrix.shape)
print(tfidf_np_matrix[0])
print(tfidf_np_matrix[0])
indices = pd.Series(movies_subset.title)
print(indices[:5])

(45416, 12595)
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
0    LEGO DC Super Hero Girls: Brain Drain
1                The Truth Is in the Stars
2                                 Firebase
3                                   Zygote
4                Chris D'Elia: Man on Fire
Name: title, dtype: object


# Find similar

In [5]:
recommendations(['Toy Story',
                 'The Dark Knight',
                 'The Dark Knight Rises',
                 'Ace Ventura: When Nature Calls'],
                tfidf_np_matrix=tfidf_np_matrix)

[28363, 7021, 12632, 28875]
[0. 0. 0. ... 0. 0. 0.]
       sim_score
28875   0.577350
12632   0.577350
28363   0.577350
3610    0.355135
11748   0.354092
26861   0.326043
36387   0.309664
27473   0.303791
18901   0.302014
32227   0.287942
21757   0.279752
29951   0.271053
20842   0.264049
31837   0.263899
18065   0.255984
32367   0.241639
10970   0.241020
10265   0.239702
40233   0.236827
31048   0.236061
[28875, 12632, 28363, 3610, 11748, 26861, 36387, 27473, 18901, 32227, 21757, 29951, 20842, 31837]


Unnamed: 0,movie_title,genres,keywords,popularity,average_vote,num_votes,cosine_score
0,Barbie and the Three Musketeers,"animation, family",based on toy,6.670164999999999,6.2,81.0,0.354092
1,Small Soldiers,"comedy, adventure, fantasy, science fiction, a...","defense industry, toy shop, technical toy, sol...",10.03936,6.2,522.0,0.326043
2,Animals Are Beautiful People,"comedy, documentary, family","africa, animal",1.582132,7.5,21.0,0.309664
3,Home Alone 3,"comedy, family","parent child relationship, burglar, child hero...",8.04209,5.1,632.0,0.303791
4,Toy Story 3,"animation, family, comedy","hostage, college, toy, barbie, animation, esca...",16.96647,7.6,4710.0,0.302014
5,Dolls,"fantasy, horror","toy, gore, storm, doll, toy maker",4.545885,6.1,80.0,0.287942
6,Young Black Stallion,"adventure, drama, family","world war ii, human animal relationship, horse...",1.269123,5.4,18.0,0.279752
7,Toys,"fantasy, comedy, science fiction","brother brother relationship, loss of brother,...",5.923774,5.0,173.0,0.271053
8,Father and Son,drama,human relationship,1.559697,5.0,10.0,0.264049
9,Child's Play,"horror, thriller","gun, birthday, voodoo, toy, stalker, murder, b...",19.827546,6.3,605.0,0.263899
