In [1082]:
#import dependencies
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [1083]:
df = pd.read_csv('../data/features.csv')
df_2016 = pd.read_csv('../data/features_2016.csv')

combined = [df, df_2016]

In [1084]:
df.head()

Unnamed: 0,Title,Year,Rated,Genre,Director,Actors,imdbRating
0,Jigsaw,2017,R,"Crime, Horror, Mystery","Michael Spierig, Peter Spierig","Matt Passmore, Tobin Bell, Callum Keith Rennie...",6.1
1,Suburbicon,2017,R,"Crime, Drama, Mystery",George Clooney,"Steve Monroe, Gavin Wilde, Landon Gordon, Hope...",5.4
2,Thank You For Your Service,2017,R,"Biography, Drama, War",Jason Hall,"Haley Bennett, Miles Teller, Joe Cole, Amy Sch...",6.6
3,Geostorm,2017,PG-13,"Action, Sci-Fi, Thriller",Dean Devlin,"Gerard Butler, Jim Sturgess, Abbie Cornish, Al...",5.5
4,Same Kind Of Different As Me,2017,PG-13,Drama,Michael Carney,"Renée Zellweger, Jon Voight, Greg Kinnear, Dji...",6.0


In [1085]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 7 columns):
Title         112 non-null object
Year          112 non-null object
Rated         107 non-null object
Genre         112 non-null object
Director      107 non-null object
Actors        112 non-null object
imdbRating    110 non-null float64
dtypes: float64(1), object(6)
memory usage: 6.2+ KB


In [1086]:
df['Title']

0                                Jigsaw
1                            Suburbicon
2            Thank You For Your Service
3                              Geostorm
4          Same Kind Of Different As Me
5          The Killing Of A Sacred Deer
6                          Wonderstruck
7                         The Foreigner
8                       Happy Death Day
9                               Mother!
10                    Blade Runner 2049
11              The Mountain Between Us
12            My Little Pony: The Movie
13                        American Made
14                           Flatliners
15          Kingsman: The Golden Circle
16               The Lego Ninjago Movie
17                       Friend Request
18                    American Assassin
19                     All I See Is You
20                                   It
21                           Home Again
22                             Polaroid
23                           All Saints
24                          Tulip Fever


### Genre and Rating Exploration

In [1087]:
genre_list = []

for dataset in combined:
    for row in dataset['Genre']:
        for genre in row.split(', '):
            if genre not in genre_list:
                genre_list.append(genre) 
            
print genre_list

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary', 'Western', 'Sport']


In [1088]:
rating_list = []

for dataset in combined:
    for row in dataset['Rated']:
        if row not in rating_list:
            rating_list.append(row) 
            
print rating_list
         

['R', 'PG-13', 'PG', nan, 'G', 'TV-14', 'NOT RATED']


In [1089]:
df = df[df.Rated != 'TV-14']

In [1090]:
df['Rated'] = df['Rated'].fillna('PG-13')
df_2016['Rated'] = df_2016['Rated'].fillna('PG-13')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Feature Vector Construction

The feature vector will consist of an encoding for genre and rating:

Feature Vector:

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary', 'Rating']



In [1091]:
rating_map = {
    'R': 0,
    'PG-13': 1,
    'PG': 2,
    'G': 3,
    'NOT RATED': 4
}

In [1092]:
features = []

for index, row in df.iterrows():
    feature_vec = []
    for genre in genre_list:
        if genre in row['Genre'].split(', '):
            feature_vec.append(1)
        else:
            feature_vec.append(0)
    feature_vec.append(rating_map[row['Rated']])
    feature_vec.append(row['Title'])
    features.append(feature_vec)  
    

In [1093]:
print genre_list

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary', 'Western', 'Sport']


In [1094]:
feature_labels = genre_list
feature_labels.append('Rating')
feature_labels.append('Title')

feature_df = pd.DataFrame(features, columns=feature_labels)


In [1095]:
print genre_list

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary', 'Western', 'Sport', 'Rating', 'Title']


In [1096]:
feature_df

Unnamed: 0,Crime,Horror,Mystery,Drama,Biography,War,Action,Sci-Fi,Thriller,Family,...,Romance,Short,Music,Fantasy,History,Documentary,Western,Sport,Rating,Title
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Jigsaw
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Suburbicon
2,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Thank You For Your Service
3,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,Geostorm
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Same Kind Of Different As Me
5,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,The Killing Of A Sacred Deer
6,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,2,Wonderstruck
7,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,The Foreigner
8,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,Happy Death Day
9,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Mother!


### Create Test User


Blade Runner
Dunkirk
Guardians Of The Galaxy Vol. 2
Kong: Skull Island
The Lego Batman Movie
Hidden Figures
The Fate Of The Furious

In [1097]:
user_history_vec = []

user_history_titles = ["Dunkirk", 
                       "Guardians Of The Galaxy Vol. 2", 
                       "Kong: Skull Island", 
                       "The Lego Batman Movie", 
                       "Hidden Figures", 
                       "The Fate Of The Furious",
                       "Blade Runner 2049"]

for index, row in df.iterrows():
    if row['Title'] in user_history_titles:
        user_history_vec.append(1)
    else:
        user_history_vec.append(0)
        
print user_history_vec

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [1098]:
len(user_history_vec)

110

### Generate User Profile

In [1099]:
profile_df = feature_df.drop(['Rating', 'Title'], axis=1)

In [1100]:
profile_df.head()

Unnamed: 0,Crime,Horror,Mystery,Drama,Biography,War,Action,Sci-Fi,Thriller,Family,...,Animation,Comedy,Romance,Short,Music,Fantasy,History,Documentary,Western,Sport
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1101]:
profile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 21 columns):
Crime          110 non-null int64
Horror         110 non-null int64
Mystery        110 non-null int64
Drama          110 non-null int64
Biography      110 non-null int64
War            110 non-null int64
Action         110 non-null int64
Sci-Fi         110 non-null int64
Thriller       110 non-null int64
Family         110 non-null int64
Adventure      110 non-null int64
Animation      110 non-null int64
Comedy         110 non-null int64
Romance        110 non-null int64
Short          110 non-null int64
Music          110 non-null int64
Fantasy        110 non-null int64
History        110 non-null int64
Documentary    110 non-null int64
Western        110 non-null int64
Sport          110 non-null int64
dtypes: int64(21)
memory usage: 18.1 KB


In [1102]:
user_vec = np.array(user_history_vec)

In [1103]:
user_vec.shape

(110,)

In [1104]:
profile_df.shape

(110, 21)

In [1105]:
user_profile = user_vec.dot(profile_df)

In [1106]:
print user_profile

[1 0 1 2 1 0 5 2 1 0 4 1 0 0 0 0 1 2 0 0 0]


In [1107]:
len(user_profile)

21

In [1108]:
genre_list = genre_list[0:21]

In [1109]:
for i in range(21):
    print genre_list[i]
    print user_profile[i]    
    print '_'*20

Crime
1
____________________
Horror
0
____________________
Mystery
1
____________________
Drama
2
____________________
Biography
1
____________________
War
0
____________________
Action
5
____________________
Sci-Fi
2
____________________
Thriller
1
____________________
Family
0
____________________
Adventure
4
____________________
Animation
1
____________________
Comedy
0
____________________
Romance
0
____________________
Short
0
____________________
Music
0
____________________
Fantasy
1
____________________
History
2
____________________
Documentary
0
____________________
Western
0
____________________
Sport
0
____________________


In [1110]:
scaling_factor = float(max(user_profile))

print 'Raw Profile'
print user_profile

user_profile = [round(x/scaling_factor, 1) for x in user_profile]

print 'Normalized Profile'
print user_profile

Raw Profile
[1 0 1 2 1 0 5 2 1 0 4 1 0 0 0 0 1 2 0 0 0]
Normalized Profile
[0.2, 0.0, 0.2, 0.4, 0.2, 0.0, 1.0, 0.4, 0.2, 0.0, 0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.2, 0.4, 0.0, 0.0, 0.0]


### Load 2016 Validation Data

In [1111]:
#df_2016 = pd.read_csv('../data/features_2016.csv')

In [1112]:
df_2016.head()

Unnamed: 0,Title,Year,Rated,Genre,Director,Actors,imdbRating
0,The Forest,2016,PG-13,"Horror, Mystery, Thriller",Jason Zada,"Natalie Dormer, Eoin Macken, Stephanie Vogt, O...",4.8
1,The Revenant,2015,R,"Adventure, Drama, History",Alejandro G. Iñárritu,"Leonardo DiCaprio, Tom Hardy, Domhnall Gleeson...",8.0
2,13 Hours,2016,R,"Action, Drama, History",Michael Bay,"John Krasinski, James Badge Dale, Pablo Schrei...",7.3
3,Norm of the North,2016,PG,"Animation, Adventure, Comedy",Trevor Wall,"Rob Schneider, Heather Graham, Ken Jeong, Bill...",3.6
4,Ride Along 2,2016,PG-13,"Action, Comedy",Tim Story,"Ice Cube, Kevin Hart, Tika Sumpter, Benjamin B...",5.9


In [1113]:
df_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 7 columns):
Title         140 non-null object
Year          140 non-null int64
Rated         140 non-null object
Genre         140 non-null object
Director      139 non-null object
Actors        140 non-null object
imdbRating    139 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 7.7+ KB


In [1114]:
print genre_list

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary', 'Western', 'Sport']


In [1115]:
features_2016 = []

for index, row in df_2016.iterrows():
    feature_vec = []
    for genre in genre_list:
        if genre in row['Genre'].split(', '):
            feature_vec.append(1)
        else:
            feature_vec.append(0)
    feature_vec.append(rating_map[row['Rated']])
    feature_vec.append(row['Title'])
    features_2016.append(feature_vec)  
    

In [1116]:
print feature_labels

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary', 'Western', 'Sport', 'Rating', 'Title']


In [1117]:
feature_labels_2016 = genre_list
feature_labels_2016.append('Rating')
feature_labels_2016.append('Title')

feature_df_2016 = pd.DataFrame(features_2016, columns=feature_labels_2016)

In [1118]:
feature_df_2016.head()

Unnamed: 0,Crime,Horror,Mystery,Drama,Biography,War,Action,Sci-Fi,Thriller,Family,...,Romance,Short,Music,Fantasy,History,Documentary,Western,Sport,Rating,Title
0,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,The Forest
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,The Revenant
2,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,13 Hours
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,Norm of the North
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,Ride Along 2


In [1119]:
distances = []

for index, row in feature_df_2016.iterrows():
    title = row['Title']
    print title
    feature_vec = row.drop(['Rating', 'Title'], axis=0)
    dist = np.linalg.norm(user_profile-feature_vec)
    print dist
    print '_'*40
    
    distances.append([title, dist])
    
    

The Forest
2.13541565041
________________________________________
The Revenant
1.46969384567
________________________________________
13 Hours
1.32664991614
________________________________________
Norm of the North
1.83303027798
________________________________________
Ride Along 2
1.53622914957
________________________________________
Where to Invade Next
2.08806130178
________________________________________
The 5th Wave
0.979795897113
________________________________________
The Boy
2.13541565041
________________________________________
Dirty Grandpa
1.83303027798
________________________________________
The Finest Hours
1.32664991614
________________________________________
Jane Got a Gun
1.6
________________________________________
Kung Fu Panda 3
1.16619037897
________________________________________
The Choice
1.88679622641
________________________________________
Hail, Caesar!
1.98997487421
________________________________________
Pride and Prejudice and Zombies
1.83303027798


1.6
________________________________________
Bad Santa 2
2.03960780544
________________________________________
Moana
1.83303027798
________________________________________
Rules Don't Apply
2.13541565041
________________________________________
Miss Sloane
1.77763888346
________________________________________
Incarnate
1.98997487421
________________________________________
Office Christmas Party
1.83303027798
________________________________________
Collateral Beauty
1.88679622641
________________________________________
La La Land
2.13541565041
________________________________________
Rogue One
0.979795897113
________________________________________
Assassin's Creed
1.16619037897
________________________________________
Passengers
1.83303027798
________________________________________
Sing
2.22710574513
________________________________________
Why Him?
1.83303027798
________________________________________
Fences
1.6
________________________________________


In [1120]:
def takeSecond(elem):
    return elem[1]

distances.sort(key=takeSecond)

In [1121]:
for distance in distances:
    print distance

['The 5th Wave', 0.97979589711327131]
['Risen', 0.97979589711327131]
['Batman v Superman: Dawn of Justice', 0.97979589711327131]
['Hardcore Henry', 0.97979589711327131]
["The Huntsman: Winter's War", 0.97979589711327131]
['Captain America: Civil War', 0.97979589711327131]
['X-Men: Apocalypse', 0.97979589711327131]
['Independence Day: Resurgence', 0.97979589711327131]
['The Legend of Tarzan', 0.97979589711327131]
['Rogue One', 0.97979589711327131]
['Kung Fu Panda 3', 1.1661903789690602]
['Gods of Egypt', 1.1661903789690602]
['Allegiant', 1.1661903789690602]
['Ratchet & Clank', 1.1661903789690602]
['Angry Birds', 1.1661903789690602]
['Warcraft', 1.1661903789690602]
['Suicide Squad', 1.1661903789690602]
['Mechanic Resurrection', 1.1661903789690602]
['Jack Reacher: Never Go Back', 1.1661903789690602]
['Inferno', 1.1661903789690602]
['Doctor Strange', 1.1661903789690602]
["Assassin's Creed", 1.1661903789690602]
['13 Hours', 1.3266499161421601]
['The Finest Hours', 1.3266499161421601]
['Dead