In [93]:
#import dependencies
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [94]:
# Load the movie table from the CSV file under ../data into `df`.
features_csv = '../data/features.csv'
df = pd.read_csv(features_csv)

In [95]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Title,Year,Rated,Genre,Director,Actors,imdbRating
0,Jigsaw,2017,R,"Crime, Horror, Mystery","Michael Spierig, Peter Spierig","Matt Passmore, Tobin Bell, Callum Keith Rennie...",6.1
1,Suburbicon,2017,R,"Crime, Drama, Mystery",George Clooney,"Steve Monroe, Gavin Wilde, Landon Gordon, Hope...",5.4
2,Thank You For Your Service,2017,R,"Biography, Drama, War",Jason Hall,"Haley Bennett, Miles Teller, Joe Cole, Amy Sch...",6.6
3,Geostorm,2017,PG-13,"Action, Sci-Fi, Thriller",Dean Devlin,"Gerard Butler, Jim Sturgess, Abbie Cornish, Al...",5.5
4,Same Kind Of Different As Me,2017,PG-13,Drama,Michael Carney,"Renée Zellweger, Jon Voight, Greg Kinnear, Dji...",6.0


In [96]:
# Print column dtypes and non-null counts; columns whose count is below the
# number of entries contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 7 columns):
Title         112 non-null object
Year          112 non-null object
Rated         107 non-null object
Genre         112 non-null object
Director      107 non-null object
Actors        112 non-null object
imdbRating    110 non-null float64
dtypes: float64(1), object(6)
memory usage: 6.2+ KB


### Genre and Rating Exploration

In [97]:
# Collect every distinct genre in first-seen order.
# Each 'Genre' cell is a single string with genres separated by ', '
# (e.g. "Crime, Horror, Mystery"), so it is split on that exact separator.
genre_list = []

for genres in df['Genre']:
    for genre in genres.split(', '):
        if genre not in genre_list:
            genre_list.append(genre)

# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(genre_list)
         
    

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary']


In [98]:
# Distinct values of 'Rated' in first-seen order, missing values included.
# Series.unique() treats all NaN entries as one value, so the result does not
# depend on every missing entry being the very same object (which a plain
# `value not in list` test on NaN relies on).
rating_list = df['Rated'].unique().tolist()

# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(rating_list)
         

['R', 'PG-13', 'PG', nan, 'G', 'TV-14']


In [99]:
# Drop the rows rated 'TV-14'; rating_map (defined further down) has no code for it.
# Rows with a missing rating compare as != 'TV-14' and are therefore kept.
# .copy() makes the result an independent frame, so assigning to its columns
# afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['Rated'] != 'TV-14'].copy()

In [100]:
# Replace missing ratings with 'PG-13'.
# NOTE(review): 'PG-13' is a hard-coded fill value; confirm it is the intended default.
# assign() returns a new frame instead of writing a column into `df`, which was
# produced by a boolean-mask filter; writing into such a frame can raise
# SettingWithCopyWarning.
df = df.assign(Rated=df['Rated'].fillna('PG-13'))

### Feature Vector Construction

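Each movie's feature vector consists of one 0/1 indicator per genre (1 if the movie has that genre, 0 otherwise) followed by an integer code for its rating; the movie title is appended as a final column so each row can be identified: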

Feature Vector:

['Crime', 'Horror', 'Mystery', 'Drama', 'Biography', 'War', 'Action', 'Sci-Fi', 'Thriller', 'Family', 'Adventure', 'Animation', 'Comedy', 'Romance', 'Short', 'Music', 'Fantasy', 'History', 'Documentary', 'Rating']



In [101]:
# Integer code for each rating, numbered in the order listed:
# 'R' -> 0, 'PG-13' -> 1, 'PG' -> 2, 'G' -> 3.
rating_order = ('R', 'PG-13', 'PG', 'G')
rating_map = {rating: code for code, rating in enumerate(rating_order)}

In [102]:
# Build one feature row per movie:
#   [0/1 flag for each genre in genre_list..., rating code, title]
features = []

# Iterate the three needed columns directly; the row index is not used.
for genres, rated, title in zip(df['Genre'], df['Rated'], df['Title']):
    # Split the genre string once per movie rather than once per candidate genre.
    movie_genres = set(genres.split(', '))
    feature_vec = [int(genre in movie_genres) for genre in genre_list]
    # Raises KeyError if a rating has no entry in rating_map.
    feature_vec.append(rating_map[rated])
    feature_vec.append(title)
    features.append(feature_vec)
    

In [103]:
# Column names for the feature table: one per genre, then 'Rating' and 'Title'.
# Build a new list instead of aliasing genre_list: appending through an alias
# would also add 'Rating' and 'Title' to genre_list, so re-running this cell
# would duplicate the labels and re-running the feature-building cell would
# treat them as genres.
feature_labels = genre_list + ['Rating', 'Title']

feature_df = pd.DataFrame(features, columns=feature_labels)


In [105]:
# Show only the first ten rows; rendering the whole frame bloats the notebook output.
feature_df.head(10)

Unnamed: 0,Crime,Horror,Mystery,Drama,Biography,War,Action,Sci-Fi,Thriller,Family,...,Animation,Comedy,Romance,Short,Music,Fantasy,History,Documentary,Rating,Title
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Jigsaw
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Suburbicon
2,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Thank You For Your Service
3,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,Geostorm
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Same Kind Of Different As Me
5,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,The Killing Of A Sacred Deer
6,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,2,Wonderstruck
7,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,The Foreigner
8,0,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,Happy Death Day
9,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Mother!
