In [123]:
from imdbpie import Imdb
import pandas as pd
import urllib
import numpy as np
import requests
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import KFold, cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import mean_squared_error, r2_score, classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.decomposition import PCA


from nltk.tokenize import WordPunctTokenizer
from nltk.tag import pos_tag

from unidecode import unidecode

%matplotlib inline

In [2]:
def get_top_250():
    response = requests.get('http://www.imdb.com/chart/top')
    html = response.text
    entries = re.findall("<a href.*?/title/(.*?)/", html)
    return list(set(entries))

In [3]:
def get_entry(entry):
    res = requests.get('http://www.omdbapi.com/?i='+entry)
    if res.status_code != 200:
        print entry, res.status_code
    else:
        print '.',
    try:
        j = json.loads(res.text)
    except ValueError:
        j = None
    return j

In [4]:
def get_gross(entry):
    response = requests.get('http://www.imdb.com/title/'+entry)
    html = response.text
    try:
        gross_list = re.findall("Gross:</h4>[ ]*\$([^ ]*)", html)
        gross = int(gross_list[0].replace(',', ''))
        print '.',
        return gross
    except Exception as ex:
        print
        print ex, entry, response.status_code
        return None

In [5]:
entries = get_top_250()

In [6]:
entries_dict_list = [get_entry(e) for e in entries]

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


In [7]:
df = pd.DataFrame(entries_dict_list)
del df['Type']
del df['Poster']
del df['Response']
del df['Metascore']
del df['Plot']

In [8]:
#Converts Runtime to INT
df['Runtime'] = pd.Series([int(x.split(' ')[0]) for x in df['Runtime']])

#Converts Year to INT
df['Year'] = pd.Series([int(x) for x in df['Year']])

#Converts IMDB Rating to FLOAT
df['imdbRating'] = pd.Series([float(x) for x in df['imdbRating']])

#Converts IMDB Votes to INT
df['imdbVotes'] = pd.Series([int(x.replace(',', '')) for x in df['imdbVotes']])

df['Year'] = MinMaxScaler().fit_transform(df['Year'].astype(float).reshape(-1, 1))

In [9]:
#Converts N/A to NaNs in order to be dropped.
df['Awards'] = df['Awards'].replace('N/A', np.nan)
#May reconsider above.  Tokenizing may fix need for it.

#Drops NaNs
df.dropna(inplace=True)
df = df.reset_index(drop=True)

In [10]:
df.tail(1)

Unnamed: 0,Actors,Awards,Country,Director,Genre,Language,Rated,Released,Runtime,Title,Writer,Year,imdbID,imdbRating,imdbVotes
245,"Diahnne Abbott, Frank Adu, Victor Argo, Gino A...",Nominated for 4 Oscars. Another 21 wins & 15 n...,USA,Martin Scorsese,"Crime, Drama","English, Spanish",R,08 Feb 1976,113,Taxi Driver,Paul Schrader,0.578947,tt0075314,8.3,515479


In [11]:
df.dtypes

Actors         object
Awards         object
Country        object
Director       object
Genre          object
Language       object
Rated          object
Released       object
Runtime         int64
Title          object
Writer         object
Year          float64
imdbID         object
imdbRating    float64
imdbVotes       int64
dtype: object

In [12]:
df['Rated'].value_counts()

R            105
PG            36
PG-13         33
NOT RATED     28
APPROVED      17
G             12
UNRATED        9
PASSED         4
N/A            1
M              1
Name: Rated, dtype: int64

In [13]:
#Converts the text and removes everything inside of parenthesis
for x in range(len(df['Writer'])):
    df['Writer'][x] = unidecode(df['Writer'][x])
    df['Writer'][x] = re.sub(' \(.*?\)','',df['Writer'][x], flags=re.DOTALL).strip()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
df['Writer'][5]

'Mark L. Smith, Alejandro G. Inarritu, Michael Punke'

In [57]:
df['Genre'][0]

u'Drama, Music'

In [15]:
hasher = HashingVectorizer()
hasher.transform(df['Writer'])

dx  = pd.DataFrame(hasher.transform([df['Writer'][11]]).todense())
dx.transpose().sort_values(0, ascending=False).head(15).transpose()

Unnamed: 0,590975,270319,693976,684148,942296,974508,699048,699051,699050,699049,0,699047,699053,699046,699045
0,0.288675,0.288675,0.288675,0.288675,0.288675,0.288675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#grosses = [(e, get_gross(e)) for e in entries]

In [17]:
#df1 = pd.DataFrame(grosses, columns=['imdbID', 'Gross'])
#df1.head()

In [18]:
'''director_list = []

for x in range(len(df)):
    title = imdb.get_title_by_id(str(df['tconst'][x]))
    try:
        director_list.append(str(title.credits[0]).split('\'')[1])
    except Exception as ex:
        print
        print ex, entry, response.status_code'''
print 




In [100]:
dummy_rated = pd.get_dummies(df["Rated"], prefix="rated")

In [101]:
writer_cvec = CountVectorizer(ngram_range=(2,2), max_features=100)
writer_cvec.fit(df['Writer'])

writer_df  = pd.DataFrame(writer_cvec.transform(df['Writer']).todense(), columns=writer_cvec.get_feature_names())
writer_df.transpose().sort_values(0, ascending=False).transpose().head(1)

Unnamed: 0,akira kurosawa,martin scorsese,paul schrader,orson welles,oliver stone,nolan david,nolan christopher,noel langley,nick schenk,nicholas pileggi,...,francis ford,fran walsh,ford coppola,flaiano tullio,fellini ennio,felix chong,federico fellini,ethan coen,ennio flaiano,wisher jr
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
genre_cvec = CountVectorizer()
genre_cvec.fit(df['Genre'])

genre_df  = pd.DataFrame(genre_cvec.transform(df['Genre']).todense(), columns=genre_cvec.get_feature_names())
genre_df.transpose().sort_values(0, ascending=False).transpose().head(1)

Unnamed: 0,drama,music,action,horror,war,thriller,sport,sci,romance,noir,...,adventure,film,fi,fantasy,family,crime,comedy,biography,animation,western
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
#   ERRORS.  Names with 3 parts throw off tokenization.  needs to split on comma

In [104]:
actors_cvec = CountVectorizer(ngram_range=(2,2), max_features=100)
actors_cvec.fit(df['Actors'])

actors_df  = pd.DataFrame(actors_cvec.transform(df['Actors']).todense(), columns=actors_cvec.get_feature_names())
actors_df.transpose().sort_values(0, ascending=False).transpose().head(1)

Unnamed: 0,paul reiser,max von,paul newman,patrick magee,pacino robert,orson welles,noel appleby,newman robert,natalie portman,murray abraham,...,ford carrie,fisher billy,faye dunaway,ellen burstyn,edward norton,edward furlong,dicaprio tom,diane keaton,del toro,william holden
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
country_cvec = CountVectorizer()
country_cvec.fit(df['Country'])

country_df  = pd.DataFrame(country_cvec.transform(df['Country']).todense(), columns=country_cvec.get_feature_names())
country_df.transpose().sort_values(0, ascending=False).transpose().head(1)

Unnamed: 0,usa,africa,spain,korea,luxembourg,mexico,new,poland,south,soviet,...,canada,china,denmark,emirates,france,germany,hong,india,iran,zealand
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
director_cvec = CountVectorizer(ngram_range=(2,2), max_features=100)
director_cvec.fit(df['Director'])

director_df  = pd.DataFrame(director_cvec.transform(df['Director']).todense(), columns=director_cvec.get_feature_names())
director_df.transpose().sort_values(0, ascending=False).transpose().head(1)

Unnamed: 0,adam elliot,richard linklater,sergio leone,roy hill,ron howard,roman polanski,robert zemeckis,robert hamer,rob reiner,rob minkoff,...,mel gibson,meirelles kátia,mathieu kassovitz,martin scorsese,lee unkrich,john huston,joel coen,james cameron,ingmar bergman,woody allen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
final_df = pd.concat([writer_df, genre_df], axis=1)
final_df = pd.concat([final_df, actors_df], axis=1)
final_df = pd.concat([final_df, country_df], axis=1)
final_df = pd.concat([final_df, director_df], axis=1)
final_df = pd.concat([final_df, dummy_rated], axis=1)
final_df = pd.concat([final_df, df['Year']], axis=1)
final_df = pd.concat([final_df, df['imdbVotes']], axis=1)

In [108]:
final_df.head(1)

Unnamed: 0,akira kurosawa,alan mak,ales adamovich,andrew stanton,billy wilder,bob kane,bob peterson,burny mattinson,cesare zavattini,charles chaplin,...,rated_M,rated_N/A,rated_NOT RATED,rated_PASSED,rated_PG,rated_PG-13,rated_R,rated_UNRATED,Year,imdbVotes
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.978947,413720


In [109]:
X = final_df
y = df['imdbRating']>df['imdbRating'].median()

In [110]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.5, random_state=69)

In [111]:
et = ExtraTreesClassifier(class_weight='balanced', n_jobs=-1)
et.fit(X_train, Y_train)

ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [112]:
y_pred = et.predict(X_test)

In [113]:
print classification_report(Y_test,y_pred)

             precision    recall  f1-score   support

      False       0.72      0.92      0.81        79
       True       0.71      0.34      0.46        44

avg / total       0.72      0.72      0.68       123



In [114]:
conmat = np.array(confusion_matrix(Y_test, y_pred, labels=et.classes_))
confusion = pd.DataFrame(conmat, index=['Under', 'Over'],
                            columns=['Predicted Under Median', 'Predicted Over Median'])
confusion

Unnamed: 0,Predicted Under Median,Predicted Over Median
Under,73,6
Over,29,15


In [116]:
print et.score(X_test, Y_test)

0.715447154472


In [140]:
final_df.shape

(246, 373)

In [124]:
f_df = StandardScaler().fit_transform(X)

In [143]:
cov_mat = np.cov(f_df.T)

In [144]:
eigenValues, eigenVectors = np.linalg.eig(cov_mat)

In [145]:
eig_pairs = [(np.abs(eigenValues[i]), eigenVectors[:,i]) for i in range(len(eigenValues))]
eig_pairs.sort()
eig_pairs.reverse()
for i in eig_pairs[:2]:
    print(i[0],i[1])

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [136]:
pca = PCA(n_components=2)
pca.fit(f_df)
print("The information (explained variance) contained in each principal component: ", pca.explained_variance_ratio_)
print(pca.components_)

('The information (explained variance) contained in each principal component: ', array([ 0.02899399,  0.02539983]))
[[ -1.19807914e-02  -6.13710145e-03  -5.20739919e-03  -1.32430403e-03
   -1.21395812e-02   3.36823684e-02  -4.91301358e-04  -2.78083757e-03
   -5.29879600e-03  -1.18780976e-02  -2.00535315e-03  -1.35091816e-03
    3.66221288e-02  -1.03539493e-02  -5.21386470e-03  -1.94860834e-03
   -2.31849358e-03   2.74564924e-03   3.04056302e-02  -4.80747412e-03
   -5.07538686e-03  -4.86685602e-03  -8.20414690e-04  -3.24449912e-03
   -4.40659491e-03   5.70990285e-03  -2.85469498e-03  -1.45596837e-02
   -7.07403652e-03  -1.45596837e-02  -6.13710145e-03  -1.42567725e-02
   -1.42567725e-02  -1.19027948e-02   2.96310647e-01  -1.19027948e-02
    2.50540613e-02  -4.87741833e-03   2.97432740e-02  -6.33749481e-03
   -2.86428059e-03  -4.97763786e-03  -5.24519877e-03  -1.18724763e-02
   -9.46898848e-03  -1.32975125e-03  -1.32740166e-02   1.66627940e-03
   -1.49266000e-03  -1.18724763e-02  -1.3585

In [142]:
eig_pairs[0][1].shape

(246,)

In [141]:
W = np.hstack((eig_pairs[0][1].reshape(11,1), eig_pairs[1][1].reshape(11,1))) # Our transformation matrix
X_reduced = f_df.dot(W)
X_reduced

ValueError: total size of new array must be unchanged

In [35]:
# country_cvec = CountVectorizer()
# country_cvec.fit(df['Country'])

# country_df  = pd.DataFrame(country_cvec.transform(df['Country']).todense(), columns=country_cvec.get_feature_names())
# country_df.transpose().sort_values(0, ascending=False).transpose().head(1)

In [60]:
hasher = HashingVectorizer()
hasher.transform(df['Country'])

<246x1048576 sparse matrix of type '<type 'numpy.float64'>'
	with 370 stored elements in Compressed Sparse Row format>

In [65]:
country_df  = pd.DataFrame(hasher.transform(df['Country']).todense())
country_df.transpose().sort_values(0, ascending=False).transpose()

Unnamed: 0,675220,0,699041,699043,699044,699045,699046,699047,699048,699049,...,349525,349526,349527,349528,349529,349530,349531,349532,349533,1048575
0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
for x in range(len(country_df.T)):
    if country_df[x].nunique() > 1:
        print x

2541
19520
40868
188911
197212
242361
368621
377999
391503
433769
442391
443251
444162
493510
509845
518583
560027
594135
597353
629002
633813
652998
668507
675220
681351
684125
722040
724261
734015
743937
870610
876751
898872
910537
946672
999854
1009500
1037137


In [78]:
temp_df = country_df
test_df = temp_df[[ind for ind, x in enumerate(country_df) if country_df[x].nunique() > 1]].T
test_df = test_df.T.reset_index(drop=True)

In [79]:
test_df

Unnamed: 0,2541,19520,40868,188911,197212,242361,368621,377999,391503,433769,...,734015,743937,870610,876751,898872,910537,946672,999854,1009500,1037137
0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
1,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,-0.707107,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
5,0.000000,0.000000,0.000000,0.0,0.000000,0.500000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,-0.5,0.500000,0.000000,0.0,0.0,0.0
6,0.000000,0.000000,0.000000,0.0,0.000000,0.707107,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.707107,0.000000,0.0,0.0,0.0
7,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
8,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
9,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.00000,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
