In [None]:
# %matplotlib inline

import pandas as pd
import numpy as np
from sklearn import linear_model, decomposition, cross_validation, neighbors
from sklearn.learning_curve import learning_curve
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
from sqlalchemy import create_engine
from IPython.display import display

In [None]:
# set up default parameters for making nice plots
# Hex colours (written without the leading '#') cycled through by successive plot lines.
palette = ['5DA5DA', 'FAA43A', '60BD68', 'F17CB0', 'B2912F', 'B276B2', 'DECF3F', 'F15854', '4D4D4D']
params = {
    'axes.labelsize': 14,
    'font.size': 14,
    'legend.fontsize': 14,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.linewidth': 2,
    'xtick.major.width': 2,
    'ytick.major.width': 2,
    'lines.linewidth': 2,
    'lines.marker': 'o'
   }
# 'axes.color_cycle' was superseded by 'axes.prop_cycle' in matplotlib 1.5 and
# removed in 2.0, where handing it to rcParams.update() raises KeyError.
# Use whichever key this matplotlib installation actually knows about.
if 'axes.prop_cycle' in plt.rcParams:
    params['axes.prop_cycle'] = plt.cycler('color', ['#' + c for c in palette])
else:
    params['axes.color_cycle'] = palette
plt.rcParams.update(params)

In [None]:
# load the data
# The connection URL carries a user name and password, so let it be supplied
# through the MOVIEDB_URL environment variable instead of living in the notebook.
# NOTE(review): the literal below is kept only as a backward-compatible fallback
# for the local development database; remove it once MOVIEDB_URL is set everywhere.
import os
db_url = os.environ.get('MOVIEDB_URL', 'mysql://root:pass@localhost/moviedb?charset=utf8')
engine = create_engine(db_url, encoding = 'utf-8')
# one table per data source; both are read completely into memory
df1 = pd.read_sql_table('themoviedb', engine)
df2 = pd.read_sql_table('omdb', engine)

In [None]:
# preview the rows labelled 0 through 1 (all columns) of each table;
# df1 needs an explicit display() because only the last expression is rendered
display(df1.loc[0:1])
df2.loc[0:1]

In [None]:
# make genres the columns of a new data frame, with binary entries
s1 = df1['genres']
s2 = df2['Genre']
movies = df2['Title']

# Collect one {genre: 1} record per movie and build the frame in a single call.
# Growing a DataFrame with .append() inside the loop copies the whole frame on
# every iteration (and DataFrame.append no longer exists in pandas 2.0).
records = []
for ind in s1.index:
    # merge genre information from the two databases (rows are matched by index label)
    merged = s1[ind].split(', ') + s2[ind].split(', ')
    # drop empty strings, treat 'Sci-Fi' and 'Science Fiction' as the same genre,
    # and de-duplicate; a comprehension is used because filter() returns a lazy
    # iterator on Python 3, which np.array() would wrap as a single object
    names = set('Science Fiction' if g == 'Sci-Fi' else g for g in merged if g)
    records.append(dict.fromkeys(names, 1))

# genres a movie does not have come out as NaN -> 0; keep a float dtype throughout
dfgenre = pd.DataFrame(records, index = s1.index).fillna(0).astype(float)
# sort the columns so their order does not depend on set/dict iteration order
dfgenre = dfgenre.reindex(columns = sorted(dfgenre.columns))
genres = dfgenre.columns.values

In [None]:
# number of movies carrying each genre (column sums of the indicator matrix), rarest first
dfgenre.sum(axis = 0).sort_values()

In [None]:
# Foreign, Short and Documentary have only a few data points each, so drop them:
# ind1 selects the rows (movies) that carry none of those genres,
# col1 selects every column other than those three
rare_genres = ['Foreign', 'Short', 'Documentary']
ind1 = ~dfgenre[rare_genres].any(axis = 1)
col1 = np.in1d(dfgenre.columns, rare_genres, invert = True)

# focus on rotten tomatoes for now (it's the most well known, even if it's not the best metric)
# ind2 keeps movies whose fresh AND rotten counts are both present and non-zero,
# because a score of exactly 0 or 1 cannot go through the logistic mapping
unusable = ['N/A', '0']
has_fresh = np.in1d(df2['tomatoFresh'], unusable, invert = True)
has_rotten = np.in1d(df2['tomatoRotten'], unusable, invert = True)
ind2 = has_fresh & has_rotten

In [None]:
# assemble the design matrix and target for the linear regression model
keep = ind1 & ind2   # movies that pass both the genre filter and the review-count filter
X = dfgenre.loc[keep, col1]
# review counts are compared against string values upstream, so convert them to numbers here
fresh = df2.loc[keep, 'tomatoFresh'].map(float)
rotten = df2.loc[keep, 'tomatoRotten'].map(float)
# recompute the tomatometer as a fraction so it has more than two digits of precision
y = fresh / (fresh + rotten)

In [None]:
# normalize row vectors to be the same length in genre-space
# Start from the raw 0/1 indicator matrix rather than from X itself: the previous
# form (X = X.apply(...)) normalized whatever X currently held, so running this
# cell a second time divided the already-normalized rows again.
Xraw = dfgenre.loc[ind1 & ind2, col1]
# number of genres per movie; a 0/1 row with n ones has Euclidean length sqrt(n)
numgenres = Xraw.sum(axis = 1)
# divide every row by its own length (axis = 0 aligns the divisor with the row index)
X = Xraw.div(np.sqrt(numgenres), axis = 0)

In [None]:
# inverse logistic (logit) transform: maps scores in (0,1) onto (-inf,+inf)
def fun(y):
    """Return log(y / (1 - y)); accepts a scalar, numpy array or pandas Series."""
    odds = y / (1 - y)
    return np.log(odds)
def ifun(yprime):
    """Logistic function 1 / (1 + exp(-yprime)); maps (-inf,+inf) back onto (0,1)."""
    denominator = 1 + np.exp(-yprime)
    return 1 / denominator

In [None]:
# histogram of the target before the logit transformation
fig, ax = plt.subplots()
ax.hist(y)
ax.set_xlabel('Tomatometer Score')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# histogram of the target after the logit transformation
fig, ax = plt.subplots()
ax.hist(fun(y))
ax.set_xlabel('Logistically Mapped Tomatometer Score')
ax.set_ylabel('Number of Movies')
plt.show()

In [None]:
# fit a linear regression model, using cross-validation to pick regularization parameter
k = 10
# Seed the shuffle: without random_state the folds differ on every run, so the
# selected alpha (and the learning curves, which reuse kf) were not reproducible.
kf = cross_validation.KFold(y.size, k, shuffle = True, random_state = 0)
# the model is fitted on the logit-transformed score, not on the raw fraction
reg = linear_model.RidgeCV(alphas = (0, 0.03, 0.1, 0.3, 1, 3, 10), cv = kf).fit(X,fun(y))
# show the regularization strength chosen by cross-validation
reg.alpha_

In [None]:
# fit a new model using the alpha selected by cross-validation
# The value used to be hard-coded as 3, which silently goes stale whenever the
# cross-validation picks something else. On the first run reg is the RidgeCV
# fit and exposes alpha_; on a re-run reg is already the Ridge model from this
# cell, whose alpha attribute is reused so the cell stays re-runnable.
best_alpha = getattr(reg, 'alpha_', getattr(reg, 'alpha', 3))
reg = linear_model.Ridge(alpha = best_alpha).fit(X,fun(y))

In [None]:
# look at coefficients and intercept
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print(reg.intercept_)
reg.coef_

In [None]:
# R-square of the ridge model on the data it was fitted to (should be greater than zero)
r_squared = reg.score(X, fun(y))
r_squared

In [None]:
# summary table: one row per retained genre, strongest positive weight first
kept_genres = genres[col1]

# mean (untransformed) score of the movies that carry each genre
means = np.array([y[X[name] != 0].mean() for name in kept_genres])

dfprint = pd.DataFrame(
    {
        'Weights': reg.coef_,
        # model prediction for intercept + this genre's weight, mapped back to (0,1)
        'Pure Genre Score': ifun(reg.coef_ + reg.intercept_),
        'Average Movie Score': means,
    },
    index = pd.Index(kept_genres, name = 'Genres'),
    columns = ['Weights', 'Pure Genre Score', 'Average Movie Score'],
)
# counts come from the un-normalized indicator matrix and are aligned on the genre index
dfprint['Number of Movies'] = dfgenre.loc[ind1 & ind2, col1].sum()
dfprint = dfprint.sort_values('Weights', ascending = False)
dfprint

In [None]:
# compute learning curves using K-fold cross-validation
# training-set fractions from 0.05 upward in steps of 0.05 (np.arange excludes the stop value 1)
train_fractions = np.arange(0.05, 1, 0.05)
size, train, test = learning_curve(
    reg, X, fun(y),
    train_sizes = train_fractions,
    cv = kf,
)

In [None]:
# plot the learning curves: score averaged over the folds, against training-set size
fig, ax = plt.subplots()
for scores, label in ((train, 'Training Set'), (test, 'CV Set')):
    ax.plot(size, scores.mean(axis = 1), label = label)
legend = ax.legend(loc = 'lower right')
legend.get_frame().set_linewidth(2)
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Average R-square')
ax.set_title('Learning Curves with K-fold CV = %d' % k)
plt.show()

In [None]:
# compare predictions with the actual scores; the model predicts in logit space,
# so map its output back to (0,1) before plotting
ypred = ifun(reg.predict(X))
fig, ax = plt.subplots()
ax.scatter(y, ypred)
ax.set_xlabel('Actual Tomatometer Score')
ax.set_ylabel('Predicted Tomatometer Score')
plt.show()

In [None]:

# just the genre data is interesting, try unsupervised learning
# fit a PCA model with default settings to the genre matrix X
pca = decomposition.PCA()
pca = pca.fit(X)

In [None]:
# take a look at the means and components for one of the eigenvectors
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print(pca.mean_)
pca.components_[0,:]

In [None]:
# project the data into PCA-space
Xpca = pca.transform(X)

# spread of the data along each principal axis, measured as the standard
# deviation of the projected coordinates (one value per component)
eigval = Xpca.std(axis = 0)

# build one PCA-space point per component, lying on that component's axis at a
# distance of one standard deviation, and map the points back to genre-space to
# see what "movie" each eigenvector would represent
vec = pca.inverse_transform(np.diag(eigval))

In [None]:
# find, for every eigenvector "movie" in vec, the num real movies closest to it
# (i.e. the best examples of each meta-genre)
num = 10
knn = neighbors.NearestNeighbors(n_neighbors = num)
knn = knn.fit(X)
# dist and indknn each hold one row per row of vec and num columns
dist, indknn = knn.kneighbors(vec)

In [None]:
# print out some useful results from the pca
dftemp = dfgenre.loc[ind1 & ind2, col1]   # un-normalized 0/1 genre flags of the retained movies
kept_genres = genres[col1]
kept_titles = movies[ind1 & ind2].values   # loop-invariant, so look it up once

# Report at most 20 meta-genres. vec has one row per PCA component, so cap the
# loop there; a fixed range(20) raises IndexError when fewer components exist.
for row in range(min(20, vec.shape[0])):
    # list eigenvectors in decreasing order of explained variance
    print('Percent variance explained = %g' % pca.explained_variance_ratio_[row])

    # print a table listing the eigenvector components next to their corresponding genre,
    # largest component first; the one-row frame is built directly because
    # DataFrame.append no longer exists in pandas 2.0
    ind = vec[row,:].argsort()[::-1]
    dfprint = pd.DataFrame([vec[row,ind]], index = ['Components'], columns = kept_genres[ind])
    display(dfprint)

    # print the titles of the nearest movies, along with their genres
    mov = kept_titles[indknn[row,:]]
    print('Example movies:')
    for tempind in range(num):
        gen = ', '.join(dftemp.columns[dftemp.values[indknn[row,tempind]] == 1])
        print('%-40s (%s)' % (mov[tempind], gen))
    print('')