In [1]:
import pandas as pd
import numpy as np
import urllib
from bs4 import BeautifulSoup
import time
import re
# Use package tmdbsimple to extract data
import tmdbsimple as tmdb
# use "!pip install tmdbsimple" to install
tmdb.API_KEY = '302f3815bea132a8bfe0d7301c9065dd'
import random
import sys
import time
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import f1_score
import scipy as sp
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score



In [2]:
# read in datasets
x_test = pd.read_csv('x_test.csv',index_col=0)
x_train = pd.read_csv('x_train.csv',index_col=0)
y_test = pd.read_csv('y_test.csv',index_col=0)
y_train = pd.read_csv('y_train.csv',index_col=0)

In [3]:
# Function f1_genre
# input: two pandas dataframes, 
    # genre_real: predicted values
    # genre_predict: real values
# output: mean f1 score of each class
def f1_genres(genre_real, genre_predict):
    count_row = len(genre_real)
    if count_row == 0:
        print "No data in dataframe!"
        return
    if count_row != len(genre_predict):
        print "Different length of predicted and real dataframes!"
        return
    count_col = len(genre_real.columns)
    if count_col == 0:
        print "No data in dataframe!"
        return
    if count_col != len(genre_predict.columns):
        print "Different genres of predicted and real dataframes!"
        return
    score = 0
    for i in range(count_col):
        score += f1_score(genre_real[genre_real.columns.values[i]], genre_predict[genre_predict.columns.values[i]])
    score = score/count_col
    return(score)

In [6]:
scaler = preprocessing .StandardScaler().fit(x_train)
x_train_np = scaler.transform(x_train)
x_test_np = scaler.transform(x_test)

indexs_train = x_train.index
indexs_test = x_test.index
x_train = pd.DataFrame(x_train_np, index = indexs_train, columns = x_train.columns)
x_test = pd.DataFrame(x_test_np, index = indexs_test, columns = x_test.columns)

### methods support multilabel classification

In [7]:
# KNN
knn = KNN(n_neighbors=1)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
y_pred= pd.DataFrame(y_pred, columns = y_test.columns.values)
score_knn = f1_genres(y_test, y_pred)
score_knn

0.2734998744506037

In [8]:
# Decision Tree
tree = DecisionTree(max_depth=6)
tree.fit(x_train, y_train)

y_pred = tree.predict(x_test)
y_pred= pd.DataFrame(y_pred, columns = y_test.columns.values)
score_tree = f1_genres(y_test, y_pred)
score_tree

  'precision', 'predicted', average, warn_for)


0.16952043858096622

In [9]:
# Random Forest
rf = RandomForest()
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
y_pred= pd.DataFrame(y_pred, columns = y_test.columns.values)
score_rf = f1_genres(y_test, y_pred)
score_rf

0.1353949600455715

### Other classifiers
These classifiers don't support multilabel classification, thus need to fit classifier for each genre, then combine

In [10]:
#Unweighted logistic regression

genre_pred = pd.DataFrame(index = x_test.index) # dataframe to store predicted values

for col in y_train.columns:
    unweighted_logistic = LogisticRegression()
    unweighted_logistic.fit(x_train, y_train[col])
    genre_pred[col]= unweighted_logistic.predict(x_test)

score_unweighted_log = f1_genres(y_test, genre_pred)
print score_unweighted_log

0.310900618511


In [11]:
#weighted logistic regression

genre_pred = pd.DataFrame(index = x_test.index) # dataframe to store predicted values

for col in y_train.columns:
    weighted_logistic = LogisticRegression(class_weight='balanced')
    weighted_logistic.fit(x_train, y_train[col])
    genre_pred[col]= weighted_logistic.predict(x_test)

score_weighted_log = f1_genres(y_test, genre_pred)
print score_weighted_log

0.379186272696


In [12]:
#LDA

genre_pred = pd.DataFrame(index = x_test.index) # dataframe to store predicted values

for col in y_train.columns:
    lda = LDA()
    lda.fit(x_train, y_train[col])
    genre_pred[col]= lda.predict(x_test)

score_lda = f1_genres(y_test, genre_pred)
print score_lda



0.313788098572


In [13]:
#QDA

genre_pred = pd.DataFrame(index = x_test.index) # dataframe to store predicted values

for col in y_train.columns:
    qda = QDA()
    qda.fit(x_train, y_train[col])
    genre_pred[col]= qda.predict(x_test)

score_qda = f1_genres(y_test, genre_pred)
print score_qda



0.193498634524


In [15]:
# SVM

genre_pred = pd.DataFrame(index = x_test.index) # dataframe to store predicted values

for col in y_train.columns:
    svm = SVC(C=5, class_weight='balanced')
    svm.fit(x_train, y_train[col])
    genre_pred[col]= svm.predict(x_test)

score_svm = f1_genres(y_test, genre_pred)
print score_svm

0.386071061144


In [16]:
#Score Dataframe
score_df = pd.DataFrame({'knn': score_knn, 
                         'tree': score_tree,
                         'rf': score_rf,
                         'unweighted logistic': score_unweighted_log,
                         'weighted logistic': score_weighted_log,
                         'lda': score_lda,
                         'qda': score_qda,                        
                         'weighted svm': score_svm}, index = ['f1_score'])
score_df

Unnamed: 0,knn,lda,qda,rf,tree,unweighted logistic,weighted logistic,weighted svm
f1_score,0.2735,0.313788,0.193499,0.135395,0.16952,0.310901,0.379186,0.386071


#### By briefly checking , we found that weighted logistic regression and weighted SVM performs best here
#### Thus we tune the parameter C for each columns to optimize its performance

In [17]:
# function F_score takes model, predictors X and true y values and returns f1_score
# this function is modified to suit cross validation format

def F_score_cv(model, X, y_true):
    y_predict = model.predict(X)
    score = f1_score(y_predict, y_true)
    return score

In [20]:
genre_pred = pd.DataFrame(index = x_test.index) # dataframe to store predicted values

for col in y_train.columns:
    k_cv = 5
    score_cv = []
    for i in range(-6, 7, 2):
        # fit regularized logistic regression model on training set      
        weighted_logistic = LogisticRegression(class_weight='balanced', C = 10**i)
        score_cv += [sum(cross_val_score(weighted_logistic, x_train, y_train[col], cv = k_cv, scoring = F_score_cv)) / k_cv]
    # find best score and corresponding tuning parameter
    max_value = max(score_cv)
    max_index = score_cv.index(max_value)
    C_best = 10**(max_index -7)
    print col, C_best
    weighted_logistic = LogisticRegression(class_weight='balanced',  C = C_best)
    weighted_logistic.fit(x_train, y_train[col])
    genre_pred[col]= weighted_logistic.predict(x_test)

score_weighted_log = f1_genres(y_test, genre_pred)
print score_weighted_log

Action 0.01
Adventure 0.001
Comedy 0.0001
Crime 0.01
Fantasy 0.001
Family 10
Romance 0.001
Horror 1000
Western 100000
Documentary 0.1
Biography 0.01
Drama 0.001
Animation 100
Sci-Fi 100
Thriller 0.001
Short 0.01
Mystery 0.1
Sport 0.01
War 0.01
History 1
Music 10
Foreign 100
Other 0.001
0.372648609453


In [21]:
# Function score_genre
# input: two pandas dataframes, 
    # genre_real: predicted values
    # genre_predict: real values
# output: mean accuracy of prediction
# accuracy score here is defined as 
    # (intersection between real and predicted vectors) / (union between real and predicted vectors)

def score_genre(genre_real, genre_predict):
    count_row = len(genre_real)
    if count_row == 0:
        print "No data in dataframe!"
        return
    if count_row != len(genre_predict):
        print "Different length of predicted and real dataframes!"
        return
    count_col = len(genre_real.columns)
    if count_col == 0:
        print "No data in dataframe!"
        return
    if count_col != len(genre_predict.columns):
        print "Different genres of predicted and real dataframes!"
        return
    accuracy_genre = 0.0
    for i in range(count_row):
        count_intersection = 0.0
        count_unity = 0.0
        accuracy_temp = 0.0
        for j in range(len(genre_real.columns)):
            if genre_real.iloc[i][j] == 1 or genre_predict.iloc[i][j] == 1:
                count_unity += 1.0
            if genre_real.iloc[i][j] == 1 and genre_predict.iloc[i][j] == 1:
                count_intersection += 1.0 
        if count_unity == 0: # a few ovservations has no genre assigned, delete these values from evaluation
            count_row = count_row - 1
        else:
            accuracy_temp = count_intersection / count_unity
            accuracy_genre += accuracy_temp
    if count_row <= 0:
        print "No meaning value!"
        return
    return (accuracy_genre/count_row)

In [22]:
score_genre(y_test, genre_pred)

0.2712393269038586

-----