In [1]:
import numpy as np
import pandas as pd

#for reading in data properly
import ast
import json

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import time

Read in the data

In [2]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)

In [3]:
#parse each row to get label vectors from json
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #number of genres we are looking at
        for i in range(numElems):
            genre_str = (json_genres[i]['name'])
            if genre_str in genre_map.keys():
                ret[genre_dict[genre_map[genre_str]]] = 1
        return ret
    except Exception as excep:
        print('Exception' + str(excep))
        return ''

Get dictionary for genre to its index in label vector

In [4]:
genre_dict = {}
genre_dict['Action-Adventure'] = 0
genre_dict['Romance'] = 1
genre_dict['Horror-Thriller'] = 2
genre_dict['Comedy'] = 3
genre_dict['Science Fiction'] = 4
genre_dict['Drama'] = 5

In [5]:
#map original labels to more coarse grained labels
genre_map = {}
genre_map['Adventure'] = 'Action-Adventure'
genre_map['Romance'] = 'Romance'
genre_map['Horror'] = 'Horror-Thriller'
genre_map['Thriller'] = 'Horror-Thriller'
genre_map['Comedy'] = 'Comedy'
genre_map['Fantasy'] = 'Science Fiction'
genre_map['Action'] = 'Action-Adventure'
genre_map['Science Fiction'] = 'Science Fiction'
genre_map['Drama'] = 'Drama'

In [6]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret

In [7]:
getGenresVects() #get label vectors for genres indexed by indexes in genre_dict

In [8]:
#put to lower case, remove punctation, remove stopwords
def cleanText(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    text = ' '.join(no_stopword_text)
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text

all_data['cleanOverview'] = all_data['overview'].apply(cleanText)
all_data = all_data[all_data.genres_vect.map(sum) > 0] #remove rows that don't have labels anymore

In [9]:
#logistic regression data
lr_data = all_data[['cleanOverview', 'genres_vect']]

Train Test Split and getting text features and labels vectors

In [10]:
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

In [11]:
X_train = train.cleanOverview
X_test = test.cleanOverview

In [12]:
#convert labels from array of lists to numpy array

train_targets_arr = train['genres_vect'].tolist()
train_targets_arr = np.array(train_targets_arr)

test_targets_arr = test['genres_vect'].tolist()
test_targets_arr = np.array(test_targets_arr)

Define class that will do multilabel logistic regression by wrapping Pipelines of tfidf and OneVsRest Logistic Regression Classifiers

In [13]:
class MultiLabelLogisitcRegression():
    def __init__(self, genre_dict):
        self.genre_dict = genre_dict
        self.pipelines = {}
        for category in self.genre_dict.keys():
            self.pipelines[category]=Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear', multi_class='ovr'), n_jobs=1)),
            ])
        
    def fit(self, X_train, train_targets_arr):
        start = time.time()
        for category in self.genre_dict.keys():
            print('Processing {}'.format(category))
            self.pipelines[category].fit(X_train, train_targets_arr[:,genre_dict[category]])
        end = time.time()
        print('Time to train ' + str(end-start) + ' seconds')

    def predict(self, X_test):
        Ret = np.zeros((X_test.shape[0],len(self.genre_dict.keys())), dtype='int')
        for category in self.genre_dict.keys():
            try:
                prediction = self.pipelines[category].predict(X_test)
            except: #exception we get is it was trained with data taht was only 0 label during cross validation
                prediction = np.zeros(X_test.shape[0], dtype=int)
            Ret[:,self.genre_dict[category]] = prediction
        return Ret
    
    #unbalanced data so allow prediction with given threshold
    def predict_threshold(self, X_test, threshold):
        Ret = np.zeros((X_test.shape[0],len(self.genre_dict.keys())), dtype='int')
        for category in self.genre_dict.keys():
            try:
                prediction = self.pipelines[category].predict_proba(X_test)[:,1]
            except:#exception we get is it was trained with data taht was only 0 label during cross validation
                prediction = np.zeros(X_test.shape[0], dtype=int)
            prediction[prediction >=threshold] = 1
            prediction[prediction < threshold] = 0
            Ret[:,self.genre_dict[category]] = prediction
        return Ret

Evaluation metric definitions and printing functions

In [14]:
#size of intersection of predicted and actual labels divided by size of their union for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    union = real_labels_matrix | predictions_labels_matrix
    #sum(array.T) gets number of 1s in row
    row_wise_accuracy = sum(intersection.T) / sum(union.T)
    return sum(row_wise_accuracy) / real_labels_matrix.shape[0]

#size of intersection of predicted and actual labels divided by size of predicted set for each datapoint tested on
#sum those and divide by number of datapoints
#if no predicted labels, don't count that row towards the precision as that would be undefined
def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    precision_sum = 0
    num_rows = 0
    for row in range(intersection.shape[0]):
        if sum(predictions_labels_matrix[row]) > 0: #if there is at least one prediction for this row
            num_rows += 1
            precision_sum += sum(intersection[row]) / sum(predictions_labels_matrix[row])
    if num_rows == 0:
        return 0#no labels predicted at all will give us 0 precision as precision makes no sense here
    return precision_sum / num_rows

#size of intersection of predicted and actual labels divided by size of real label set for each datapoint tested on
#sum those and divide by number of datapoints
#all datapoints should have at least 1 real label in this data set
#vectorized for speed
def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    #sum(array.T) gets number of 1s in row
    row_wise_recall = sum(intersection.T) / sum(real_labels_matrix.T)
    return sum(row_wise_recall) / real_labels_matrix.shape[0]

#lower is better
def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    return (np.logical_xor(real_labels_matrix, predictions_labels_matrix)).sum()/(real_labels_matrix.shape[0] * real_labels_matrix.shape[1])


In [15]:
def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    for genre in genre_dict.keys():
        index = genre_dict[genre]
        real_labels_vect = real_labels_matrix[:, index]
        prediction_vect = predictions_labels_matrix[:,index]
        print("Accuruacy for " + genre + ": " + str(accuracy_score(real_labels_vect, prediction_vect)))
        print("Precision for " + genre + ": " + str(precision_score(real_labels_vect, prediction_vect)))
        print("Recall for " + genre + ": " + str(recall_score(real_labels_vect, prediction_vect)))
        print()

In [16]:
def get_all_metrics(actual_labels, predictions):
    print('Getting evaluation metrics for each label:')
    get_per_label_metrics(actual_labels, predictions)
    print('Getting evaluations for multilabel problem')
    print('Multilabel accuracy: ' + str(multi_label_accuracy(actual_labels, predictions)))
    print('Multilabel precision: ' + str(multi_label_precision(actual_labels, predictions)))
    print('Multilabel recall: ' + str(multi_label_recall(actual_labels, predictions)))
    print("Percent of correctly decided label decisions: " + str(100* (1-hamming_loss(actual_labels, predictions))))

do multilabel classification

In [17]:
multi = MultiLabelLogisitcRegression(genre_dict)
multi.fit(X_train, train_targets_arr)
results = multi.predict(X_test)

Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 1.4477214813232422 seconds


In [18]:
get_all_metrics(test_targets_arr, results)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.7118055555555556
Precision for Action-Adventure: 0.7948717948717948
Recall for Action-Adventure: 0.164021164021164

Accuruacy for Romance: 0.8038194444444444
Precision for Romance: 1.0
Recall for Romance: 0.017391304347826087

Accuruacy for Horror-Thriller: 0.7222222222222222
Precision for Horror-Thriller: 0.9069767441860465
Recall for Horror-Thriller: 0.2

Accuruacy for Comedy: 0.703125
Precision for Comedy: 0.8421052631578947
Recall for Comedy: 0.16243654822335024

Accuruacy for Science Fiction: 0.8298611111111112
Precision for Science Fiction: 1.0
Recall for Science Fiction: 0.0392156862745098

Accuruacy for Drama: 0.671875
Precision for Drama: 0.6373626373626373
Recall for Drama: 0.8027681660899654

Getting evaluations for multilabel problem
Multilabel accuracy: 0.3344907407407411
Multilabel precision: 0.6927570093457944
Multilabel recall: 0.35170717592592604
Percent of correctly decided label decisions: 7

Notice the poor multilabel metrics despite high accuracy on each label when considered alone. Do cross validation to find better threshold than .5

In [19]:
#k fold cross validation with threshold
def kFoldCrossValidation(X, y, folds, threshold):
    print("Doing cross validation for threshold = " + str(threshold))
    held_out_size = len(X)//folds
    multi_label_acc = 0
    for i in range(folds):
        print("Iteration " + str(i+1) + " of " + str(folds) + " fold cross validation")
        held_out_index = i*held_out_size
        if i == folds-1:
            held_out_data = X[held_out_index:]
            held_out_y = y[held_out_index:]
            iter_training_data = X[0:held_out_index]
            iter_y = y[0:held_out_index]
        else:
            held_out_data = X[held_out_index:held_out_index+held_out_size]
            held_out_y = y[held_out_index:held_out_index+held_out_size]
            iter_training_data = np.append(X[0:held_out_index], X[held_out_index+held_out_size:], axis=0)
            iter_y = np.append(y[0:held_out_index], y[held_out_index+held_out_size:], axis=0)
        multi = MultiLabelLogisitcRegression(genre_dict)
        multi.fit(iter_training_data, iter_y)
        predictions = multi.predict_threshold(held_out_data, threshold)
        multi_label_acc += multi_label_accuracy(held_out_y, predictions)
    return multi_label_acc / folds #sum accross all folds and divide by number of folds

In [20]:
#find best threshold looking from .3 to .7 in ntervals of .05
test_threshold = .3
best_threshold_acc = 0
best_threshold = .5 #default is .5
start = time.time()
while test_threshold <= .7:
    acc = kFoldCrossValidation(X_train, train_targets_arr, 5, test_threshold)
    print("Accuracy: " + str(acc))
    if acc > best_threshold_acc:
        best_threshold_acc = acc
        best_threshold = test_threshold
    test_threshold += .05
end = time.time()
print('Time to run k fold cross validation to find best threshold ' + str(end-start))

Doing cross validation for threshold = 0.3
Iteration 1 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.875037431716919 seconds
Iteration 2 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.7834410667419434 seconds
Iteration 3 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.8242762088775635 seconds
Iteration 4 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.830864667892456 seconds
Iteration 5 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processin

Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.6252717971801758 seconds
Iteration 2 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.6334588527679443 seconds
Iteration 3 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.6999564170837402 seconds
Iteration 4 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.5616471767425537 seconds
Iteration 5 of 5 fold cross validation
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 0.5750701427459717 seconds
Accura

In [21]:
print('Best threshold for multilabel accuracy: ' + str(best_threshold))
classifier = MultiLabelLogisitcRegression(genre_dict)
classifier.fit(X_train, train_targets_arr)
predictions = multi.predict_threshold(X_test, best_threshold)

Best threshold for multilabel accuracy: 0.35
Processing Action-Adventure
Processing Romance
Processing Horror-Thriller
Processing Comedy
Processing Science Fiction
Processing Drama
Time to train 1.1035399436950684 seconds


In [22]:
get_all_metrics(test_targets_arr, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.75
Precision for Action-Adventure: 0.6066350710900474
Recall for Action-Adventure: 0.6772486772486772

Accuruacy for Romance: 0.8350694444444444
Precision for Romance: 0.9545454545454546
Recall for Romance: 0.1826086956521739

Accuruacy for Horror-Thriller: 0.7552083333333334
Precision for Horror-Thriller: 0.640625
Recall for Horror-Thriller: 0.6307692307692307

Accuruacy for Comedy: 0.7135416666666666
Precision for Comedy: 0.5655737704918032
Recall for Comedy: 0.700507614213198

Accuruacy for Science Fiction: 0.8402777777777778
Precision for Science Fiction: 0.8571428571428571
Recall for Science Fiction: 0.11764705882352941

Accuruacy for Drama: 0.5208333333333334
Precision for Drama: 0.5115452930728241
Recall for Drama: 0.9965397923875432

Getting evaluations for multilabel problem
Multilabel accuracy: 0.47592592592592603
Multilabel precision: 0.5967881944444442
Multilabel recall: 0.6962094907407406
Percent 