In [None]:
import numpy as np
import pandas as pd

#for reading in data properly
import ast
import json

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
#English stop-word set; read by cleanText and by the TfidfVectorizer built in MultiLabelLogisitcRegression
stop_words = set(stopwords.words('english'))

import time

Read in the data

In [None]:
#load the training table and keep only the rows that have both an overview
#(the text we train on) and a genres entry (the labels)
all_data = pd.read_csv('train.csv').dropna(subset=['overview', 'genres'])

#genre names collected so far; seeded with 'Comedy' and extended by parse_all_genres_json
genre_set = set(['Comedy'])

In [None]:
#shared parser for the three genre helpers below
def _load_genre_entries(x):
    """Parse one raw ``genres`` cell into the sequence of genre records it encodes.

    The cell is first read as a Python literal, which copes with single-quoted
    dicts and with names that contain an apostrophe. If that fails, fall back
    to the quote-swapping JSON parse this notebook used originally (that path
    still handles JSON-only tokens such as true/false/null).

    Raises whatever the fallback parse raises when the cell is unreadable;
    the public helpers below turn that into their '' failure value.
    """
    try:
        return ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return json.loads(x.replace("'", '"'))


#get set of all genres in dataset
def parse_all_genres_json(x):
    """Add every genre name found in one raw ``genres`` cell to ``genre_set``.

    Returns None on success and '' when the cell cannot be parsed. Names read
    before a malformed record is hit stay in ``genre_set``.
    """
    try:
        for entry in _load_genre_entries(x):
            genre_set.add(entry['name'])
    except Exception:  #malformed cell: skip it, but let KeyboardInterrupt/SystemExit through
        return ''


#parse genres of specific example in dataset to get their label vector
def parse_genres_json(x):
    """Turn one raw ``genres`` cell into a 0/1 list with one slot per genre_dict entry.

    Slot ``genre_dict[name]`` is set to 1 for every genre name in the cell.
    Returns '' when the cell cannot be parsed or names a genre missing from
    ``genre_dict``.
    """
    try:
        entries = _load_genre_entries(x)
        ret = [0] * len(genre_dict)
        for entry in entries:
            ret[genre_dict[entry['name']]] = 1
        return ret
    except Exception:  #malformed cell or unknown genre
        return ''


#get list of labels this row has in string format
def get_labels_as_strs(x):
    """Return the genre names of one raw ``genres`` cell as a list of strings.

    Returns '' when the cell cannot be parsed.
    """
    try:
        return [entry['name'] for entry in _load_genre_entries(x)]
    except Exception:  #malformed cell
        return ''

In [None]:
def getAllGenres():
    """Run parse_all_genres_json over every entry of all_data['genres'].

    The per-row results are discarded and nothing is returned.
    """
    all_data['genres'].apply(parse_all_genres_json)

In [None]:
#apply parse_all_genres_json to every row of all_data['genres'] (it adds the genre names to genre_set)
getAllGenres()

In [None]:
#get set to dictionary for indexing of label vectors
#sorted() pins the genre -> column mapping: iteration order of a set of strings
#can differ between interpreter sessions (string hash randomization), which
#would silently shuffle the label columns from one run to the next
genre_dict = {genre: index for index, genre in enumerate(sorted(genre_set))}

In [None]:
def getGenresVects():
    """Add two label columns to all_data, both computed from all_data['genres'].

    genres_vect   -- parse_genres_json applied to each row
    genres_labels -- get_labels_as_strs applied to each row
    """
    raw_genres = all_data['genres']
    all_data['genres_vect'] = raw_genres.apply(parse_genres_json)
    #not currently used but could be useful
    all_data['genres_labels'] = raw_genres.apply(get_labels_as_strs)

In [None]:
#get labels in vector form: adds the genres_vect and genres_labels columns to all_data
getGenresVects()

In [None]:
#put to lower case, remove punctation, remove stopwords
def cleanText(text, stop_word_set=None):
    """Return ``text`` lower-cased, without stop words and without punctuation.

    text          -- the string to clean
    stop_word_set -- words to drop (compared in lower case); defaults to the
                     module-level ``stop_words``

    Words are compared in lower case, so capitalised stop words such as
    "The" or "And" are removed as well. After the stop-word pass, every
    character other than ASCII letters, digits and spaces is deleted.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    kept_words = [w for w in text.split() if w.lower() not in stop_word_set]
    text = ' '.join(kept_words)
    text = re.sub(r'[^a-z A-Z0-9]', "", text)
    return text.lower()

#clean up the overview field and put it in cleanOverview
#(cleanText is applied to each row's overview; the original overview column is left as is)
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [None]:
#logistic regression data: the cleaned overview text plus the two label columns
lr_data = all_data[['cleanOverview', 'genres_labels', 'genres_vect']]

Train/test split, then extraction of the text features and label vectors

In [None]:
#hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

In [None]:
#model input for each split: the cleaned overview text
X_train = train['cleanOverview']
X_test = test['cleanOverview']

In [None]:
#convert each split's genres_vect column (one list per row) into a numpy array
train_targets_arr = np.array(train['genres_vect'].tolist())
test_targets_arr = np.array(test['genres_vect'].tolist())

Define a class that performs multilabel logistic regression by wrapping one Pipeline per genre, each made of a TF-IDF vectorizer and a OneVsRest logistic regression classifier

In [None]:
class MultiLabelLogisitcRegression():
    """Multilabel classifier built from one independent binary pipeline per genre.

    Each pipeline is TfidfVectorizer -> OneVsRestClassifier(LogisticRegression)
    and is trained on a single column of the label matrix; predictions are
    written back into that same column of the output matrix.
    """

    def __init__(self, genre_dict):
        """Create one untrained pipeline per key of ``genre_dict``.

        genre_dict -- maps genre name -> column index in the label matrix
        """
        self.genre_dict = genre_dict
        self.pipelines = {}
        #TfidfVectorizer documents stop_words as 'english', a list or None,
        #so pass the module-level stop-word set as a (sorted) list
        stop_word_list = sorted(stop_words)
        for category in self.genre_dict.keys():
            self.pipelines[category]=Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_word_list)),
                #multi_class='ovr' is dropped: liblinear only does one-vs-rest and the
                #argument is deprecated in recent scikit-learn releases
                ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear'), n_jobs=1)),
            ])

    def fit(self, X_train, train_targets_arr):
        """Train every genre pipeline on its own column of ``train_targets_arr``.

        X_train           -- the training texts
        train_targets_arr -- 2-D 0/1 array indexed as [row, genre column]

        Prints progress per genre and the total training time.
        """
        start = time.time()
        #use this instance's own mapping, not whatever global genre_dict happens to exist
        for category, column in self.genre_dict.items():
            print('Processing {}'.format(category))
            self.pipelines[category].fit(X_train, train_targets_arr[:, column])
        end = time.time()
        print('Time to train ' + str(end-start) + ' seconds')

    def predict(self, X_test):
        """Return an int matrix (rows of X_test x genres) of each pipeline's predict() output."""
        n_samples = X_test.shape[0]
        Ret = np.zeros((n_samples, len(self.genre_dict)), dtype='int')
        for category, column in self.genre_dict.items():
            try:
                prediction = self.pipelines[category].predict(X_test)
            except Exception: #exception we get is it was trained with data that was only 0 label during cross validation
                prediction = np.zeros(n_samples, dtype=int)
            Ret[:, column] = prediction
        return Ret

    #unbalanced data so allow prediction with given threshold
    def predict_threshold(self, X_test, threshold):
        """Return an int matrix with 1 where the positive-class probability is >= ``threshold``.

        A pipeline whose predict_proba fails contributes probability 0 for every row.
        """
        n_samples = X_test.shape[0]
        Ret = np.zeros((n_samples, len(self.genre_dict)), dtype='int')
        for category, column in self.genre_dict.items():
            try:
                positive_proba = self.pipelines[category].predict_proba(X_test)[:, 1]
            except Exception: #exception we get is it was trained with data that was only 0 label during cross validation
                positive_proba = np.zeros(n_samples)
            Ret[:, column] = (positive_proba >= threshold).astype(int)
        return Ret

Evaluation metric definitions and printing functions

In [None]:
#size of intersection of predicted and actual labels divided by size of their union for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    """Mean per-row Jaccard score between real and predicted 0/1 label matrices.

    real_labels_matrix, predictions_labels_matrix -- integer or boolean
    matrices of identical shape (rows = datapoints, columns = labels).

    A row with no real and no predicted labels counts as a perfect match
    (1.0) instead of producing 0/0 = NaN. Returns 0.0 for an input without rows.
    """
    real = np.asarray(real_labels_matrix)
    predicted = np.asarray(predictions_labels_matrix)
    n_rows = real.shape[0]
    if n_rows == 0:
        return 0.0
    #binary so set intersection is the and operator, set union the or operator
    intersection_sizes = (real & predicted).sum(axis=1)
    union_sizes = (real | predicted).sum(axis=1)
    #start from 1.0 everywhere and overwrite only the rows whose union is non-empty
    row_wise_accuracy = np.ones(n_rows, dtype=float)
    np.divide(intersection_sizes, union_sizes, out=row_wise_accuracy, where=union_sizes > 0)
    return row_wise_accuracy.mean()

#size of intersection of predicted and actual labels divided by size of predicted set for each datapoint tested on
#sum those and divide by number of datapoints
#if no predicted labels, don't count that row towards the precision as that would be undefined
def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    """Mean per-row precision over the rows that have at least one predicted label.

    Returns 0 when no row has any predicted label.
    """
    #binary so set intersection is and operator
    overlap = real_labels_matrix & predictions_labels_matrix
    row_precisions = []
    for row_index, overlap_row in enumerate(overlap):
        predicted_count = sum(predictions_labels_matrix[row_index])
        if predicted_count > 0: #only rows with at least one prediction are scored
            row_precisions.append(sum(overlap_row) / predicted_count)
    if not row_precisions:
        return 0#no labels predicted at all will give us 0 precision as precision makes no sense here
    return sum(row_precisions) / len(row_precisions)

#size of intersection of predicted and actual labels divided by size of real label set for each datapoint tested on
#sum those and divide by number of datapoints
#all datapoints should have at least 1 real label in this data set
#vectorized for speed
def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    """Mean per-row recall over the rows that have at least one real label.

    real_labels_matrix, predictions_labels_matrix -- integer or boolean
    matrices of identical shape (rows = datapoints, columns = labels).

    A row without real labels has undefined recall (0/0); it is left out of
    the average rather than turning the whole metric into NaN, mirroring how
    multi_label_precision skips rows without predictions. When every row has
    a real label the result equals the plain mean over all rows. Returns 0
    when no row has a real label.
    """
    real = np.asarray(real_labels_matrix)
    predicted = np.asarray(predictions_labels_matrix)
    #binary so set intersection is and operator
    intersection_sizes = (real & predicted).sum(axis=1)
    real_sizes = real.sum(axis=1)
    has_real_labels = real_sizes > 0
    if not has_real_labels.any():
        return 0
    row_wise_recall = intersection_sizes[has_real_labels] / real_sizes[has_real_labels]
    return row_wise_recall.mean()

#lower is better
def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    """Fraction of (row, label) positions where the real and predicted matrices disagree."""
    mismatches = np.logical_xor(real_labels_matrix, predictions_labels_matrix)
    n_rows = real_labels_matrix.shape[0]
    n_labels = real_labels_matrix.shape[1]
    return mismatches.sum() / (n_rows * n_labels)


In [None]:
def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    """Print accuracy, precision and recall for each genre column on its own.

    Columns are looked up through the module-level genre_dict
    (genre name -> column index); a blank line follows each genre.
    """
    scorers = (
        ("Accuruacy for ", accuracy_score),
        ("Precision for ", precision_score),
        ("Recall for ", recall_score),
    )
    for genre, column in genre_dict.items():
        truth = real_labels_matrix[:, column]
        guess = predictions_labels_matrix[:, column]
        for prefix, scorer in scorers:
            print(prefix + genre + ": " + str(scorer(truth, guess)))
        print()

In [None]:
def get_all_metrics(actual_labels, predictions):
    """Print the per-label metrics followed by the multilabel metrics.

    actual_labels -- 0/1 matrix of real labels (rows = datapoints, columns = genres)
    predictions   -- 0/1 matrix of predicted labels, same shape

    Every metric, including the Hamming-based percentage on the last line, is
    computed against ``actual_labels``.
    """
    print('Getting evaluation metrics for each label:')
    get_per_label_metrics(actual_labels, predictions)
    print('Getting evaluations for multilabel problem')
    print('Multilabel accuracy: ' + str(multi_label_accuracy(actual_labels, predictions)))
    print('Multilabel precision: ' + str(multi_label_precision(actual_labels, predictions)))
    print('Multilabel recall: ' + str(multi_label_recall(actual_labels, predictions)))
    #use the labels passed in, not the global test_targets_arr
    print("Percent of correctly decided label decisions: " + str(100* (1-hamming_loss(actual_labels, predictions))))

In [None]:
#baseline: fit one pipeline per genre on the training split, then label the
#test split with predict() (each pipeline's own decision, no custom threshold)
multi = MultiLabelLogisitcRegression(genre_dict)
multi.fit(X_train, train_targets_arr)
results = multi.predict(X_test)

In [None]:
#print per-label and multilabel metrics for the baseline predictions
get_all_metrics(test_targets_arr, results)

Notice the poor multilabel metrics despite the high accuracy on each label when considered alone. Use cross-validation to find a better decision threshold than 0.5.

In [None]:
#k fold cross validation with threshold
def kFoldCrossValidation(X, y, folds, threshold):
    """Average multilabel accuracy of predict_threshold over ``folds`` held-out folds.

    Folds are contiguous, unshuffled slices of len(X)//folds items; the last
    fold also takes whatever remains. A fresh MultiLabelLogisitcRegression
    (built from the module-level genre_dict) is trained for every fold.
    """
    print("Doing cross validation for threshold = " + str(threshold))
    fold_size = len(X)//folds
    fold_scores = []
    for fold in range(folds):
        print("Iteration " + str(fold+1) + " of " + str(folds) + " fold cross validation")
        fold_start = fold*fold_size
        if fold == folds-1:
            #last fold: hold out everything from fold_start on, train on all that precedes it
            held_out_data, held_out_y = X[fold_start:], y[fold_start:]
            fold_train_X, fold_train_y = X[0:fold_start], y[0:fold_start]
        else:
            fold_stop = fold_start+fold_size
            held_out_data, held_out_y = X[fold_start:fold_stop], y[fold_start:fold_stop]
            #training data is everything before and after the held-out slice
            fold_train_X = np.append(X[0:fold_start], X[fold_stop:], axis=0)
            fold_train_y = np.append(y[0:fold_start], y[fold_stop:], axis=0)
        fold_model = MultiLabelLogisitcRegression(genre_dict)
        fold_model.fit(fold_train_X, fold_train_y)
        fold_predictions = fold_model.predict_threshold(held_out_data, threshold)
        fold_scores.append(multi_label_accuracy(held_out_y, fold_predictions))
    return sum(fold_scores) / folds #sum accross all folds and divide by number of folds

In [None]:
#find best threshold looking from .3 to .7 in intervals of .05
#the grid is built from integer percentages: repeatedly adding .05 to a float
#drifts (0.39999999999999997, ...) and ends at 0.7000000000000001, so the
#`<= .7` loop this replaces never evaluated the .7 threshold
candidate_thresholds = [percent / 100 for percent in range(30, 71, 5)]
best_threshold_acc = 0
best_threshold = .5 #default is .5
start = time.time()
for test_threshold in candidate_thresholds:
    acc = kFoldCrossValidation(X_train, train_targets_arr, 5, test_threshold)
    print("Accuracy: " + str(acc))
    if acc > best_threshold_acc:
        best_threshold_acc = acc
        best_threshold = test_threshold
end = time.time()
print('Time to run k fold cross validation to find best threshold ' + str(end-start))

In [None]:
print('Best threshold for multilabel accuracy: ' + str(best_threshold))
classifier = MultiLabelLogisitcRegression(genre_dict)
classifier.fit(X_train, train_targets_arr)
#predict with the classifier fitted in this cell, not the `multi` object left over from an earlier cell
predictions = classifier.predict_threshold(X_test, best_threshold)

In [None]:
#print per-label and multilabel metrics for the predictions made with best_threshold
get_all_metrics(test_targets_arr, predictions)