In [1]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# English stop words from NLTK, held as a set; read by cleanText and by the
# TF-IDF vectorizers built further down.
stop_words = set(stopwords.words('english'))

In [2]:
# Load the training set and drop the rows that lack an overview (the model
# input) or a genres entry (the source of the labels).
all_data = pd.read_csv('train.csv').dropna(subset=['overview', 'genres'])
# NOTE(review): genre_set is not referenced in the cells visible here.
genre_set = {'Comedy'}

In [3]:
def text_to_list(x):
    """Evaluate a stringified Python literal (e.g. ``"[1, 2]"``).

    Missing values (anything ``pd.isna`` reports as missing) yield the empty
    string ``''`` instead of being parsed.
    """
    return '' if pd.isna(x) else ast.literal_eval(x)

def parse_json(x):
    """Return the ``'name'`` of the first entry in a stringified list of dicts.

    The text is parsed as a Python literal first, so single-quoted input and
    values that contain an apostrophe (``"Ocean's ..."``) are handled. Plain
    JSON, and the older "swap single for double quotes" form, are kept as
    fallbacks so that every input that parsed before still parses.

    Parameters
    ----------
    x : str
        Text such as ``"[{'id': 1, 'name': 'Comedy'}]"``.

    Returns
    -------
    The first entry's ``'name'`` value, or ``''`` when ``x`` is not a string,
    cannot be parsed, is empty, or has no ``'name'`` key.
    """
    if not isinstance(x, str):
        return ''

    loaders = (
        ast.literal_eval,
        json.loads,
        lambda text: json.loads(text.replace("'", '"')),
    )
    entries = None
    for load in loaders:
        try:
            entries = load(x)
            break
        except Exception:
            # Best effort: this format did not match, try the next one.
            continue

    try:
        return entries[0]['name']
    except (TypeError, IndexError, KeyError):
        return ''
    
def parse_genres_json(x):
    """Turn a stringified list of genre dicts into a multi-hot list.

    Single quotes are swapped for double quotes and the text is parsed as
    JSON. Every entry whose ``'name'`` is a key of the module-level
    ``genre_map`` switches on the slot given by
    ``genre_dict[genre_map[name]]``.

    Returns a list of 0/1 values of length ``len(genre_dict)``. If anything
    raises, the exception text is printed and ``''`` is returned instead.
    """
    try:
        entries = json.loads(x.replace("'", '"'))
        entry_count = len(entries)
        flags = [0] * len(genre_dict)  # one slot per tracked genre
        for position in range(entry_count):
            name = entries[position]['name']
            if name not in genre_map:
                # Genre is not one of the tracked ones.
                continue
            flags[genre_dict[genre_map[name]]] = 1
        return flags
    except Exception as excep:
        print('Exception' + str(excep))
        return ''

In [4]:
# Column index of every merged genre label in the multi-hot label vectors.
# 'Drama' (it would be index 5) is currently left out.
genre_dict = {
    label: column
    for column, label in enumerate(
        ('Action-Adventure', 'Romance', 'Horror-Thriller', 'Comedy', 'Science Fiction')
    )
}
genre_dict

{'Action-Adventure': 0,
 'Romance': 1,
 'Horror-Thriller': 2,
 'Comedy': 3,
 'Science Fiction': 4}

In [5]:
# Raw genre name as it appears in the data -> merged label (a key of genre_dict).
# 'War' (unsure whether it belongs under Action-Adventure) and 'Drama' are
# currently left out.
genre_map = {
    'Adventure': 'Action-Adventure',
    'Romance': 'Romance',
    'Horror': 'Horror-Thriller',
    'Thriller': 'Horror-Thriller',
    'Comedy': 'Comedy',
    'Action': 'Action-Adventure',
    'Science Fiction': 'Science Fiction',
}

In [6]:
def getGenresVects():
    """Encode the ``genres`` column of the module-level ``all_data`` frame.

    Runs ``parse_genres_json`` on every value, stores the result in
    ``all_data['genres_vect']`` (the frame is modified in place) and returns
    that same Series.
    """
    genre_vectors = all_data['genres'].apply(parse_genres_json)
    all_data['genres_vect'] = genre_vectors
    return genre_vectors

In [7]:
# Multi-hot genre label per row (positions follow the indexes in genre_dict);
# the call also stores the same Series in all_data['genres_vect'].
labels_vects = getGenresVects() #get label vectors for genres indexed by indexes in genre_dict

In [8]:
#drop stop words, remove punctuation, put to lower case
def cleanText(text, stop_word_set=None):
    """Normalise one overview string for the bag-of-words models.

    Steps, in order: split on whitespace, drop stop words, strip every
    character that is not an ASCII letter, digit or space, then lower-case.

    Stop words are matched case-insensitively, so capitalised ones such as
    "The" at the start of a sentence are dropped too.

    Parameters
    ----------
    text : str
        Raw text.
    stop_word_set : collection of str, optional
        Lower-case words to drop. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The cleaned, lower-cased text with words separated by single spaces.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    kept_words = [w for w in text.split() if w.lower() not in stop_word_set]
    text = ' '.join(kept_words)
    # Characters are deleted, not replaced by a space, so "well-known" becomes
    # "wellknown" (maybe punctuation between words should be kept apart here?).
    text = re.sub(r'[^a-z A-Z0-9]', "", text)
    return text.lower()
# Add the cleaned overview text, then keep only the movies tagged with at least
# one tracked genre. Rows whose genres failed to parse hold '' and also sum to
# 0, so they are dropped here as well.
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)
all_data = all_data.loc[all_data['genres_vect'].map(sum) > 0]

In [9]:
#logistic regression data: the cleaned overview text plus its multi-hot genre labels
lr_data = all_data[['cleanOverview', 'genres_vect']]

In [10]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is repeatable.
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

In [11]:
import multiprocessing

# Number of CPUs reported by the OS. Not referenced in the cells visible here;
# presumably meant for the Doc2Vec section (TODO confirm).
cores = multiprocessing.cpu_count()

In [12]:
from tqdm import tqdm

In [13]:
# Model inputs: the cleaned overview text of each split.
X_train, X_test = train['cleanOverview'], test['cleanOverview']

In [14]:
# Stack the per-row multi-hot label lists into arrays with one row per movie
# and one column per genre_dict index.
train_targets_arr = np.array(train['genres_vect'].tolist())

test_targets_arr = np.array(test['genres_vect'].tolist())

In [15]:
class MultiLabelLogisitcRegression():
    """One independent TF-IDF + logistic-regression pipeline per genre.

    Every genre column of the label matrix is treated as its own yes/no
    problem (binary relevance); ``predict`` stitches the per-genre answers
    back into a single matrix.

    Parameters
    ----------
    genre_dict : dict
        Maps genre name -> column index in the label matrix.
    """

    def __init__(self, genre_dict):
        self.genre_dict = genre_dict
        # scikit-learn documents ``stop_words`` as 'english', a list or None,
        # so hand it a list instead of the module-level set.
        stop_word_list = sorted(stop_words)
        self.pipelines = {}
        for category in self.genre_dict:
            self.pipelines[category] = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_word_list)),
                # Targets are binary and the solver is liblinear, so the model
                # is one-vs-rest without passing the deprecated
                # ``multi_class`` argument.
                ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear'), n_jobs=1)),
            ])

    def fit(self, X_train, train_targets_arr):
        """Fit each genre's pipeline on its own column of the label matrix.

        Parameters
        ----------
        X_train : sequence of str
            Training documents.
        train_targets_arr : 2-D array
            0/1 labels; column ``self.genre_dict[genre]`` belongs to ``genre``.

        Returns
        -------
        self
        """
        # Column indexes come from the instance's own mapping, not from a
        # module-level dict that might differ from the one passed in.
        for category, column in self.genre_dict.items():
            print('... Processing {}'.format(category))
            self.pipelines[category].fit(X_train, train_targets_arr[:, column])
        return self

    def predict(self, X_test):
        """Predict a 0/1 label matrix with one column per genre.

        Returns an int array of shape ``(n_samples, len(self.genre_dict))``.
        """
        n_samples = X_test.shape[0] if hasattr(X_test, 'shape') else len(X_test)
        Ret = np.zeros((n_samples, len(self.genre_dict)), dtype='int')
        for category, column in self.genre_dict.items():
            Ret[:, column] = self.pipelines[category].predict(X_test)
        return Ret

In [16]:
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    """Example-based multi-label accuracy (mean per-row Jaccard index).

    For every row: size of the intersection of the predicted and the real
    label sets divided by the size of their union. The row scores are then
    averaged over all rows.

    A row with neither real nor predicted labels has an empty union; the two
    (empty) sets agree exactly, so such a row scores 1.0 instead of producing
    0/0 = NaN.

    Parameters
    ----------
    real_labels_matrix, predictions_labels_matrix : 2-D array-like of 0/1
        Same shape, one row per datapoint and one column per label.

    Returns
    -------
    float
        Mean row score in [0, 1]; 0.0 when there are no rows.
    """
    real = np.asarray(real_labels_matrix).astype(bool)
    predicted = np.asarray(predictions_labels_matrix).astype(bool)
    n_rows = real.shape[0]
    if n_rows == 0:
        return 0.0

    # Binary labels: set intersection is AND, set union is OR.
    intersection_sizes = (real & predicted).sum(axis=1)
    union_sizes = (real | predicted).sum(axis=1)

    row_scores = np.ones(n_rows, dtype=float)
    nonempty = union_sizes > 0
    row_scores[nonempty] = intersection_sizes[nonempty] / union_sizes[nonempty]
    return float(row_scores.sum() / n_rows)

def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    """Example-based multi-label precision.

    For every row that has at least one predicted label: number of predicted
    labels that are also real, divided by the number of predicted labels.
    Rows without any prediction are left out, because precision is undefined
    for them. The result is the mean over the rows that were counted.

    Returns 0 when no row has a prediction at all.
    """
    # Binary labels, so set intersection is the AND operator.
    overlap = real_labels_matrix & predictions_labels_matrix
    row_scores = []
    for row_idx in range(overlap.shape[0]):
        predicted_count = sum(predictions_labels_matrix[row_idx])
        if predicted_count > 0:
            row_scores.append(sum(overlap[row_idx]) / predicted_count)
    if not row_scores:
        return 0
    return sum(row_scores) / len(row_scores)

def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    """Example-based multi-label recall.

    For every row: number of real labels that were predicted, divided by the
    number of real labels. The result is the mean over the rows.

    Every datapoint is expected to carry at least one real label. A row
    without any is undefined for recall (0/0) and is left out of the mean
    rather than turning the whole result into NaN.

    Parameters
    ----------
    real_labels_matrix, predictions_labels_matrix : 2-D array-like of 0/1
        Same shape, one row per datapoint and one column per label.

    Returns
    -------
    float
        Mean row recall in [0, 1]; 0.0 when no row has a real label.
    """
    real = np.asarray(real_labels_matrix).astype(bool)
    predicted = np.asarray(predictions_labels_matrix).astype(bool)

    real_counts = real.sum(axis=1)
    has_real_label = real_counts > 0
    n_counted = int(has_real_label.sum())
    if n_counted == 0:
        return 0.0

    # Binary labels: set intersection is AND.
    hit_counts = (real & predicted).sum(axis=1)
    row_recall = hit_counts[has_real_label] / real_counts[has_real_label]
    return float(row_recall.sum() / n_counted)

def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    """Fraction of individual label slots where prediction and truth differ.

    Lower is better: 0 means every slot matches.
    """
    mismatches = np.logical_xor(real_labels_matrix, predictions_labels_matrix)
    n_rows = real_labels_matrix.shape[0]
    n_labels = real_labels_matrix.shape[1]
    return mismatches.sum() / (n_rows * n_labels)


In [17]:
def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    """Print accuracy, precision and recall separately for every genre.

    For each entry of the module-level ``genre_dict``, the matching column is
    taken from both matrices and scored as an ordinary binary problem.

    Parameters
    ----------
    real_labels_matrix, predictions_labels_matrix : 2-D array of 0/1
        Same shape; column ``genre_dict[genre]`` belongs to ``genre``.
    """
    for genre, index in genre_dict.items():
        real_labels_vect = real_labels_matrix[:, index]
        prediction_vect = predictions_labels_matrix[:, index]
        print("Accuracy for {}: {}".format(genre, accuracy_score(real_labels_vect, prediction_vect)))
        print("Precision for {}: {}".format(genre, precision_score(real_labels_vect, prediction_vect)))
        print("Recall for {}: {}".format(genre, recall_score(real_labels_vect, prediction_vect)))
        print()

In [18]:
# Train one pipeline per genre on the training overviews, then predict a
# multi-hot genre matrix for the held-out overviews.
multi = MultiLabelLogisitcRegression(genre_dict)
multi.fit(X_train, train_targets_arr)
results = multi.predict(X_test)

... Processing Action-Adventure
... Processing Romance
... Processing Horror-Thriller
... Processing Comedy
... Processing Science Fiction


In [19]:
# Per-genre accuracy, precision and recall on the held-out split.
get_per_label_metrics(test_targets_arr, results)

Accuruacy for Action-Adventure: 0.7329192546583851
Precision for Action-Adventure: 0.8452380952380952
Recall for Action-Adventure: 0.37967914438502676

Accuruacy for Romance: 0.7929606625258799
Precision for Romance: 0.8
Recall for Romance: 0.07547169811320754

Accuruacy for Horror-Thriller: 0.7060041407867494
Precision for Horror-Thriller: 0.8125
Recall for Horror-Thriller: 0.3385416666666667

Accuruacy for Comedy: 0.7163561076604554
Precision for Comedy: 0.8275862068965517
Recall for Comedy: 0.4507042253521127

Accuruacy for Science Fiction: 0.8840579710144928
Precision for Science Fiction: 0.0
Recall for Science Fiction: 0.0



  'precision', 'predicted', average, warn_for)


In [20]:
# 1 - Hamming loss as a percentage: the share of individual (movie, genre)
# decisions that match the true labels.
print("Percent of correctly decided label decisions: " + str(100* (1-hamming_loss(test_targets_arr, results))))

Percent of correctly decided label decisions: 76.64596273291926


In [21]:
# Mean per-movie overlap between the predicted and the true genre sets
# (intersection over union).
multi_label_accuracy(test_targets_arr, results)

0.32263630089717044

In [22]:
# Mean per-movie fraction of the true genres that were predicted.
multi_label_recall(test_targets_arr, results)

0.33402346445824704

In [23]:
# Mean per-movie fraction of the predicted genres that are correct; movies
# with no predicted genre are left out of the average.
multi_label_precision(test_targets_arr, results)

0.8287401574803149

Next, try Doc2Vec document embeddings instead of the TF-IDF features