In [1]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

In [2]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)
genre_set = {'Comedy'}

In [3]:
def text_to_list(x):
    if pd.isna(x):
        return ''
    else:
        return ast.literal_eval(x)

def parse_json(x):
    try:
        return json.loads(x.replace("'", '"'))[0]['name']
    except:
        return ''
    
def parse_all_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        for i in range(numElems):
            genre_set.add(json_genres[i]['name'])
    except:
        return ''
    
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #20 0s
        for i in range(numElems):
            ret[genre_dict[(json_genres[i]['name'])]] = 1
        return ret
    except:
        return ''
    

def get_labels_as_strs(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = []#20 0s
        for i in range(numElems):
            ret.append(json_genres[i]['name'])
        return ret
    except:
        return ''

In [4]:
 def getAllGenres():
    full_data = pd.read_csv('train.csv')

    y = full_data['genres']
    y.apply(parse_all_genres_json)

In [5]:
getAllGenres()

In [6]:
len(genre_set)

20

In [7]:
#get set to dictionary for indexing of target vectors
genre_dict = {}
index = 0
for genre in genre_set:
    genre_dict[genre] = index
    index += 1

In [8]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret
    label_strs = y.apply(get_labels_as_strs)
    all_data['genres_labels'] = label_strs
    return ret

In [9]:
genres_vects = getGenresVects()

In [10]:
#put to lower case, remove punctation
def cleanText(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    text = ' '.join(no_stopword_text)
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [11]:
#logistic regression data
lr_data = all_data[['cleanOverview', 'genres_labels', 'genres_vect']]

In [12]:
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

In [13]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [14]:
from tqdm import tqdm

In [15]:
X_train = train.cleanOverview
X_test = test.cleanOverview

In [16]:
train_targets_arr = train['genres_vect'].tolist()
train_targets_arr = np.array(train_targets_arr)

test_targets_arr = test['genres_vect'].tolist()
test_targets_arr = np.array(test_targets_arr)

In [17]:
class MultiLabelLogisitcRegression():
    def __init__(self, genre_dict):
        self.genre_dict = genre_dict
        self.pipelines = {}
        for category in self.genre_dict.keys():
            self.pipelines[category]=Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear', multi_class='ovr'), n_jobs=1)),
            ])
        
    def fit(self, X_train, train_targets_arr):
        for category in self.genre_dict.keys():
            print('Processing {}'.format(category))
            self.pipelines[category].fit(X_train, train_targets_arr[:,genre_dict[category]])

    def predict(self, X_test):
        Ret = np.zeros((X_test.shape[0],len(self.genre_dict.keys())), dtype='int')
        for category in self.genre_dict.keys():
            prediction = self.pipelines[category].predict(X_test)
            Ret[:,self.genre_dict[category]] = prediction
        return Ret
    
    def predict_threshold(self, X_test, threshold):
        Ret = np.zeros((X_test.shape[0],len(self.genre_dict.keys())), dtype='int')
        for category in self.genre_dict.keys():
            prediction = self.pipelines[category].predict_proba(X_test)[:,1]
            prediction[prediction >=threshold] = 1
            prediction[prediction < threshold] = 0
            Ret[:,self.genre_dict[category]] = prediction
        return Ret

In [18]:
#size of intersection of predicted and actual labels divided by size of their union for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    union = real_labels_matrix | predictions_labels_matrix
    #sum(array.T) gets number of 1s in row
    row_wise_accuracy = sum(intersection.T) / sum(union.T)
    return sum(row_wise_accuracy) / real_labels_matrix.shape[0]

#size of intersection of predicted and actual labels divided by size of predicted set for each datapoint tested on
#sum those and divide by number of datapoints
#if no predicted labels, don't count that row towards the precision as that would be undefined
def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    precision_sum = 0
    num_rows = 0
    for row in range(intersection.shape[0]):
        if sum(predictions_labels_matrix[row]) > 0: #if there is at least one prediction for this row
            num_rows += 1
            precision_sum += sum(intersection[row]) / sum(predictions_labels_matrix[row])
    if num_rows == 0:
        return 0#no labels predicted at all will give us 0 precision as precision makes no sense here
    return precision_sum / num_rows

#size of intersection of predicted and actual labels divided by size of real label set for each datapoint tested on
#sum those and divide by number of datapoints
#all datapoints should have at least 1 real label in this data set
#vectorized for speed
def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    #sum(array.T) gets number of 1s in row
    row_wise_recall = sum(intersection.T) / sum(real_labels_matrix.T)
    return sum(row_wise_recall) / real_labels_matrix.shape[0]

#lower is better
def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    return (np.logical_xor(real_labels_matrix, predictions_labels_matrix)).sum()/(real_labels_matrix.shape[0] * real_labels_matrix.shape[1])


In [19]:
def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    for genre in genre_dict.keys():
        index = genre_dict[genre]
        real_labels_vect = real_labels_matrix[:, index]
        prediction_vect = predictions_labels_matrix[:,index]
        print("Accuruacy for " + genre + ": " + str(accuracy_score(real_labels_vect, prediction_vect)))
        print("Precision for " + genre + ": " + str(precision_score(real_labels_vect, prediction_vect)))
        print("Recall for " + genre + ": " + str(recall_score(real_labels_vect, prediction_vect)))
        print()

In [20]:
multi = MultiLabelLogisitcRegression(genre_dict)
multi.fit(X_train, train_targets_arr)
results = multi.predict(X_test)

Processing Romance
Processing Western
Processing Documentary
Processing Adventure
Processing Science Fiction
Processing Horror
Processing Thriller
Processing Comedy
Processing Music
Processing TV Movie
Processing Animation
Processing Action
Processing Mystery
Processing Fantasy
Processing Family
Processing History
Processing Drama
Processing Crime
Processing Foreign
Processing War


In [21]:
get_per_label_metrics(test_targets_arr, results)

Accuruacy for Romance: 0.8257956448911222
Precision for Romance: 0.8888888888888888
Recall for Romance: 0.07207207207207207

Accuruacy for Western: 0.983249581239531
Precision for Western: 0.0
Recall for Western: 0.0

Accuruacy for Documentary: 0.9731993299832495
Precision for Documentary: 0.0
Recall for Documentary: 0.0

Accuruacy for Adventure: 0.8408710217755444
Precision for Adventure: 0.0
Recall for Adventure: 0.0

Accuruacy for Science Fiction: 0.8961474036850922
Precision for Science Fiction: 0.0
Recall for Science Fiction: 0.0

Accuruacy for Horror: 0.9061976549413735
Precision for Horror: 0.0
Recall for Horror: 0.0

Accuruacy for Thriller: 0.7286432160804021
Precision for Thriller: 0.8888888888888888
Recall for Thriller: 0.047337278106508875

Accuruacy for Comedy: 0.6834170854271356
Precision for Comedy: 0.813953488372093
Recall for Comedy: 0.16203703703703703

Accuruacy for Music: 0.9748743718592965
Precision for Music: 0.0
Recall for Music: 0.0

Accuruacy for TV Movie: 1.0
P

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [22]:
print("Percent of correctly decided label decisions: " + str(100* (1-hamming_loss(test_targets_arr, results))))

Percent of correctly decided label decisions: 88.60134003350085


In [23]:
multi_label_accuracy(test_targets_arr, results)

0.2167185132009251

In [24]:
multi_label_recall(test_targets_arr, results)

0.22224615139188

In [25]:
multi_label_precision(test_targets_arr, results)

0.717564870259481