In [23]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

In [24]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)
genre_set = {'Comedy'}

In [25]:
def text_to_list(x):
    if pd.isna(x):
        return ''
    else:
        return ast.literal_eval(x)

def parse_json(x):
    try:
        return json.loads(x.replace("'", '"'))[0]['name']
    except:
        return ''
    
def parse_all_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        for i in range(numElems):
            genre_set.add(json_genres[i]['name'])
    except:
        return ''
    
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #20 0s
        for i in range(numElems):
            ret[genre_dict[(json_genres[i]['name'])]] = 1
        return ret
    except:
        return ''
    

def get_labels_as_strs(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = []#20 0s
        for i in range(numElems):
            ret.append(json_genres[i]['name'])
        return ret
    except:
        return ''

In [26]:
 def getAllGenres():
    full_data = pd.read_csv('train.csv')

    y = full_data['genres']
    y.apply(parse_all_genres_json)

In [27]:
getAllGenres()

In [28]:
len(genre_set)

20

In [29]:
genre_set

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

In [30]:
#get set to dictionary for indexing of target vectors
genre_dict = {}
index = 0
for genre in genre_set:
    genre_dict[genre] = index
    index += 1

In [31]:
genre_dict

{'Romance': 0,
 'Thriller': 1,
 'Western': 2,
 'TV Movie': 3,
 'Crime': 4,
 'Music': 5,
 'War': 6,
 'Animation': 7,
 'Adventure': 8,
 'Action': 9,
 'Horror': 10,
 'Documentary': 11,
 'Science Fiction': 12,
 'Foreign': 13,
 'Fantasy': 14,
 'History': 15,
 'Family': 16,
 'Comedy': 17,
 'Drama': 18,
 'Mystery': 19}

In [32]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret
    label_strs = y.apply(get_labels_as_strs)
    all_data['genres_labels'] = label_strs
    return ret

In [33]:
getGenresVects()

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
5       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
6       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
7       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
8       [0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...
9       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
10      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
11      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
12      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
13      [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
14      [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...
15      [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
16      [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
17      [1, 0,

In [34]:
all_data['overview']

0       When Lou, who has become the "father of the In...
1       Mia Thermopolis is now a college graduate and ...
2       Under the direction of a ruthless instructor, ...
3       Vidya Bagchi (Vidya Balan) arrives in Kolkata ...
4       Marine Boy is the story of a former national s...
5       Pinocchio and his friends, a glow worm and a m...
6       A young girl buys an antique box at a yard sal...
7       A chronicle which provides a rare window into ...
8       After telling the story of Flint's last journe...
9       In "A Mighty Wind", director Christopher Guest...
10      When world heavyweight boxing champion, Apollo...
11      The members of the Lambda Lambda Lambda frater...
12      Lester Burnham, a depressed suburban father in...
13      Disenchanted with the movie industry, Chili Pa...
14      John Anderton is a top 'Precrime' cop in the l...
15      Novica is a mathematics champion in a Belgrade...
16      After attending the funeral of her grandmother...
17      In 185

Todo: apapend genresVect to pandas dataframe (not really necessary)

In [35]:
#put to lower case, remove punctation
def cleanText(text):
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [36]:
#logistic regression data
lr_data = all_data[['cleanOverview', 'genres_labels', 'genres_vect']]

In [37]:
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

In [38]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [39]:
from tqdm import tqdm

In [40]:
X_train = train.cleanOverview
X_test = test.cleanOverview

In [41]:
train_targets_arr = train['genres_vect'].tolist()
train_targets_arr = np.array(train_targets_arr)

test_targets_arr = test['genres_vect'].tolist()
test_targets_arr = np.array(test_targets_arr)

In [42]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear', multi_class='ovr'), n_jobs=1)),
            ])

for category in genre_dict.keys():
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train_targets_arr[:,genre_dict[category]])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    print(prediction.sum())
    print(test_targets_arr[:,genre_dict[category]].sum())
    print('Test accuracy is {}'.format(accuracy_score(test_targets_arr[:,genre_dict[category]], prediction)))
    print('Test precision is {}'.format(precision_score(test_targets_arr[:,genre_dict[category]], prediction)))
    print('Test recall is {}'.format(recall_score(test_targets_arr[:,genre_dict[category]], prediction)))

... Processing Romance
9
111
Test accuracy is 0.8257956448911222
Test precision is 0.8888888888888888
Test recall is 0.07207207207207207
... Processing Thriller
10
169
Test accuracy is 0.7303182579564489
Test precision is 0.9
Test recall is 0.05325443786982249
... Processing Western
0
10
Test accuracy is 0.983249581239531
Test precision is 0.0
Test recall is 0.0
... Processing TV Movie


  'precision', 'predicted', average, warn_for)


0
0
Test accuracy is 1.0
Test precision is 0.0
Test recall is 0.0
... Processing Crime
0
95
Test accuracy is 0.8408710217755444
Test precision is 0.0
Test recall is 0.0
... Processing Music


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)


0
15
Test accuracy is 0.9748743718592965
Test precision is 0.0
Test recall is 0.0
... Processing War
0
21
Test accuracy is 0.964824120603015
Test precision is 0.0
Test recall is 0.0
... Processing Animation


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0
27
Test accuracy is 0.9547738693467337
Test precision is 0.0
Test recall is 0.0
... Processing Adventure


  'precision', 'predicted', average, warn_for)


0
95
Test accuracy is 0.8408710217755444
Test precision is 0.0
Test recall is 0.0
... Processing Action


  'precision', 'predicted', average, warn_for)


13
157
Test accuracy is 0.7554438860971524
Test precision is 0.9230769230769231
Test recall is 0.07643312101910828
... Processing Horror
0
56
Test accuracy is 0.9061976549413735
Test precision is 0.0
Test recall is 0.0
... Processing Documentary


  'precision', 'predicted', average, warn_for)


0
16
Test accuracy is 0.9731993299832495
Test precision is 0.0
Test recall is 0.0
... Processing Science Fiction


  'precision', 'predicted', average, warn_for)


0
62
Test accuracy is 0.8961474036850922
Test precision is 0.0
Test recall is 0.0
... Processing Foreign


  'precision', 'predicted', average, warn_for)


0
7
Test accuracy is 0.9882747068676717
Test precision is 0.0
Test recall is 0.0
... Processing Fantasy
0
46
Test accuracy is 0.9229480737018425
Test precision is 0.0
Test recall is 0.0
... Processing History


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0
23
Test accuracy is 0.9614740368509213
Test precision is 0.0
Test recall is 0.0
... Processing Family
0
46
Test accuracy is 0.9229480737018425
Test precision is 0.0
Test recall is 0.0
... Processing Comedy


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


43
216
Test accuracy is 0.6834170854271356
Test precision is 0.813953488372093
Test recall is 0.16203703703703703
... Processing Drama
301
319
Test accuracy is 0.661641541038526
Test precision is 0.6943521594684385
Test recall is 0.6551724137931034
... Processing Mystery
0
40
Test accuracy is 0.932998324958124
Test precision is 0.0
Test recall is 0.0


  'precision', 'predicted', average, warn_for)


Trying using doc2Vec instead