In [17]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [18]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)
genre_set = {'Comedy'}

In [19]:
def text_to_list(x):
    if pd.isna(x):
        return ''
    else:
        return ast.literal_eval(x)

def parse_json(x):
    try:
        return json.loads(x.replace("'", '"'))[0]['name']
    except:
        return ''
    
def parse_all_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        for i in range(numElems):
            genre_set.add(json_genres[i]['name'])
    except:
        return ''
    
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #20 0s
        for i in range(numElems):
            ret[genre_dict[(json_genres[i]['name'])]] = 1
        return ret
    except:
        return ''
    

def get_labels_as_strs(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = []#20 0s
        for i in range(numElems):
            ret.append(json_genres[i]['name'])
        return ret
    except:
        return ''

In [20]:
 def getAllGenres():
    full_data = pd.read_csv('train.csv')

    y = full_data['genres']
    y.apply(parse_all_genres_json)

In [21]:
getAllGenres()

In [22]:
len(genre_set)

20

In [23]:
genre_set

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

In [24]:
#get set to dictionary for indexing of target vectors
genre_dict = {}
index = 0
for genre in genre_set:
    genre_dict[genre] = index
    index += 1

In [25]:
genre_dict

{'Animation': 0,
 'Documentary': 1,
 'Music': 2,
 'History': 3,
 'Adventure': 4,
 'Crime': 5,
 'Foreign': 6,
 'Science Fiction': 7,
 'Western': 8,
 'Fantasy': 9,
 'Drama': 10,
 'Horror': 11,
 'Romance': 12,
 'Thriller': 13,
 'War': 14,
 'Comedy': 15,
 'Family': 16,
 'TV Movie': 17,
 'Action': 18,
 'Mystery': 19}

In [26]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret
    label_strs = y.apply(get_labels_as_strs)
    all_data['genres_labels'] = label_strs
    return ret

In [27]:
getGenresVects()

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
5       [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...
7       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
10      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
11      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
12      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
13      [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
14      [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...
15      [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
16      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...
17      [0, 0,

In [28]:
all_data['overview']

0       When Lou, who has become the "father of the In...
1       Mia Thermopolis is now a college graduate and ...
2       Under the direction of a ruthless instructor, ...
3       Vidya Bagchi (Vidya Balan) arrives in Kolkata ...
4       Marine Boy is the story of a former national s...
5       Pinocchio and his friends, a glow worm and a m...
6       A young girl buys an antique box at a yard sal...
7       A chronicle which provides a rare window into ...
8       After telling the story of Flint's last journe...
9       In "A Mighty Wind", director Christopher Guest...
10      When world heavyweight boxing champion, Apollo...
11      The members of the Lambda Lambda Lambda frater...
12      Lester Burnham, a depressed suburban father in...
13      Disenchanted with the movie industry, Chili Pa...
14      John Anderton is a top 'Precrime' cop in the l...
15      Novica is a mathematics champion in a Belgrade...
16      After attending the funeral of her grandmother...
17      In 185

Todo: apapend genresVect to pandas dataframe (not really necessary)

In [29]:
#put to lower case, remove punctation
def cleanText(text):
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [30]:
all_data['cleanOverview']

0       when lou who has become the father of the inte...
1       mia thermopolis is now a college graduate and ...
2       under the direction of a ruthless instructor a...
3       vidya bagchi vidya balan arrives in kolkata fr...
4       marine boy is the story of a former national s...
5       pinocchio and his friends a glow worm and a ma...
6       a young girl buys an antique box at a yard sal...
7       a chronicle which provides a rare window into ...
8       after telling the story of flints last journey...
9       in a mighty wind director christopher guest re...
10      when world heavyweight boxing champion apollo ...
11      the members of the lambda lambda lambda frater...
12      lester burnham a depressed suburban father in ...
13      disenchanted with the movie industry chili pal...
14      john anderton is a top precrime cop in the lat...
15      novica is a mathematics champion in a belgrade...
16      after attending the funeral of her grandmother...
17      in 185

In [31]:
all_data['genres_vect'][0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

In [32]:
#logistic regression data
lr_data = all_data[['cleanOverview', 'genres_vect', 'overview', 'genres_labels']]
lr_data['g'] = lr_data.apply(lambda row: row['genres_vect'][0], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['overview']), tags=r.genres_labels), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['overview']), tags=r.genres_labels), axis=1)

In [34]:
train_tagged.values[0]

TaggedDocument(words=['dragon', 'tiger', 'gate', 'is', '2006', 'hong', 'kong', 'martial', 'arts-action', 'film', 'directed', 'by', 'wilson', 'yip', 'and', 'featuring', 'fight', 'choreography', 'by', 'donnie', 'yen', 'who', 'also', 'stars', 'in', 'the', 'film', 'the', 'film', 'is', 'based', 'on', 'the', 'popular', 'hong', 'kong', 'manhua', 'oriental', 'heroes', 'which', 'bears', 'the', 'same', 'chinese', 'name', 'as', 'the', 'movie'], tags=['Action', 'Thriller'])

In [35]:
import multiprocessing

cores = multiprocessing.cpu_count()

In [36]:
from tqdm import tqdm

In [37]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 2388/2388 [00:00<00:00, 1912910.23it/s]


In [38]:
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 2388/2388 [00:00<00:00, 2039918.12it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2480435.35it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2588782.10it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2441735.24it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2529292.41it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2547303.65it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2583440.28it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2434021.37it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2507761.13it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2500249.11it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2536337.79it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2413493.48it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2437575.55it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2343471.68it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2456105.43it/s]
100%|██████████| 2388/2388 [00:00<00:00, 2477980.69it/s]
100%|██████████| 2388/2388 [00:00<00:00, 1892310.21it/s]
100%|██████████| 2388/2388 [00:

In [39]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [40]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

logreg = LogisticRegression(n_jobs=1, C=1e5, multi_class = 'ovr')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn.metrics import accuracy_score, f1_score

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.20435510887772193
Testing F1 score: 0.18971889179009932


  'precision', 'predicted', average, warn_for)


In [41]:
y_pred

array(['Animation', 'Comedy', 'Drama', 'Documentary', 'Family', 'Drama',
       'Comedy', 'Action', 'Comedy', 'Action', 'Action', 'Drama', 'Drama',
       'Drama', 'Crime', 'Comedy', 'Documentary', 'Comedy', 'Drama',
       'Drama', 'Crime', 'Drama', 'Crime', 'Comedy', 'Drama', 'Horror',
       'Action', 'Horror', 'Drama', 'Romance', 'Action', 'Action',
       'Action', 'Action', 'Documentary', 'Crime', 'Drama', 'Adventure',
       'Comedy', 'Thriller', 'Drama', 'Drama', 'Comedy', 'Action',
       'Action', 'Crime', 'Drama', 'Action', 'Drama', 'Action',
       'Thriller', 'Drama', 'Documentary', 'Comedy', 'Drama', 'Action',
       'Drama', 'Action', 'Action', 'Crime', 'Action', 'Comedy', 'Action',
       'Horror', 'Comedy', 'Drama', 'Action', 'Comedy', 'Comedy', 'Drama',
       'Comedy', 'Documentary', 'Drama', 'Drama', 'Comedy', 'Drama',
       'Comedy', 'Drama', 'Comedy', 'Action', 'Drama', 'Comedy', 'Crime',
       'Comedy', 'Drama', 'Comedy', 'Drama', 'Comedy', 'Comedy', 'Comedy',


In [42]:
test['genres_labels'].values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [43]:
true_positive = 0
for i in range(len(test['g'].values)):
    if test['g'].values[i] == 1 and y_pred[i] == 1:
        true_positive += 1
true_positive

0