In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from py2neo import Graph

### 1. Loading Data

In [5]:
df = pd.read_csv('../data/df.csv')

In [4]:
df.shape

(9125, 3)

## 2. Auxiliary Graph Functions

In [11]:
graph = Graph("bolt://34.201.68.240:33577", auth=("neo4j", "plexiglass-berth-acquisitions"))

In [12]:
def predict_genres(movie_id_node, genres_id):
    movie_emb = df[df['source']==movie_id_node]['embedding']
    movie_prep = str_to_features(movie_emb, float)
    movie_prediction = model.predict(movie_prep)[0]
    genre_ids['prob'] = pd.Series(movie_prediction)
    return genre_ids.sort_values('prob', ascending=False)

In [10]:
def get_movie_info(movie_id_node, graph):
    return graph.run("""
    MATCH (m:Movie)-[:IN_GENRE]->(g:Genre) WHERE id(m)={id} RETURN m.id, m.title, collect(g.name)
    """, {'id': movie_id_node}).to_data_frame()

In [160]:
def get_movie_id(title, graph):
    return graph.run("""
    MATCH (m:Movie) WHERE m.title={title} RETURN id(m) as id
    """, {'title': title}).data()[0]['id']

### Trying transforming the problem into several binary classification problems

In [180]:
genre_cols = genre_ids['g.name'].values

In [186]:
emb_cols = ['d' + str(i) for i in range(1, 101)]

In [193]:
new_df = df.join(pd.DataFrame(df.genres.str.strip("[]").str.split(", ").tolist(), columns=genre_cols, index=df.index, dtype=int))

In [194]:
new_df = new_df.join(pd.DataFrame(new_df.embedding.str.strip("[]").str.split(", ").tolist(), columns=emb_cols, index=new_df.index, dtype=float))

In [197]:
new_df = new_df.drop(['embedding', 'genres'], axis=1)

In [199]:
new_df

Unnamed: 0,source,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,d91,d92,d93,d94,d95,d96,d97,d98,d99,d100
0,0,0,0,1,1,1,1,0,0,0,...,4.866983,-1.421340,4.838544,-3.507505,-1.868826,-0.369160,1.107630,1.056987,-0.882342,-4.264614
1,5,0,0,1,0,1,0,0,0,0,...,5.732968,-0.516730,4.016620,-0.577164,-0.017154,-0.009479,1.634397,0.620139,-4.063956,-4.812520
2,7,0,0,0,0,0,1,0,0,0,...,4.395160,-0.904002,3.504909,0.375195,-1.743677,-0.212724,-0.710979,0.006432,0.609505,-0.453959
3,8,0,0,0,0,0,1,0,0,1,...,2.223411,2.859890,0.737771,0.251222,-1.521303,-0.713885,-1.579755,-4.259443,-4.009755,1.258942
4,11,0,0,0,0,0,1,0,0,0,...,3.670642,0.248816,5.397691,-1.053222,-0.000371,-1.802294,-1.750267,-3.469961,0.478070,-1.260770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,9140,0,0,1,0,0,0,0,0,1,...,0.674511,-1.599569,-2.585011,-3.509592,-0.676413,0.455164,-0.931492,-0.345227,-0.636330,-2.087504
9121,9141,0,1,1,0,0,0,0,0,0,...,-3.612969,1.423751,1.263315,-0.293734,-1.048563,-1.962560,0.914918,0.146634,0.557713,-2.739617
9122,9142,0,0,0,0,0,0,0,1,0,...,-1.614809,-1.497918,-1.566140,1.744732,-1.395121,-0.351850,1.300761,1.375659,1.729690,-0.228616
9123,9143,0,0,0,0,0,1,0,0,0,...,1.966081,-1.004988,-1.772244,0.721342,-2.838770,2.408803,2.469480,-1.471443,1.049527,-0.171556


In [201]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(new_df, random_state=42, test_size=0.20, shuffle=True)

print(train.shape)
print(test.shape)

(7300, 121)
(1825, 121)


In [202]:
x_train = train[emb_cols]
y_train = train[genre_cols]

x_test = test[emb_cols]
y_test = test[genre_cols]

In [209]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, balanced_accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [238]:
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])
models = {}

for genre in genre_cols:
    print('**Processing {} comments...**'.format(genre))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, y_train[genre])
    models[genre] = LogReg_pipeline
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Accuracy: {}; Balanced accuracy: {}; Precision: {}'.format(accuracy_score(y_test[genre], prediction), 
                                                                      balanced_accuracy_score(y_test[genre], prediction), 
                                                                      precision_score(y_test[genre], prediction, pos_label='1')))
    print("\n")


**Processing Adventure comments...**
Accuracy: 0.9989041095890411; Balanced accuracy: 0.8330589096231247; Precision: 0.6666666666666666


**Processing Animation comments...**
Accuracy: 0.9884931506849315; Balanced accuracy: 0.9850212945504411; Precision: 0.9511400651465798


**Processing Children comments...**
Accuracy: 0.9879452054794521; Balanced accuracy: 0.9790625451393904; Precision: 0.9327354260089686


**Processing Comedy comments...**
Accuracy: 0.9961643835616438; Balanced accuracy: 0.9887451207993354; Precision: 0.9523809523809523


**Processing Fantasy comments...**
Accuracy: 0.9961643835616438; Balanced accuracy: 0.9782277362406044; Precision: 0.9826086956521739


**Processing Romance comments...**
Accuracy: 0.9863013698630136; Balanced accuracy: 0.9849882902783743; Precision: 0.983941605839416


**Processing Drama comments...**
Accuracy: 0.9895890410958904; Balanced accuracy: 0.9837065943436449; Precision: 0.9357798165137615


**Processing Action comments...**
Accuracy: 0.9

In [226]:
get_movie_id('Zero Dark Thirty', graph)

8237

In [228]:
get_movie_info(8237, graph)

Unnamed: 0,m.id,m.title,collect(g.name)
0,,Zero Dark Thirty,"[Action, Thriller, Drama]"


In [235]:
movie_to_predict = new_df[new_df['source']==8237][emb_cols]

In [241]:
genre_prediction = {}
for genre in genre_cols:
    model = models[genre]
    genre_prediction[genre] = model.predict(movie_to_predict)

In [242]:
genre_prediction

{'Adventure': array(['0'], dtype='<U1'),
 'Animation': array(['0'], dtype='<U1'),
 'Children': array(['0'], dtype='<U1'),
 'Comedy': array(['0'], dtype='<U1'),
 'Fantasy': array(['0'], dtype='<U1'),
 'Romance': array(['0'], dtype='<U1'),
 'Drama': array(['0'], dtype='<U1'),
 'Action': array(['0'], dtype='<U1'),
 'Crime': array(['0'], dtype='<U1'),
 'Thriller': array(['0'], dtype='<U1'),
 'Horror': array(['0'], dtype='<U1'),
 'Mystery': array(['0'], dtype='<U1'),
 'Sci-Fi': array(['0'], dtype='<U1'),
 'Documentary': array(['0'], dtype='<U1'),
 'IMAX': array(['0'], dtype='<U1'),
 'War': array(['0'], dtype='<U1'),
 'Musical': array(['0'], dtype='<U1'),
 'Western': array(['0'], dtype='<U1'),
 'Film-Noir': array(['0'], dtype='<U1'),
 '(no genres listed)': array(['0'], dtype='<U1')}