In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

### 1. Loading Data

In [29]:
df = pd.read_csv('data/df.csv')

In [30]:
df.shape

(9125, 3)

### 2. Preprocessing Data

In [31]:
train_index = int(len(df)*0.9)

In [40]:
df_train = df[:train_index]
df_test = df[train_index:]

In [33]:
def str_to_features(X, type=int):
    return pd.DataFrame(np.array([np.array(list(map(type, item[1:-1].split(", ")))) for item in X.values]))

In [42]:
X_train_pd = df_train['embedding']
X_train_pd = str_to_features(X_train_pd, float)
X_train_pd.columns = [str(col) for col in range(1, 101)]
y_train_pd = str_to_features(df_train['genres'])

X_test_pd = df_test['embedding']
X_test_pd = str_to_features(X_test_pd, float)
X_test_pd.columns = [str(col) for col in range(1, 101)]
y_test_pd = str_to_features(df_test['genres'])

In [43]:
X_train = X_train_pd.to_numpy()
X_test = X_test_pd.to_numpy()
y_train = y_train_pd.to_numpy()
y_test = y_test_pd.to_numpy()

### 3. Model building: NN for Multi-label Classification with Keras

In [15]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(20, activation='sigmoid'))

In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 256)               25856     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                5140      
Total params: 30,996
Trainable params: 30,996
Non-trainable params: 0
_________________________________________________________________


In [17]:
sgd = tf.keras.optimizers.SGD(learning_rate=0.01)
loss='binary_crossentropy'
model.compile(loss=loss, optimizer=sgd, metrics=['accuracy', tf.keras.metrics.AUC(name='my_auc'), tf.keras.metrics.Recall()])

In [18]:
model.fit(X_train, y_train, epochs=100, batch_size=500)

Train on 8212 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

<tensorflow.python.keras.callbacks.History at 0x7f3d51fb4310>

In [19]:
eval_results = model.evaluate(X_test, y_test, batch_size=500)



In [20]:
eval_results

[0.15040157397501047, 0.95005494, 0.95527303, 0.61495423]

In [21]:
model.save('data/model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: graph/model/assets


### 4. Evaluation of the Model

In [26]:
model = tf.keras.models.load_model('data/model')

In [28]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 256)               25856     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                5140      
Total params: 30,996
Trainable params: 30,996
Non-trainable params: 0
_________________________________________________________________


In [47]:
df_test[df_test['source'] == 9133]

Unnamed: 0,source,embedding,genres
9113,9133,"[1.3653257, 0.4937028, -0.54120785, -0.6435545...","[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [23]:
#9113
from py2neo import Graph

In [24]:
graph = Graph("bolt://34.201.68.240:33498", auth=("neo4j", "hickory-approval-forties"))

In [74]:
genre_ids = graph.run("""
MATCH (g:Genre)
RETURN id(g), g.name
""").to_data_frame()

#### Test #1: "Now You See Me 2"

In [67]:
graph.run("""
MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)
WHERE id(m)=9113 RETURN id(m), m.title, collect(g.name) as genres, collect(id(g)) as genres_id, m.embedding
""").to_data_frame()

Unnamed: 0,id(m),m.title,genres,genres_id,m.embedding
0,9113,Now You See Me 2,"[Comedy, Thriller, Action]","[4, 16, 13]","[0.7736787, 0.27281216, 1.1771479, 0.7808156, ..."


In [68]:
movie_now_emb = df_test[df_test['source']==9113]['embedding']
movie_now_prep = str_to_features(movie_now_emb, float)
movie_now_prep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.773679,0.272812,1.177148,0.780816,0.753971,1.493811,3.064326,-0.301018,-1.035773,-1.643256,...,-0.30095,-3.311646,0.521261,-0.958329,-2.385756,-0.527707,-0.580122,-2.701114,3.126047,-1.200813


In [69]:
movie_now_prediction = model.predict(movie_now_prep)[0]

In [95]:
movie_now_prediction

array([0.01416788, 0.5349818 , 0.06474057, 0.03417473, 0.02288392,
       0.5535826 , 0.23779865, 0.05297919, 0.27175906, 0.05225568,
       0.04346392, 0.12970161, 0.04717398, 0.00822304, 0.24785383,
       0.07106901, 0.11238075, 0.6537531 , 0.06051804, 0.01979449],
      dtype=float32)

In [105]:
genre_ids['prob'] = pd.Series(movie_now_prediction)

Unnamed: 0,id(g),g.name,prob
0,1,Adventure,0.014168
1,2,Animation,0.534982
2,3,Children,0.064741
3,4,Comedy,0.034175
4,6,Fantasy,0.022884
5,9,Romance,0.553583
6,10,Drama,0.237799
7,13,Action,0.052979
8,14,Crime,0.271759
9,16,Thriller,0.052256


In [108]:
genre_ids.sort_values('prob', ascending=False)

Unnamed: 0,id(g),g.name,prob
17,161,Western,0.653753
5,9,Romance,0.553583
1,2,Animation,0.534982
8,14,Crime,0.271759
14,51,IMAX,0.247854
6,10,Drama,0.237799
11,33,Mystery,0.129702
16,63,Musical,0.112381
15,56,War,0.071069
2,3,Children,0.064741


In [156]:
def predict_genres(movie_id_node, genres_id):
    movie_emb = df[df['source']==movie_id_node]['embedding']
    movie_prep = str_to_features(movie_emb, float)
    movie_prediction = model.predict(movie_prep)[0]
    genre_ids['prob'] = pd.Series(movie_prediction)
    return genre_ids.sort_values('prob', ascending=False)

In [157]:
def get_movie_info(movie_id_node, graph):
    return graph.run("""
    MATCH (m:Movie)-[:IN_GENRE]->(g:Genre) WHERE id(m)={id} RETURN m.id, m.title, collect(g.name)
    """, {'id': movie_id_node}).to_data_frame()

#### Test #2: "Zero Dark Thirty"

In [158]:
get_movie_info(8237, graph)

Unnamed: 0,m.id,m.title,collect(g.name)
0,,Zero Dark Thirty,"[Action, Thriller, Drama]"


In [159]:
predict_genres(8247, genre_ids)

Unnamed: 0,id(g),g.name,prob
8,14,Crime,0.876467
5,9,Romance,0.38044
1,2,Animation,0.246542
15,56,War,0.140582
18,162,Film-Noir,0.131006
17,161,Western,0.097802
10,23,Horror,0.04752
11,33,Mystery,0.04342
2,3,Children,0.042514
6,10,Drama,0.041361


#### Test #3: "Toy Story"

In [160]:
def get_movie_id(title, graph):
    return graph.run("""
    MATCH (m:Movie) WHERE m.title={title} RETURN id(m) as id
    """, {'title': title}).data()[0]['id']

In [161]:
get_movie_id('Toy Story', graph)

0

In [162]:
get_movie_info(0, graph)

Unnamed: 0,m.id,m.title,collect(g.name)
0,,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"


In [163]:
predict_genres(11, graph)

Unnamed: 0,id(g),g.name,prob
5,9,Romance,0.872234
8,14,Crime,0.317467
2,3,Children,0.291814
15,56,War,0.248416
1,2,Animation,0.197422
14,51,IMAX,0.194053
6,10,Drama,0.122448
3,4,Comedy,0.087416
16,63,Musical,0.087406
9,16,Thriller,0.039765


#### Test #4: Jumanji

In [164]:
get_movie_id('Jumanji', graph)

5

In [165]:
get_movie_info(5, graph)

Unnamed: 0,m.id,m.title,collect(g.name)
0,,Jumanji,"[Adventure, Children, Fantasy]"


In [166]:
predict_genres(5, graph)

Unnamed: 0,id(g),g.name,prob
2,3,Children,0.680673
1,2,Animation,0.455555
5,9,Romance,0.332281
16,63,Musical,0.294771
9,16,Thriller,0.236778
4,6,Fantasy,0.213477
15,56,War,0.166445
17,161,Western,0.138769
8,14,Crime,0.092888
14,51,IMAX,0.06833


### Trying transforming the problem into several binary classification problems

In [180]:
genre_cols = genre_ids['g.name'].values

In [186]:
emb_cols = ['d' + str(i) for i in range(1, 101)]

In [193]:
new_df = df.join(pd.DataFrame(df.genres.str.strip("[]").str.split(", ").tolist(), columns=genre_cols, index=df.index, dtype=int))

In [194]:
new_df = new_df.join(pd.DataFrame(new_df.embedding.str.strip("[]").str.split(", ").tolist(), columns=emb_cols, index=new_df.index, dtype=float))

In [197]:
new_df = new_df.drop(['embedding', 'genres'], axis=1)

In [199]:
new_df

Unnamed: 0,source,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,d91,d92,d93,d94,d95,d96,d97,d98,d99,d100
0,0,0,0,1,1,1,1,0,0,0,...,4.866983,-1.421340,4.838544,-3.507505,-1.868826,-0.369160,1.107630,1.056987,-0.882342,-4.264614
1,5,0,0,1,0,1,0,0,0,0,...,5.732968,-0.516730,4.016620,-0.577164,-0.017154,-0.009479,1.634397,0.620139,-4.063956,-4.812520
2,7,0,0,0,0,0,1,0,0,0,...,4.395160,-0.904002,3.504909,0.375195,-1.743677,-0.212724,-0.710979,0.006432,0.609505,-0.453959
3,8,0,0,0,0,0,1,0,0,1,...,2.223411,2.859890,0.737771,0.251222,-1.521303,-0.713885,-1.579755,-4.259443,-4.009755,1.258942
4,11,0,0,0,0,0,1,0,0,0,...,3.670642,0.248816,5.397691,-1.053222,-0.000371,-1.802294,-1.750267,-3.469961,0.478070,-1.260770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,9140,0,0,1,0,0,0,0,0,1,...,0.674511,-1.599569,-2.585011,-3.509592,-0.676413,0.455164,-0.931492,-0.345227,-0.636330,-2.087504
9121,9141,0,1,1,0,0,0,0,0,0,...,-3.612969,1.423751,1.263315,-0.293734,-1.048563,-1.962560,0.914918,0.146634,0.557713,-2.739617
9122,9142,0,0,0,0,0,0,0,1,0,...,-1.614809,-1.497918,-1.566140,1.744732,-1.395121,-0.351850,1.300761,1.375659,1.729690,-0.228616
9123,9143,0,0,0,0,0,1,0,0,0,...,1.966081,-1.004988,-1.772244,0.721342,-2.838770,2.408803,2.469480,-1.471443,1.049527,-0.171556


In [201]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(new_df, random_state=42, test_size=0.20, shuffle=True)

print(train.shape)
print(test.shape)

(7300, 121)
(1825, 121)


In [202]:
x_train = train[emb_cols]
y_train = train[genre_cols]

x_test = test[emb_cols]
y_test = test[genre_cols]

In [209]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, balanced_accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [238]:
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])
models = {}

for genre in genre_cols:
    print('**Processing {} comments...**'.format(genre))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, y_train[genre])
    models[genre] = LogReg_pipeline
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Accuracy: {}; Balanced accuracy: {}; Precision: {}'.format(accuracy_score(y_test[genre], prediction), 
                                                                      balanced_accuracy_score(y_test[genre], prediction), 
                                                                      precision_score(y_test[genre], prediction, pos_label='1')))
    print("\n")


**Processing Adventure comments...**
Accuracy: 0.9989041095890411; Balanced accuracy: 0.8330589096231247; Precision: 0.6666666666666666


**Processing Animation comments...**
Accuracy: 0.9884931506849315; Balanced accuracy: 0.9850212945504411; Precision: 0.9511400651465798


**Processing Children comments...**
Accuracy: 0.9879452054794521; Balanced accuracy: 0.9790625451393904; Precision: 0.9327354260089686


**Processing Comedy comments...**
Accuracy: 0.9961643835616438; Balanced accuracy: 0.9887451207993354; Precision: 0.9523809523809523


**Processing Fantasy comments...**
Accuracy: 0.9961643835616438; Balanced accuracy: 0.9782277362406044; Precision: 0.9826086956521739


**Processing Romance comments...**
Accuracy: 0.9863013698630136; Balanced accuracy: 0.9849882902783743; Precision: 0.983941605839416


**Processing Drama comments...**
Accuracy: 0.9895890410958904; Balanced accuracy: 0.9837065943436449; Precision: 0.9357798165137615


**Processing Action comments...**
Accuracy: 0.9

In [226]:
get_movie_id('Zero Dark Thirty', graph)

8237

In [228]:
get_movie_info(8237, graph)

Unnamed: 0,m.id,m.title,collect(g.name)
0,,Zero Dark Thirty,"[Action, Thriller, Drama]"


In [235]:
movie_to_predict = new_df[new_df['source']==8237][emb_cols]

In [241]:
genre_prediction = {}
for genre in genre_cols:
    model = models[genre]
    genre_prediction[genre] = model.predict(movie_to_predict)

In [242]:
genre_prediction

{'Adventure': array(['0'], dtype='<U1'),
 'Animation': array(['0'], dtype='<U1'),
 'Children': array(['0'], dtype='<U1'),
 'Comedy': array(['0'], dtype='<U1'),
 'Fantasy': array(['0'], dtype='<U1'),
 'Romance': array(['0'], dtype='<U1'),
 'Drama': array(['0'], dtype='<U1'),
 'Action': array(['0'], dtype='<U1'),
 'Crime': array(['0'], dtype='<U1'),
 'Thriller': array(['0'], dtype='<U1'),
 'Horror': array(['0'], dtype='<U1'),
 'Mystery': array(['0'], dtype='<U1'),
 'Sci-Fi': array(['0'], dtype='<U1'),
 'Documentary': array(['0'], dtype='<U1'),
 'IMAX': array(['0'], dtype='<U1'),
 'War': array(['0'], dtype='<U1'),
 'Musical': array(['0'], dtype='<U1'),
 'Western': array(['0'], dtype='<U1'),
 'Film-Noir': array(['0'], dtype='<U1'),
 '(no genres listed)': array(['0'], dtype='<U1')}