In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
from py2neo import Graph
from sklearn.model_selection import train_test_split

### 1. Loading Data

In [5]:
df = pd.read_csv('../data/data.csv')

In [6]:
df.shape

(9125, 3)

### 2. Auxiliary Graph Functions

In [7]:
PATH_CONNECTION = "../env/neo4j_connection.json"
connection_details = {}
with open(PATH_CONNECTION) as connection_file:
    connection_file_read = json.load(connection_file)
    connection_details['bolt_url'] = connection_file_read['bolt_url']
    connection_details['password'] = connection_file_read['password']

In [8]:
graph = Graph(connection_details['bolt_url'], auth=("neo4j", connection_details['password']))

In [10]:
def get_movie_info(movie_id_node, graph):
    return graph.run("""
    MATCH (m:Movie)-[:IN_GENRE]->(g:Genre) WHERE id(m)={id} RETURN m.id, m.title, collect(g.name)
    """, {'id': movie_id_node}).to_data_frame()

In [11]:
class NodeNotFound(Exception):
    pass

def get_movie_id(title, graph):
    result = graph.run("""
    MATCH (m:Movie) WHERE m.title={title} RETURN id(m) as id
    """, {'title': title}).data()
    
    if len(result) == 0:
        raise NodeNotFound("Movie with title {} not found in graph [{}].".format(title, graph))
    else:
        return result[0]['id']


### 3. Data Preparation

In [12]:
genre_cols = graph.run("""
MATCH (genre:Genre)
RETURN genre.name ORDER BY genre.name
""").to_data_frame()['genre.name'].values

In [13]:
emb_cols = ['d' + str(i) for i in range(1, 101)]
new_df = df.join(pd.DataFrame(df.genres.str.strip("[]").str.split(", ").tolist(), columns=genre_cols, index=df.index, dtype=int))
new_df = new_df.join(pd.DataFrame(new_df.embedding.str.strip("[]").str.split(", ").tolist(), columns=emb_cols, index=new_df.index, dtype=float))
new_df[genre_cols] = new_df[genre_cols].apply(pd.to_numeric, errors='raise')
new_df = new_df.drop(['embedding', 'genres'], axis=1)
data = new_df
del new_df

In [14]:
data

Unnamed: 0,source,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,d91,d92,d93,d94,d95,d96,d97,d98,d99,d100
0,0,0,0,1,1,1,1,0,0,0,...,4.866983,-1.421340,4.838544,-3.507505,-1.868826,-0.369160,1.107630,1.056987,-0.882342,-4.264614
1,5,0,0,1,0,1,0,0,0,0,...,5.732968,-0.516730,4.016620,-0.577164,-0.017154,-0.009479,1.634397,0.620139,-4.063956,-4.812520
2,7,0,0,0,0,0,1,0,0,0,...,4.395160,-0.904002,3.504909,0.375195,-1.743677,-0.212724,-0.710979,0.006432,0.609505,-0.453959
3,8,0,0,0,0,0,1,0,0,1,...,2.223411,2.859890,0.737771,0.251222,-1.521303,-0.713885,-1.579755,-4.259443,-4.009755,1.258942
4,11,0,0,0,0,0,1,0,0,0,...,3.670642,0.248816,5.397691,-1.053222,-0.000371,-1.802294,-1.750267,-3.469961,0.478070,-1.260770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,9140,0,0,1,0,0,0,0,0,1,...,0.674511,-1.599569,-2.585011,-3.509592,-0.676413,0.455164,-0.931492,-0.345227,-0.636330,-2.087504
9121,9141,0,1,1,0,0,0,0,0,0,...,-3.612969,1.423751,1.263315,-0.293734,-1.048563,-1.962560,0.914918,0.146634,0.557713,-2.739617
9122,9142,0,0,0,0,0,0,0,1,0,...,-1.614809,-1.497918,-1.566140,1.744732,-1.395121,-0.351850,1.300761,1.375659,1.729690,-0.228616
9123,9143,0,0,0,0,0,1,0,0,0,...,1.966081,-1.004988,-1.772244,0.721342,-2.838770,2.408803,2.469480,-1.471443,1.049527,-0.171556


In [34]:
df_features = data[emb_cols]
df_target = data[genre_cols]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, test_size=0.15, random_state=42, shuffle=True)

In [36]:
print("Shape of X_train: {}; shape of y_train: {}".format(X_train.shape,y_train.shape))
print("Shape of X_test: {}; shape of y_test: {}".format(X_test.shape,y_test.shape))

Shape of X_train: (7756, 100); shape of y_train: (7756, 20)
Shape of X_test: (1369, 100); shape of y_test: (1369, 20)


### 4. Model Building

#### 4.1 One vs Rest

In [42]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, label_ranking_average_precision_score
from sklearn.multiclass import OneVsRestClassifier

In [108]:
model_one_vs_rest = OneVsRestClassifier(SVC()).fit(X_train, y_train)

In [110]:
y_pred = model_one_vs_rest.predict(X_test)
print('Accuracy of the model One vs. Rest: {}'.format(accuracy_score(y_test, y_pred)))

Accuracy of the model One vs. Rest: 0.8897005113221329


Accuracy is a very harsh metric since it ignores the partially correct predictions

In [113]:
label_ranking_average_precision_score(y_pred, y_test) # greater than 0, best is 1

0.9600168121789739

#### 4.2 Ensemble of Single-Label Binary Classifiers

In [49]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [124]:
model_binary_relevance = BinaryRelevance(classifier=SVC())

In [125]:
model_binary_relevance.fit(X_train, y_train)

BinaryRelevance(classifier=SVC(C=1.0, break_ties=False, cache_size=200,
                               class_weight=None, coef0=0.0,
                               decision_function_shape='ovr', degree=3,
                               gamma='scale', kernel='rbf', max_iter=-1,
                               probability=False, random_state=None,
                               shrinking=True, tol=0.001, verbose=False),
                require_dense=[True, True])

In [126]:
y_pred = model_binary_relevance.predict(X_test)
print('Accuracy of the model Binary Relevance: {}'.format(accuracy_score(y_test, y_pred)))

Accuracy of the model Binary Relevance: 0.8897005113221329


In [127]:
label_ranking_average_precision_score(y_pred.toarray(), y_test) # greater than 0, best is 1

0.9600168121789739

#### 4.2 Classifier Chains

In [131]:
model_classifier_chainltilearn.problem_transform import ClassifierChain

In [133]:
model_classifier_chain = ClassifierChain(SVC())
model_classifier_chain.fit(X_train, y_train)

ClassifierChain(classifier=SVC(C=1.0, break_ties=False, cache_size=200,
                               class_weight=None, coef0=0.0,
                               decision_function_shape='ovr', degree=3,
                               gamma='scale', kernel='rbf', max_iter=-1,
                               probability=False, random_state=None,
                               shrinking=True, tol=0.001, verbose=False),
                order=None, require_dense=[True, True])

In [135]:
y_pred = model_classifier_chain.predict(X_test)
print('Accuracy of the model Binary Relevance: {}'.format(accuracy_score(y_test, y_pred)))

Accuracy of the model Binary Relevance: 0.8882395909422937


### 5. Model Building Using SKMultilearn

In [1]:
PATH_EDGE_LIST = "../data/movies.edgelist"

#### 5.1 Binary Relevance multi-label classifier based on k-Nearest Neighbors method:

In [30]:
import skmultilearn.adapt as skm_a
from sklearn.model_selection import GridSearchCV

In [38]:
parameters = {'k': range(1,3)}
score = 'f1_macro'

clf = GridSearchCV(skm_a.BRkNNaClassifier(), parameters, scoring=score)
clf.fit(X_train.values, y_train.values)

GridSearchCV(cv=None, error_score=nan, estimator=BRkNNaClassifier(k=10),
             iid='deprecated', n_jobs=None, param_grid={'k': range(1, 3)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=0)

In [40]:
clf.best_estimator_

BRkNNaClassifier(k=1)

In [43]:
y_pred = clf.predict(X_test)
print('Accuracy of the model Binary Relevance: {}'.format(accuracy_score(y_test, y_pred)))

Accuracy of the model Binary Relevance: 0.32213294375456536


In [44]:
label_ranking_average_precision_score(y_pred.toarray(), y_test) # greater than 0, best is 1

0.6099570709705813

#### 5.2 Label Space Partitioning Classifier + Label Powerset

In [45]:
from skmultilearn.cluster import MatrixLabelSpaceClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.cluster import KMeans

In [46]:
matrix_clusterer = MatrixLabelSpaceClusterer(clusterer=KMeans(n_clusters=2))

In [47]:
matrix_clusterer.fit_predict(X_train, y_train)

array([list([0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
       list([8])], dtype=object)

In [51]:
classifier = LabelSpacePartitioningClassifier(
    classifier = LabelPowerset(classifier=GaussianNB()),
    clusterer = matrix_clusterer
)

In [52]:
classifier.fit(X_train, y_train)

LabelSpacePartitioningClassifier(classifier=LabelPowerset(classifier=GaussianNB(priors=None,
                                                                                var_smoothing=1e-09),
                                                          require_dense=[True,
                                                                         True]),
                                 clusterer=MatrixLabelSpaceClusterer(clusterer=KMeans(algorithm='auto',
                                                                                      copy_x=True,
                                                                                      init='k-means++',
                                                                                      max_iter=300,
                                                                                      n_clusters=2,
                                                                                      n_init=10,
                                         

In [53]:
y_pred = classifier.predict(X_test)

In [54]:
print('Accuracy of the model LabelPowerset: {}'.format(accuracy_score(y_test, y_pred)))

Accuracy of the model LabelPowerset: 0.5149744338933528


In [55]:
label_ranking_average_precision_score(y_pred.toarray(), y_test) # greater than 0, best is 1

0.7752770531148884

#### 5.3 Estimating hyper-parameter k for embedded classifiers

In [None]:
from skmultilearn.problem_transform import ClassifierChain, LabelPowerset
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.cluster import NetworkXLabelGraphClusterer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

parameters = {
    'classifier': [LabelPowerset(), ClassifierChain()],
    'classifier__classifier': [RandomForestClassifier()],
    'classifier__classifier__n_estimators': [10, 20, 50],
    'clusterer' : [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}

clf = GridSearchCV(LabelSpacePartitioningClassifier(), parameters, scoring = 'f1_macro')
clf.fit(X_train, y_train)

print (clf.best_params_, clf.best_score_)

#### TODO 5.4 Detecting communities in Label Relations Graph (using skmultilearn and openNE)