In [None]:
import pickle
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import neural_structured_learning as nsl
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from gensim import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from tqdm import tqdm, tqdm_notebook

from word_index import word_index
from word_index_reverse import word_index_reverse

In [None]:
warnings.filterwarnings('ignore')

In [None]:
%%time
# Read the pickled DataFrame stored at ../data/preprocessing.pkl.
PREPROCESSED_PATH = '../data/preprocessing.pkl'
df = pd.read_pickle(PREPROCESSED_PATH)

In [None]:
# Write the current state of `df` to ./doc_vectors.pkl.
# NOTE(review): in a top-to-bottom run the `doc_vectors` column is only assigned
# further down (after the Doc2Vec model is trained), so at this point the frame is
# presumably saved without it — confirm whether this cell was meant to run later.
df.to_pickle("./doc_vectors.pkl")

### Doc2Vec

In [None]:
# Wrap every row's `tokens_rest` in a TaggedDocument whose single tag is the row's ID.
tagged_docs = []
for row_index, record in tqdm(df.iterrows()):
    tagged_docs.append(TaggedDocument(record.tokens_rest, [record.ID]))

In [None]:
# Dimensionality of the document embeddings learned by Doc2Vec.
DOC_VECTOR_SIZE = 128
model = Doc2Vec(vector_size=DOC_VECTOR_SIZE)

In [None]:
%%time
# Let the model build its vocabulary from the tagged documents before training.
model.build_vocab(tagged_docs)

In [None]:
%%time
# Train on the tagged documents for a fixed number of passes over the corpus.
N_EPOCHS = 10
model.train(
    tagged_docs,
    total_examples=model.corpus_count,
    epochs=N_EPOCHS,
)

In [None]:
# NOTE(review): `get_tmpfile` is imported here but not used in the visible notebook.
from gensim.test.utils import get_tmpfile

# Persist the trained Doc2Vec model in the current working directory.
# NOTE(review): a later cell reloads the model from '../model/CAPP_model' —
# confirm that both paths resolve to the same file.
fname = './CAPP_model'
model.save(fname)

In [None]:
# Look up the learned embedding of every document by its ID tag and attach it to
# the frame as a new `doc_vectors` column.
doc_vectors = {ID: model.docvecs[ID] for ID in df.ID}
df['doc_vectors'] = df.ID.map(doc_vectors)

# Persist the frame only now that the `doc_vectors` column exists; saving it any
# earlier in a top-to-bottom run writes a file without the vectors.
df.to_pickle("./doc_vectors.pkl")

In [None]:
def _pairwise_cosine(vectors):
    """Return the square matrix of cosine similarities between the rows of `vectors`."""
    X = np.asarray(vectors, dtype=float)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero rows get similarity 0 instead of a division by zero
    X = X / norms
    return X @ X.T


# For each quartile of "number of decisions per label", plot the distribution of the
# mean intra-label cosine similarity between document vectors.
data = df[df.main_labels.notnull()]
labels_value_counts = data.main_labels.value_counts()
quantiles = [0, 0.25, 0.5, 0.75, 1]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for i, ax in zip(range(len(quantiles) - 1), axes.flat):

    quantile_down = labels_value_counts.quantile(quantiles[i])
    quantile_up = labels_value_counts.quantile(quantiles[i + 1])

    # Bins are (down, up]; the first bin is closed on the left so labels with the
    # minimum count are not silently dropped from every panel.
    if i == 0:
        above_lower = labels_value_counts >= quantile_down
    else:
        above_lower = labels_value_counts > quantile_down
    in_bin = above_lower & (labels_value_counts <= quantile_up)
    D = data[data.main_labels.isin(labels_value_counts[in_bin].index)]

    ax.set_title(f'Number of decisions per labels between {int(quantile_down)} and {int(quantile_up)}',
                 fontweight='bold')

    M_mean = list()
    for label in D.main_labels.unique():
        label_vectors = D.doc_vectors[D.main_labels == label]
        if len(label_vectors) < 2:
            # A single document has no pair to compare with.
            continue
        M = _pairwise_cosine(np.vstack(list(label_vectors)))
        # Strictly-lower triangle: every unordered pair once, without the diagonal
        # of self-similarities (always 1) that would inflate the mean.
        M_values = M[np.tril_indices_from(M, -1)]
        M_mean.append(np.mean(M_values))

    # A density estimate needs at least two points.
    if len(M_mean) > 1:
        sns.distplot(M_mean, hist=False, ax=ax)

    mean_of_means = np.mean(M_mean) if M_mean else float('nan')
    ax.set_xlabel(f'Mean: {mean_of_means:.2f}')
    ax.set_yticks([])

fig.suptitle('Distribution of cosine similarity', fontsize=16, fontweight='bold')
plt.show()

### Import dataset

In [None]:
# Load the train/test feature frames (with a fresh 0..n-1 index) and their labels.
filepath = '../train_test_sets/'
X_train = pd.read_pickle(filepath + 'X_train.pkl').reset_index(drop=True)
X_test = pd.read_pickle(filepath + 'X_test.pkl').reset_index(drop=True)

# Context managers close the label files deterministically instead of leaking the
# handles. NOTE: unpickling executes arbitrary code — only load files you produced.
with open(filepath + "y_train.pkl", "rb") as label_file:
    y_train = pickle.load(label_file)
with open(filepath + "y_test.pkl", "rb") as label_file:
    y_test = pickle.load(label_file)

`num_words=87148`

### Graph

In [None]:
def _int64_feature(value):
    """Wrap an array of integers in a tf.train.Feature holding an Int64List.

    `value` must expose `.tolist()` (e.g. a NumPy array).
    """
    int_list = tf.train.Int64List(value=value.tolist())
    return tf.train.Feature(int64_list=int_list)


def _bytes_feature(value):
    """Wrap a string, encoded as UTF-8, in a tf.train.Feature holding a BytesList."""
    encoded = value.encode('utf-8')
    bytes_list = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=bytes_list)


def _float_feature(value):
    """Wrap an array of floats in a tf.train.Feature holding a FloatList.

    `value` must expose `.tolist()` (e.g. a NumPy array).
    """
    float_list = tf.train.FloatList(value=value.tolist())
    return tf.train.Feature(float_list=float_list)

In [None]:
def create_embedding_example(word_vector, record_id):
    """Build a tf.train.Example with two features: the record's ID and its embedding.

    The ID is stringified and stored under 'id'; `word_vector` is stored as a
    float feature under 'embedding'.
    """
    id_feature = _bytes_feature(str(record_id))
    embedding_feature = _float_feature(word_vector)
    feature_map = tf.train.Features(feature={'id': id_feature,
                                             'embedding': embedding_feature})
    return tf.train.Example(features=feature_map)

def create_embeddings(model, IDs, output_path):
    """Write one serialized embedding example per ID to a TFRecord file.

    For every ID in `IDs`, the vector `model.docvecs[ID]` is turned into a
    tf.train.Example and appended to the file at `output_path`.
    """
    with tf.io.TFRecordWriter(output_path) as writer:
        for doc_id in IDs:
            vector = model.docvecs[doc_id]
            serialized = create_embedding_example(vector, doc_id).SerializeToString()
            writer.write(serialized)

In [None]:
%%time
# Reload the saved Doc2Vec model and dump the embedding of every training document
# to a TFRecord file.
model = Doc2Vec.load('../model/CAPP_model')
create_embeddings(
    model=model,
    IDs=X_train.ID,
    output_path='../outputs/train_embeddings.tfr',
)

In [None]:
%%time
# Build a graph from the training embeddings and write it to a TSV file.
# 0.60 is passed as `similarity_threshold`; presumably only pairs at least that
# similar become edges — see the nsl.tools.build_graph documentation.
nsl.tools.build_graph(['../outputs/train_embeddings.tfr'],
                      '../outputs/train_graph_60.tsv',
                      similarity_threshold=0.60)

In [None]:
def build_nx_graph(filepath):
    """Load a tab-separated edge list and return it as an undirected networkx graph.

    Only every second row and the first two columns (source, target) of the file
    are used.
    NOTE(review): the row skipping presumably relies on the file listing each edge
    in both directions on consecutive rows — confirm against the graph builder.

    Parameters
    ----------
    filepath : str
        Path to the header-less TSV file.

    Returns
    -------
    networkx.Graph
        Graph built from the {source: [targets, ...]} adjacency mapping.
    """
    edges = pd.read_csv(filepath, delimiter='\t', header=None)
    edges = edges.iloc[::2, :2]

    # Single pass over the edges instead of re-filtering the whole frame once per
    # unique source node (which was quadratic in the number of edges).
    adjacency = {}
    sources = edges[0].values.tolist()
    targets = edges[1].values.tolist()
    for source, target in tqdm(zip(sources, targets), total=len(sources)):
        adjacency.setdefault(source, []).append(target)

    return nx.Graph(adjacency)

In [None]:
# Load the graph TSV written by the graph-building step into networkx.
GRAPH_TSV_PATH = '../outputs/train_graph_60.tsv'
G = build_nx_graph(GRAPH_TSV_PATH)

In [None]:
# Report the size of the graph.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print(f'Number of nodes: {n_nodes}\nNumber of edges: {n_edges}')

In [None]:
# Draw the graph with a force-directed layout. The layout starts from random
# positions, so the seed is fixed to make the figure identical on every run.
LAYOUT_SEED = 42
plt.figure(figsize=(12, 10))
pos = nx.spring_layout(G, seed=LAYOUT_SEED)
nx.draw(G, pos=pos, node_size=6)
plt.title('Synthetized graph', loc='left', fontweight='bold')
plt.show()

### Generate training data

In [None]:
def create_example(word_vector, label, record_id):
    """Build a tf.train.Example holding a sample's ID, word vector and label.

    The ID is stringified and stored under 'id', `word_vector` is stored as an
    int64 feature under 'words', and `label` as a one-element int64 feature
    under 'label'.
    """
    words_array = np.asarray(word_vector)
    label_array = np.asarray([label])
    feature_map = tf.train.Features(feature={
        'id': _bytes_feature(str(record_id)),
        'words': _int64_feature(words_array),
        'label': _int64_feature(label_array),
    })
    return tf.train.Example(features=feature_map)

def create_records(data, labels, record_path):
    """Serialize every sample to a TFRecord file.

    `data.token_vectors`, `data.ID` and `labels` are iterated in lockstep; each
    triple becomes one tf.train.Example written to the file at `record_path`.
    """
    with tf.io.TFRecordWriter(record_path) as writer:
        samples = zip(data.token_vectors, data.ID, labels)
        for tokens, sample_id, target in tqdm(samples):
            serialized = create_example(tokens, target, sample_id).SerializeToString()
            writer.write(serialized)

In [None]:
# Serialize the training and test splits to their own TFRecord files.
create_records(data=X_train, labels=y_train, record_path='../outputs/train_data.tfr')
create_records(data=X_test, labels=y_test, record_path='../outputs/test_data.tfr')

In [None]:
# Combine the training records with the graph TSV and write the result to
# nsl_train_data.tfr, keeping at most 3 neighbours per sample (max_nbrs=3).
# NOTE(review): the empty string is the second positional argument — presumably
# the path for unlabeled examples, i.e. none are used; confirm against the
# nsl.tools.pack_nbrs documentation.
nsl.tools.pack_nbrs('../outputs/train_data.tfr',
                    '',
                    '../outputs/train_graph_60.tsv',
                    '../outputs/nsl_train_data.tfr',
                    add_undirected_edges=True,
                    max_nbrs=3)