# INF554 Kaggle Challenge

By Ombeline Lagé & Haris Sahovic

# Preprocessing

## Package importations & data location

Some of these elements are not used anymore, as they were used for code that has been removed or modified.

In [1]:
import csv
import heapq  # Used for fast top-k element retrieval
import nltk
import gensim
import numpy as np
import os
import pandas
import re
import scipy
import time

from IPython.display import SVG, display  # User for Keras model-graph display
from operator import add

from keras.backend import set_value
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import concatenate as concatenate_layers
from keras.layers import AlphaDropout, Conv1D, Conv2D, Dense, Dropout, Embedding, Flatten, GaussianNoise, GlobalAveragePooling1D, GlobalMaxPool1D, GRU, Input, LSTM, MaxPooling1D
from keras.models import load_model, Model, Sequential
from keras.optimizers import Adadelta, Adagrad, Nadam, RMSprop, SGD
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from typing import List

KeyboardInterrupt: 

## Data location

If running on your computer, please place the following files in a folder named `data`:

- `testing_set.txt`
- `training_set.txt`
- `node_information.csv`

In [None]:
# Set to True when running inside a Kaggle notebook, where the competition
# files are mounted under ../input. Leave it False to read them from ./data.
# (The previous version had two unconditional `if True:` branches, so the
# local paths always silently replaced the Kaggle ones.)
RUNNING_ON_KAGGLE = False

_DATA_DIR = os.path.join("..", "input") if RUNNING_ON_KAGGLE else "data"

to_predict_location = os.path.join(_DATA_DIR, "testing_set.txt")
training_location = os.path.join(_DATA_DIR, "training_set.txt")
node_location = os.path.join(_DATA_DIR, "node_information.csv")

## Nodes importation

Importation of raw node data.

Nodes are stored in a `list` named `nodes`.

Each node is a `dict` instance, with entries as follows :
- `id`: node id (`int`)
- `year`: article year of publication (`int`)
- `title`: article title (`str`)
- `authors_raw`: article authors, unprocessed (`str`)
- `journal_raw`: article journal, unprocessed (`str`)
- `abstract`: article abstract, unprocessed (`str`)

We also create `nodes_dict`, a dictionary giving access to a node by its `id`, i.e. given a node `n`, we have `nodes_dict[n['id']] == n`.

In [None]:
# Raw node records, one dict per row of the node information CSV.
nodes = []

# newline='' lets the csv module do its own newline handling, so quoted
# fields containing line breaks are parsed correctly.
with open(node_location, newline='') as f:
    data_reader = csv.reader(f)
    for line in data_reader:
        # Columns used: id, year, title, authors, journal, abstract.
        node = {
            'id': int(line[0]),
            'year': int(line[1]),
            'title': line[2],
            'authors_raw': line[3],
            'journal_raw': line[4],
            'abstract': line[5],
        }

        nodes.append(node)

# Direct access to a node from its id.
nodes_dict = {
    n['id']: n for n in nodes
}

## Training and to submit data importation

The data is imported in two lists, `training_input` and `to_predict_input`. Each list contains lists with two `int`s, the `id`s of two nodes.

Training output is also imported in `training_output`, a `list` of `int`s (actually, a list of `1`s and `0`s).

In [None]:
# Pairs of node ids (training and to-predict) and the training labels.
training_input = []
training_output = []

to_predict_input = []

with open(training_location) as f:
    for line in f:
        # Blank lines (typically the one produced by a trailing newline at
        # the end of the file) are skipped instead of crashing on int('').
        fields = line.split()
        if not fields:
            continue
        line = [int(el) for el in fields]
        training_input.append(line[:2])
        training_output.append(line[-1])

with open(to_predict_location) as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        line = [int(el) for el in fields]
        to_predict_input.append(line[:2])

# Feature engineering

Here is a quick reference to the content of each `node` after feature extraction:
- Raw features
    - `id`: node id (`int`)
    - `year`: article's year of publication (`int`)
    - `title`: article's title (`str`)
    - `authors_raw`: article's authors, unprocessed (`str`)
    - `journal_raw`: article's journal, unprocessed (`str`)
    - `abstract`: article's abstract, unprocessed (`str`)
- Journal & authors features
    - `authors`: `list` of authors (`str`) of the article
    - `journal`: article's journal, processed (`str`)
    - `authors_int`: `list` of `int`s representing the article authors (see `author_to_int`)
    - `journal_int`: `int` representing the article's journal (see `journal_to_int`)
    - `authors_one_hot`: one-hot vector (`np.array`) of article's authors
    - `journal_one_hot`: one-hot vector (`np.array`) of article's journal
- NLP features
    - `abstract_tfidf`: tfidf vector of the article's abstract
    - `title_tfidf`: tfidf vector of the article's title
    - `abstract_svd`: svd of the article's abstract
    - `title_svd`: svd of the article's title
    - `abstract_int`: index (`list` of `int`s) representation of the article's abstract
    - `title_int`: index (`list` of `int`s) representation of the article's title
    - `authors_svd`: svd of the titles of the articles written by the article's authors
    - `abstract_wv`: array (`np.array`) representing word2vec vectors of the article's abstract
    - `title_wv`: array (`np.array`) representing word2vec vectors of the article's title
- Graph features
    - `co_citation`: `dict` containing the number of co-citations with a given node
    - see `graph_features` for the rest of them



Other useful objects, used for conversion from `str` to index (`int`) representation.

- `author_to_int`: a `dict` associating an `int` to each author represented by a `str`
- `int_to_author`: a `list` such that `author_to_int[int_to_author[i]] == i`
- `journal_to_int`: a `dict` associating an `int` to each journal represented by a `str`
- `int_to_journal`: a `list` such that `journal_to_int[int_to_journal[i]] == i`

Other things :
- `author_titles`:  `dict` mapping each author `int` to a list of its article titles
- `words_list`: sorted list of gensim tokens
- `word_to_int`: reverse index of `words_list`
- `embeddings`: matrix using w2v or Glove vectors for Keras embedding

## Author & journal formatting

### Journal & Author indexing

Here, we:

- Extract the list of authors for each node, clean the authors, and add the list as `authors` (`list` of `str`)
- Convert authors to `int`s. We can convert authors to `int`s and `int`s to authors with `author_to_int` (`dict`) and `int_to_author` (`list`)
- Add a `list` of `int`s representing its authors to each node, in an `authors_int` entry.
- Do some text formatting on journals and save the result as `journal` for each node
- Convert journals to `int`s. We can convert journals to `int`s and `int`s to journals with `journal_to_int` (`dict`) and `int_to_journal` (`list`)
- Store the node journal `int` into `journal_int`

In [None]:
# Author name <-> integer index lookup tables, filled while scanning nodes.
author_to_int, int_to_author = dict(), list()

# Journal name <-> integer index lookup tables, filled the same way.
journal_to_int, int_to_journal = dict(), list()


def clean_author(author: str) -> str:
    """Return a normalised author name.

    The cleaning steps are:

    - remove every balanced ``(...)`` / ``[...]`` group (non-greedy);
    - if a stray ``)`` remains, keep only what follows its first occurrence;
    - if a stray ``(`` remains, keep only what precedes its first occurrence;
    - remove all hyphens and spaces.

    An empty string is returned when nothing is left.
    """
    # Raw string: the backslashes are meant for the regex engine, not for
    # Python string escaping (which warns about "\(" in a normal literal).
    author = re.sub(r"[\(\[].*?[\)\]]", "", author)

    # Unbalanced closing parenthesis: drop everything up to and including it.
    _, closing, tail = author.partition(')')
    if closing:
        author = tail

    # Unbalanced opening parenthesis: drop it and everything after it.
    author = author.partition('(')[0]

    # Hyphens and spaces are all removed, which also covers any leading or
    # trailing spaces.
    return author.replace('-', '').replace(' ', '')


# Characters stripped from raw journal names before lower-casing.
_JOURNAL_NOISE = str.maketrans('', '', "'\"\\/)(-,")

for n in nodes:
    # Authors: split the raw field on its separators and clean each name.
    raw_names = re.split("&|,|;", n['authors_raw'])
    n['authors'] = [clean_author(raw_name) for raw_name in raw_names]

    # Empty names (left over after cleaning) get no index.
    named = [name for name in n['authors'] if name]
    for name in named:
        if name not in author_to_int:
            author_to_int[name] = len(author_to_int)
            int_to_author.append(name)
    n['authors_int'] = [author_to_int[name] for name in named]

    # Journal: remove punctuation noise, lower-case, then index.
    journal_name = n['journal_raw'].translate(_JOURNAL_NOISE).lower()
    n['journal'] = journal_name
    if journal_name not in journal_to_int:
        journal_to_int[journal_name] = len(journal_to_int)
        int_to_journal.append(journal_name)
    n['journal_int'] = journal_to_int[journal_name]


# Rebuild the id -> node lookup.
nodes_dict = {n['id']: n for n in nodes}

### Author title

We create `author_titles`, a `dict` mapping each author `int` to a list of its article titles. This can be used to infer topics by authors, later on.

In [None]:
# author index -> list of the titles of the articles they (co-)wrote.
author_titles = {}

for n in nodes:
    for author_int in n['authors_int']:
        author_titles.setdefault(author_int, []).append(n['title'])

### One-hot author vectors

One-hot vector of article authors, stored as `authors_one_hot`.

In [None]:
# One vector per node, with a 1 at the index of each of its authors.
for node in nodes:
    auth_vector = np.zeros(len(author_to_int))
    # Explicit integer dtype so that an empty author list is a valid index.
    auth_vector[np.asarray(node['authors_int'], dtype=int)] = 1
    node['authors_one_hot'] = auth_vector

### One-hot journal vectors

One-hot vector of article journal, stored as `journal_one_hot`.

In [None]:
# One vector per node, with a single 1 at the index of its journal.
for node in nodes:
    journal_index = journal_to_int[node['journal']]
    journal_vector = np.zeros(len(journal_to_int))
    journal_vector[journal_index] = 1
    node['journal_one_hot'] = journal_vector

### Authors PCA

PCA on one-hot vector of authors, with dimension `AUTHOR_PCA_DIM`, stored in `authors_pca`.

The low value of the explained variance indicates that this PCA does not represent our data very adequately.

In [None]:
AUTHOR_PCA_DIM = 24

# Project the author one-hot vectors onto AUTHOR_PCA_DIM components.
author_pca = PCA(AUTHOR_PCA_DIM)
authors_reduced = author_pca.fit_transform(
    [n['authors_one_hot'] for n in nodes]
)

for n, pca in zip(nodes, authors_reduced):
    n['authors_pca'] = pca

print('Authors dimension:', len(n['authors_one_hot']))
explained = sum(author_pca.explained_variance_ratio_)
print("Explained variance: %.3f" % (explained))

### Journal PCA

PCA on one-hot vector of journals, with dimension `JOURNAL_PCA_DIM`, stored in `journal_pca`.

In [None]:
JOURNAL_PCA_DIM = 24

# Project the journal one-hot vectors onto JOURNAL_PCA_DIM components.
journal_pca = PCA(JOURNAL_PCA_DIM)
for node, pca in zip(nodes, journal_pca.fit_transform([n['journal_one_hot'] for n in nodes])):
    node['journal_pca'] = pca

# Read the dimension from the data itself: the previous version used `n`,
# a loop variable leaked by another cell, so the print depended on which
# cells had been run before this one.
print('Journal dimension:', len(nodes[0]['journal_one_hot']))
print("Explained variance: %.3f" %
      (sum(journal_pca.explained_variance_ratio_)))

## NLP

### Tokenization

We apply gensim's preprocessing to nodes titles and abstracts. The results are stored in `title_gensim` and `abstract_gensim`. We also build a list of words, `words_list` on the fly.

In [None]:
# Collected as a set first, so that duplicates are dropped cheaply.
words_list = set()

for n in nodes:
    n['title_gensim'] = list(
        gensim.utils.simple_preprocess(n['title'], max_len=30))
    n['abstract_gensim'] = list(
        gensim.utils.simple_preprocess(n['abstract'], max_len=30))
    words_list.update(n['title_gensim'], n['abstract_gensim'])

# Sorted so that word indexes are reproducible from one run to the next.
words_list = sorted(words_list)

### Naive approach

First approach. We define a `naive_similarity` function, which returns, given two nodes:
- The percentage of words of a's abstract in b's abstract
- The percentage of words of a's abstract in b's title
- The percentage of words of a's title in b's abstract
- The percentage of words of a's title in b's title
- The percentage of words of b's abstract in a's abstract
- The percentage of words of b's abstract in a's title
- The percentage of words of b's title in a's abstract
- The percentage of words of b's title in a's title

In [None]:
def naive_similarity(node_a, node_b):
    """Return eight word-overlap ratios between two nodes.

    Overlaps are counted on distinct words; denominators are the token
    counts (duplicates included) of the corresponding text. The returned
    tuple is, in order:

    - title/title overlap over a's title length, then over b's title length
    - a's title / b's abstract overlap over a's title length, then over
      b's abstract length
    - a's abstract / b's title overlap over a's abstract length, then over
      b's title length
    - abstract/abstract overlap over a's abstract length, then over b's
      abstract length

    A ratio is 0 whenever the corresponding overlap is empty.
    """
    title_a, abstract_a = node_a['title_gensim'], node_a['abstract_gensim']
    title_b, abstract_b = node_b['title_gensim'], node_b['abstract_gensim']

    title_a_words = set(title_a)
    abstract_a_words = set(abstract_a)
    abstract_b_words = set(abstract_b)

    def ratio(overlap, length):
        # A non-zero overlap implies a non-empty text, so no division by 0.
        return overlap / length if overlap else 0

    title_title = len(title_a_words.intersection(title_b))
    title_abstract = len(abstract_b_words.intersection(title_a_words))
    abstract_title = len(abstract_a_words.intersection(title_b))
    abstract_abstract = len(abstract_a_words.intersection(abstract_b_words))

    return (
        ratio(title_title, len(title_a)),
        ratio(title_title, len(title_b)),
        ratio(title_abstract, len(title_a)),
        ratio(title_abstract, len(abstract_b)),
        ratio(abstract_title, len(abstract_a)),
        ratio(abstract_title, len(title_b)),
        ratio(abstract_abstract, len(abstract_a)),
        ratio(abstract_abstract, len(abstract_b)),
    )

### TFIDF and LSA

We use `scikit-learn`'s `TfidfVectorizer` to compute tfidf vectors of nodes abstracts and titles. We use an `ngram_range` of `(1, 3)`, i.e. unigrams, bigrams and trigrams.

#### TFIDF

To each node, we add respectively:
- `abstract_tfidf`: tfidf vector of the journal's abstract
- `title_tfidf`: tfidf vector of the journal's title

In [None]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))

# A single vocabulary is fitted on abstracts and titles together: the first
# len(nodes) rows are the abstracts, the following len(nodes) the titles.
tfidf_vectors = tfidf_vectorizer.fit_transform(
    [n['abstract'] for n in nodes] + [n['title'] for n in nodes]
)

abstract_rows = tfidf_vectors[:len(nodes)]
title_rows = tfidf_vectors[len(nodes):]
for n, a_tfidf, t_tfidf in zip(nodes, abstract_rows, title_rows):
    n['abstract_tfidf'], n['title_tfidf'] = a_tfidf, t_tfidf

#### SVD

Now that we have tfidf vectors, we apply a truncated SVD to get LSA vectors of dimension `SVD_DIM`. The resulting vectors are added to the nodes as:

- `abstract_svd`
- `title_svd`

In [None]:
SVD_DIM = 128

# LSA: truncated SVD of the stacked (abstracts, then titles) tfidf matrix.
svd_model = TruncatedSVD(n_components=SVD_DIM)
svd_matrix = svd_model.fit_transform(tfidf_vectors)

# Same row layout as the tfidf matrix: abstracts first, titles second.
for key, rows in (('abstract_svd', svd_matrix[:len(nodes)]),
                  ('title_svd', svd_matrix[len(nodes):])):
    for n, vec in zip(nodes, rows):
        n[key] = vec

In [None]:
# Drop the module-level references to the full tfidf and svd matrices.
del tfidf_vectors
del svd_matrix

#### Authors titles' SVD

For each node, we group the titles of its authors' articles and apply our LSA. The result is added as `authors_svd`.

For each author, we group the titles of their articles.

The same process is applied for each author, where titles and abstracts are aggregated per author.

In [None]:
if False:  # Disabled cell, kept for reference.
    # One document per node: every title written by any of its authors.
    authors_documents = [
        '\n'.join(
            title
            for aut in n['authors_int']
            for title in author_titles[aut]
        )
        for n in nodes
    ]
    authors_vectors = svd_model.transform(
        tfidf_vectorizer.transform(authors_documents)
    )
    for node, vec in zip(nodes, authors_vectors):
        node['authors_svd'] = vec

#### Common words SVD

Given two nodes, `common_words_svd` computes the LSA representation of their common words.

In [None]:
def common_words_svd(node_a, node_b):
    """Return the LSA vector of the words shared by two nodes.

    The words of each node are the distinct tokens of its abstract and
    title. The shared words are joined into a single document, vectorised
    with `tfidf_vectorizer` and projected with `svd_model`.
    """
    words_a = set(node_a['abstract_gensim']).union(node_a['title_gensim'])
    words_b = set(node_b['abstract_gensim']).union(node_b['title_gensim'])
    shared_document = ' '.join(words_a.intersection(words_b))
    shared_tfidf = tfidf_vectorizer.transform([shared_document])
    return svd_model.transform(shared_tfidf)[0]

### Word2vec

#### Training / Loading

Training or loading of the w2v model.
If `TRAIN_W2V` is set to `True`, the model will be trained on the titles / abstract corpus, and produce vectors of dimension `W2V_VEC_DIM`. Furthermore, they will be saved in `trained_w2v.model`.

Otherwise, a model will be loaded. If `LOAD_GOOGLE` is `True`, Google's embeddings are going to be used. If not, the last trained model will be used.

In [None]:
TRAIN_W2V = True
LOAD_GOOGLE = False
W2V_FILE = "trained_w2v.model"
# Location of Google's pre-trained w2v vectors. Only read when TRAIN_W2V is
# False and LOAD_GOOGLE is True; change it to where your copy lives.
GOOGLE_W2V_FILE = '/Users/ombeline/Desktop/GoogleNews-vectors-negative300.bin'
W2V_VEC_DIM = 128

if TRAIN_W2V:
    # One training sentence per title and one per abstract.
    w2v_training_set = [t
                        for n in nodes
                        for t in [n['title_gensim'], n['abstract_gensim']]
                        ]
    # NOTE(review): passing the corpus to the constructor presumably already
    # trains the model once, before the explicit train() call below; confirm
    # against the gensim version in use. `size=` is the keyword of older
    # gensim releases.
    w2v_model = gensim.models.Word2Vec(
        w2v_training_set,
        size=W2V_VEC_DIM,
        window=7,
        min_count=0,
        workers=120
    )
    w2v_model.train(
        w2v_training_set,
        total_examples=len(w2v_training_set),
        epochs=120
    )
    # Saved so that a later run can reload the vectors with TRAIN_W2V = False.
    w2v_model.wv.save_word2vec_format(W2V_FILE)
    del w2v_training_set
else:
    if LOAD_GOOGLE:
        # The Google vectors have their own dimension.
        W2V_VEC_DIM = 300
        w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
            GOOGLE_W2V_FILE,
            binary=True
        )
    else:  # Last trained model
        w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
            W2V_FILE)

#### Node w2v vectors

Now that we have a word2vec model to work with, we can use it to compute word2vec representation of our textual features, namely titles and abstracts.

In order to do this, we need first to compute the maximum length of our vectors, so that they can have a fixed size.

Max sizes are stored in `LEN_TITLE` and `LEN_ABSTRACT`; node word2vec vectors are stored into `title_wv` and `abstract_wv` arrays for each node.

Storing w2v vectors for each node is expensive, memory wise. If this feature is not used, its storage can be turned off by setting `USE_W2V_NODES` to `False`.

In [None]:
USE_W2V_NODES = False

# Longest title / abstract (in tokens); used as the fixed sequence lengths.
LEN_TITLE = max(len(n['title_gensim']) for n in nodes)
LEN_ABSTRACT = max(len(n['abstract_gensim']) for n in nodes)

if USE_W2V_NODES:
    for n in nodes: # Switch to keras' pad_sequence ?
        # Rows start as zeros, so trailing padding and words unknown to the
        # model are both zero vectors. (The previous version referenced an
        # undefined `VEC_DIM` for unknown words, raising a NameError.)
        n['title_wv'] = np.zeros((LEN_TITLE, W2V_VEC_DIM))
        n['abstract_wv'] = np.zeros((LEN_ABSTRACT, W2V_VEC_DIM))

        for i, w in enumerate(n['title_gensim']):
            if w in w2v_model:
                n['title_wv'][i] = w2v_model[w]
        for i, w in enumerate(n['abstract_gensim']):
            if w in w2v_model:
                n['abstract_wv'][i] = w2v_model[w]

#### Word embeddings

We also define embeddings, to be used by Keras.

We use `words_list` to build an `embeddings` matrix, and inject our word2vec representation as vectors. We keep `0` as a special value for empty entries. We also build a `word_to_int` dictionary.

In [None]:
# Index 0 is reserved for padding, so word indexes start at 1.
word_to_int = {w: i+1 for i, w in enumerate(words_list)}
embeddings = np.zeros((len(word_to_int) + 1, W2V_VEC_DIM))

for w, i in word_to_int.items():
    if w in w2v_model:
        embeddings[i] = w2v_model[w]
    else:
        # Unknown word: draw a random vector with one independent standard
        # normal value per dimension. The previous call passed the dimension
        # as the *mean* and drew a single scalar, filling the whole row with
        # one value close to W2V_VEC_DIM.
        embeddings[i] = np.random.normal(size=W2V_VEC_DIM)

#### Lists of word indexes

To use this embedding, we need corresponding representations of our sentences, as `list`s of index `int`s. They are stored as `title_int` and `abstract_int`.


In [None]:
# Index representation of titles and abstracts, padded to a fixed length.
for n in nodes:
    title_indexes = [word_to_int[w] for w in n['title_gensim']]
    abstract_indexes = [word_to_int[w] for w in n['abstract_gensim']]

    # pad_sequences works on batches: wrap the sequence, then unwrap it.
    padded_title = pad_sequences([title_indexes], maxlen=LEN_TITLE)
    padded_abstract = pad_sequences([abstract_indexes], maxlen=LEN_ABSTRACT)
    n['title_int'] = padded_title[0]
    n['abstract_int'] = padded_abstract[0]

#### Common words w2v

A first application of w2v can be to retain the common words of two phrases, and compute their w2v representation.

That is the goal of the function `w2v_common_words`. The function:
- selects common words
- if there are fewer than 5, we add the word 'the' as many times as necessary, effectively acting as a 0 padding
- if there are more than 5, we only keep the 5 rarest words
- return value is the concatenation of the w2v vectors of the remaining words

For efficiency, we precompute `tfidf_voc` and `tfidf_dic`.

In [None]:
# Terms known to the tfidf vectorizer, for fast membership tests.
tfidf_voc = set(tfidf_vectorizer.vocabulary_)
# idf weight of each vocabulary index.
lookup = list(tfidf_vectorizer.idf_)
# term -> idf weight.
tfidf_dic = {
    term: lookup[index]
    for term, index in tfidf_vectorizer.vocabulary_.items()
}

def w2v_common_words(node_a, node_b, n_words=5):
    """Return the concatenated w2v vectors of words shared by two nodes.

    Shared words are the tokens present in both nodes (title or abstract)
    that also belong to the tfidf vocabulary. When there are fewer than
    `n_words` of them, the list is completed with the word 'the'; otherwise
    the `n_words` words with the largest idf weight are kept. The result is
    a flat list holding the vector of each selected word, one after another.
    """
    shared = set(
        node_a['abstract_gensim'] + node_a['title_gensim']
    ).intersection(
        node_b['title_gensim'] + node_b['abstract_gensim']
    ).intersection(tfidf_voc)

    n_missing = n_words - len(shared)
    if n_missing > 0:
        selected = list(shared) + ['the'] * n_missing
    else:
        selected = heapq.nlargest(n_words, shared, key=tfidf_dic.__getitem__)

    flattened = []
    for word in selected:
        flattened.extend(w2v_model[word])
    return flattened

### Glove embedding

Alternative word embedding.

In [None]:
USE_GLOVE_EMBEDDING = False

if USE_GLOVE_EMBEDDING:
    location = "glove.6B.300d.txt" # to be changed to your location
    glove_embedding = {}
    # Explicit encoding: the vectors file is read as UTF-8 instead of the
    # platform default, which fails on non-ASCII tokens on some systems.
    with open(location, encoding="utf-8") as f:
        for line in f:
            # Each line is: word, then the vector coefficients.
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            glove_embedding[word] = coefs
    W2V_VEC_DIM = 300
    # Replaces the w2v-based matrix; row 0 stays the padding entry and words
    # missing from GloVe keep an all-zero row.
    embeddings = np.zeros((len(word_to_int) + 1, W2V_VEC_DIM))
    for w, i in word_to_int.items():
        if w in glove_embedding:
            embeddings[i] = glove_embedding[w]

## Graph features

### Author cooperation graph

We build a graph of author cooperation, and build a `class` to compute distances based on this graph. Two authors that have worked together have a distance of `0`; two authors that have a common co-author have a distance of `1`, and so on.

We then instantiate this `AuthorCooperationDistance` class in `author_distance`, with our graph. This instance is callable as a function, and does what you expect it to do.

In [None]:
# author index -> set of the authors they have written with. Every author
# of a node is added to the set of each of that node's authors (themselves
# included).
author_neighbours = {a: set() for a in author_to_int.values()}
for node in nodes:
    coauthors = node['authors_int']
    for aut in coauthors:
        author_neighbours[aut] |= set(coauthors)


class AuthorCooperationDistance():
    """
    This class is used as a lazy distance matrix.

    `neighbours` maps each author to the collection of its direct
    co-authors. Calling the instance with two authors returns the number of
    intermediate co-authors on a shortest path between them (0 for direct
    co-authors), or `None` when no path exists.

    Distances from an author are computed by a breadth-first search the
    first time they are needed, then cached in `self.d`.
    """

    def __init__(self, neighbours: dict) -> None:
        self.n = neighbours
        self.d = {}  # Computed distances: d[source][target]
        # Sources whose search has been run to completion. Only rows of
        # these sources are trusted: a missing target then really means
        # "unreachable". (The previous version also created partial rows
        # and later mistook them for complete ones, returning None for
        # authors that were in fact connected.)
        self._complete = set()

    def __call__(self, a: int, b: int):
        """Return the cooperation distance between `a` and `b`, or None."""
        if a not in self._complete:
            if b in self._complete:
                # The cooperation graph is undirected, so the distance
                # already computed from b can be reused as is.
                return self.d[b].get(a)
            self.compute_distance(a)
        return self.d[a].get(b)

    def compute_distance(self, a: int) -> None:
        """Computes distances of authors linked to a."""
        distances = self.d.setdefault(a, {})

        frontier = self.n[a]
        explored = set()
        depth = 0

        # Level-by-level exploration: every author first reached at a given
        # level gets that level as distance.
        while frontier:
            next_frontier = set()
            for author in frontier:
                if author in explored:
                    continue
                distances[author] = depth
                next_frontier.update(self.n[author])
            explored.update(frontier)
            frontier = next_frontier
            depth += 1

        self._complete.add(a)


# Callable lazy distance over the author cooperation graph built above.
author_distance = AuthorCooperationDistance(author_neighbours)

### Distance of author groups

We are looking for a feature measuring the distance between two groups of authors. We start by computing the cooperation distances pairwise, group to group. We then extract five features:

- First group's size
- Second group's size
- Mean of distances with a value
- Min distance
- The proportion of unconnected pairs

In [None]:
def author_set_comparison(aut_a, aut_b, distance=None, unconnected_value=5):
    """Compare two groups of authors through their pairwise distances.

    Parameters:
        aut_a, aut_b: the two groups of authors.
        distance: callable returning the distance between two authors, or
            None when they are not connected. Defaults to the module-level
            `author_distance`.
        unconnected_value: value reported as minimum distance when no pair
            is connected.

    Returns a `np.array` with, in order:
        - the size of the first group
        - the size of the second group
        - the mean distance over connected pairs (0 if there are none)
        - the proportion of unconnected pairs
        - the minimum distance over connected pairs, or `unconnected_value`
    """
    if distance is None:
        distance = author_distance

    n_pairs = len(aut_a) * len(aut_b)
    n_none = 0
    total = 0
    min_d = None
    for a in aut_a:
        for b in aut_b:
            d = distance(a, b)
            if d is None:
                n_none += 1
                continue
            total += d
            if min_d is None or d < min_d:
                min_d = d

    return np.array([
        len(aut_a),
        len(aut_b),
        total / max(n_pairs - n_none, 1),
        n_none / max(n_pairs, 1),
        # Explicit None test: a minimum distance of 0 (direct co-authors) is
        # a real value and must not be replaced by the "unconnected" one.
        unconnected_value if min_d is None else min_d,
    ])

### Nodes arity

We compute, for each journal and author, the number of incoming and outgoing citations, and build a citation graph.

Furthermore, for each article, we compute the number of incoming citations.

In [None]:
# Citation statistics, built from the positive (out == 1) training pairs only.
# j_citations[j_a][j_b] / a_citations[a][b]: counters of citations going from
# journal j_a to journal j_b / from author a to author b.
j_citations = {}
a_citations = {}

# NOTE(review): j_citations is initialised a second time here (redundant).
j_citations = {}
# Outgoing citation counters per journal / author.
j_out = {}
a_out = {}

# Incoming citation counters per journal / author / node id.
j_in = {}
a_in = {}
n_in = {}

# Extremes of len(aut_a) * len(aut_b) over the positive pairs.
max_a_citations = 0
min_a_citations = 64

# node id -> set of the node ids it cites.
cites = {n['id']: set() for n in nodes} # ->

# node id -> position of the node in `nodes`.
id_to_int = {n['id']:i for i, n in enumerate(nodes)}

# NOTE(review): every counter below starts at 0 on the first occurrence
# (`.get(key, -1) + 1`, or `= 0` followed by `+= 1` afterwards), so stored
# values are (number of occurrences - 1). Confirm whether this is deliberate
# (e.g. to discount the pair being scored) or an off-by-one.
for inp, out in zip(training_input, training_output):
    if out == 1:
        node_a = nodes_dict[inp[0]]
        node_b = nodes_dict[inp[1]]
        n_in[inp[1]] = n_in.get(inp[1], -1) + 1

        cites[inp[0]].add(inp[1])        
        
        aut_a = node_a['authors_int']
        aut_b = node_b['authors_int']
        j_a = journal_to_int[node_a['journal']]
        j_b = journal_to_int[node_b['journal']]

        max_a_citations = max(max_a_citations, len(aut_a) * len(aut_b))
        min_a_citations = min(min_a_citations, len(aut_a) * len(aut_b))

        if j_a not in j_citations:
            j_citations[j_a] = {}

        if j_b not in j_citations[j_a]:
            j_citations[j_a][j_b] = 0
        else:
            j_citations[j_a][j_b] += 1

        j_out[j_a] = j_out.get(j_a, -1) + 1
        j_in[j_b] = j_in.get(j_b, -1) + 1
        # Every (citing author, cited author) combination is counted.
        for a in aut_a:
            for b in aut_b:
                if a not in a_citations:
                    a_citations[a] = {}
                if b not in a_citations[a]:
                    a_citations[a][b] = 0
                else:
                    a_citations[a][b] += 1
                a_out[a] = a_out.get(a, -1) + 1
                a_in[b] = a_in.get(b, -1) + 1

# Dense journal x journal matrix: counter i -> j divided by the outgoing
# counter of i (0 when that counter is 0).
j_distance = np.zeros((len(int_to_journal), len(int_to_journal)))
for i in range(len(int_to_journal)):
    if i not in j_citations:
        continue
    for j, val in j_citations[i].items():
        j_distance[i][j] = val/j_out[i] if j_out[i] else 0

# Same construction for authors (dense author x author matrix).
a_distance = np.zeros((len(author_to_int), len(author_to_int)))
for i in range(len(author_to_int)):
    if i not in a_citations:
        continue
    for j, val in a_citations[i].items():
        a_distance[i][j] = val/a_out[i] if a_out[i] else 0

# Incoming citation counter of each node (0 when it was never counted).
for n in nodes:
    n['popularity'] = n_in.get(n['id'], 0)

### Adjacency matrices

We build direct, reversed and undirected adjacency matrices for articles, authors and journals.

Then, we compute their squares, cubes and fourth powers.

In [None]:
# Citation edges expressed at author, journal and node level.
a_cites = [
    (author_from, author_to)
    for citing, cited_set in cites.items()
    for cited in cited_set
    for author_from in nodes_dict[citing]['authors_int']
    for author_to in nodes_dict[cited]['authors_int']
]
j_cites = [
    (nodes_dict[citing]['journal_int'], nodes_dict[cited]['journal_int'])
    for citing, cited_set in cites.items()
    for cited in cited_set
]
n_cites = [
    (id_to_int[citing], id_to_int[cited])
    for citing, cited_set in cites.items()
    for cited in cited_set
]


def _build_adjacence(edges, size, orientation):
    """Return the `size` x `size` CSR matrix of `edges`.

    `orientation` is 'd' (direct: source -> target), 'r' (reversed:
    target -> source) or 'u' (undirected: both directions). One unit is
    contributed per listed edge.
    """
    sources = [source for source, _ in edges]
    targets = [target for _, target in edges]
    if orientation == 'd':
        rows, cols = sources, targets
    elif orientation == 'r':
        rows, cols = targets, sources
    else:
        rows, cols = sources + targets, targets + sources
    return scipy.sparse.coo_matrix(
        ([1] * len(rows), (rows, cols)),
        shape=(size, size)
    ).tocsr()


node_d_adjacence = _build_adjacence(n_cites, len(nodes), 'd')
node_r_adjacence = _build_adjacence(n_cites, len(nodes), 'r')
node_u_adjacence = _build_adjacence(n_cites, len(nodes), 'u')

author_d_adjacence = _build_adjacence(a_cites, len(int_to_author), 'd')
author_r_adjacence = _build_adjacence(a_cites, len(int_to_author), 'r')
author_u_adjacence = _build_adjacence(a_cites, len(int_to_author), 'u')

journal_d_adjacence = _build_adjacence(j_cites, len(int_to_journal), 'd')
journal_r_adjacence = _build_adjacence(j_cites, len(int_to_journal), 'r')
journal_u_adjacence = _build_adjacence(j_cites, len(int_to_journal), 'u')

In [None]:
def _adjacence_powers(label, matrix, max_order):
    """Return [matrix^2, ..., matrix^max_order], logging each order.

    Each power is the previous one multiplied by `matrix` itself, so the
    matrix named "order k" really is the k-th power of the *same* adjacency
    matrix. The previous version squared the square for the node "order 3"
    (yielding orders 4 and 6), mixed an undirected power with a direct
    matrix, and built the reversed author/journal powers from the
    undirected matrices.
    """
    powers = []
    current = matrix
    for order in range(2, max_order + 1):
        print('%s adjacence matrix of order %d' % (label, order))
        current = current.dot(matrix)
        powers.append(current)
    return powers


(node_d_adjacence_2,
 node_d_adjacence_3,
 node_d_adjacence_4) = _adjacence_powers('Nodes direct', node_d_adjacence, 4)
(node_r_adjacence_2,
 node_r_adjacence_3,
 node_r_adjacence_4) = _adjacence_powers('Nodes reversed', node_r_adjacence, 4)
(node_u_adjacence_2,
 node_u_adjacence_3,
 node_u_adjacence_4) = _adjacence_powers('Nodes undirected', node_u_adjacence, 4)

(author_d_adjacence_2,
 author_d_adjacence_3,
 author_d_adjacence_4) = _adjacence_powers('Author direct', author_d_adjacence, 4)
(author_r_adjacence_2,
 author_r_adjacence_3,
 author_r_adjacence_4) = _adjacence_powers('Author reversed', author_r_adjacence, 4)
(author_u_adjacence_2,
 author_u_adjacence_3,
 author_u_adjacence_4) = _adjacence_powers('Author undirected', author_u_adjacence, 4)

(journal_d_adjacence_2,) = _adjacence_powers('Journal direct', journal_d_adjacence, 2)
(journal_r_adjacence_2,) = _adjacence_powers('Journal reversed', journal_r_adjacence, 2)
(journal_u_adjacence_2,) = _adjacence_powers('Journal undirected', journal_u_adjacence, 2)

### Co-citation

We say two nodes are co-cited if they are cited by the same node.

In [None]:
# node -> {other node id: number of nodes citing both of them}.
for n in nodes:
    n['co_citation'] = {}

# Every pair (a, b) of nodes cited by a same node is co-cited once more;
# the pair (a, a) is counted as well.
for cited in cites.values():
    for a in cited:
        counts = nodes_dict[a]['co_citation']
        for b in cited:
            counts[b] = counts.get(b, 0) + 1

### Node vectorisation

We try to apply lsa directly on nodes.

In [None]:
NODE_VEC_DIM = 128
print()

# Each node becomes a "document" whose words are the ids of the nodes it
# cites; tfidf followed by a truncated SVD then gives an LSA of citations.
node_vectorizer = TfidfVectorizer()
citation_documents = [' '.join(map(str, cited)) for cited in cites.values()]
node_tfidf = node_vectorizer.fit_transform(citation_documents)

node_svd = TruncatedSVD(n_components=NODE_VEC_DIM)
node_svd.fit(node_tfidf)

def vectorize_nodes(node_a, node_b):
    """Return the concatenated citation-LSA vectors of two nodes.

    Each node is described by the ids of the nodes it cites, from which the
    ids of the two nodes themselves are removed. Both descriptions are
    vectorised with `node_vectorizer`, projected with `node_svd` and
    flattened into one vector of size 2 * NODE_VEC_DIM (node_a first).
    """
    pair_ids = [node_a['id'], node_b['id']]

    documents = []
    for node_id in pair_ids:
        kept = [str(cited) for cited in cites[node_id]
                if cited not in pair_ids]
        documents.append(' '.join(kept))

    projected = node_svd.transform(node_vectorizer.transform(documents))
    return projected.reshape(2 * NODE_VEC_DIM)

### Graph features function

The information we extracted above is not directly linked to a node; we therefore define a function returning relevant features. Given two nodes `node_a` and `node_b`, the returned features are:

- Boolean indicating if `node_a` has a journal
- Boolean indicating if `node_b` has a journal
- Distance going from `journal_a` to `journal_b`
- Distance going from `journal_b` to `journal_a`
- Minimum 4 citation distance from `node_a`'s authors to `node_b`'s
- Minimum 4 citation distance from `node_b`'s authors to `node_a`'s
- Number of citation from `node_a`'s journal
- Number of citation to `node_a`'s journal
- Number of citation to `node_a`
- Number of citation from `node_b`'s journal
- Number of citation to `node_b`'s journal
- Number of citation to `node_b`
- Adjacency matrix entries for our two nodes

In [None]:
def _author_pair_sum(matrix, row_authors, col_authors):
    """Sum ``matrix[r, c]`` over every (r, c) in row_authors x col_authors."""
    return sum([matrix[r, c] for r in row_authors for c in col_authors])


def _k_smallest_padded(values, k=4, fill=0):
    """Return the ``k`` smallest ``values``, right-padded with ``fill``."""
    smallest = heapq.nsmallest(k, values)
    return smallest + [fill] * (k - len(smallest))


def graph_features(node_a, node_b):
    """Return the graph-derived features of the ordered pair (node_a, node_b).

    The returned list always has the same length and layout:

    1.  journal-present flags for node_a and node_b
    2.  co-citation count of node_b stored on node_a
    3.  journal distance a -> b and b -> a (`j_distance`)
    4.  `j_out`, `j_in`, `n_in` counts for node_a, then for node_b
    5.  node adjacency entries [a, b] (directed, reversed, undirected;
        orders 2, 3, 4)
    6.  author adjacency sums over all author pairs a -> b
        (d/r/u for order 2, then 3, then 4)
    7.  journal order-2 adjacency entries [a, b] (d, r, u)
    8.  node adjacency entries [b, a] (directed, reversed; orders 2, 3, 4)
    9.  author adjacency sums b -> a (d/r for order 2, then 3, then 4)
    10. journal order-2 adjacency entries [b, a] (d, r)
    11. the 4 smallest author distances a -> b, then b -> a (`a_distance`)

    Fixes over the previous version:

    * `n_in` was looked up with the journal name; it is now looked up with
      the node id.
      NOTE(review): assumes `n_in` is keyed by node id -- confirm.
    * The order-2 reverse author sums swapped both the loops and the indices,
      so they recomputed the forward sums; they now match orders 3 and 4.
    * The zero padding was added *before* `nsmallest`, so for non-negative
      distances the four values were always 0. Padding is now applied after
      selecting the smallest values, and only when fewer than 4 author pairs
      exist.
    """
    idx_a = id_to_int[node_a['id']]
    idx_b = id_to_int[node_b['id']]
    authors_a = node_a['authors_int']
    authors_b = node_b['authors_int']
    journal_a = node_a['journal']
    journal_b = node_b['journal']
    jint_a = node_a['journal_int']
    jint_b = node_b['journal_int']
    jdist_a = journal_to_int[journal_a]
    jdist_b = journal_to_int[journal_b]

    features = [
        1 if journal_a != '' else 0,
        1 if journal_b != '' else 0,
        node_a['co_citation'].get(node_b['id'], 0),
        j_distance[jdist_a][jdist_b],
        j_distance[jdist_b][jdist_a],
        j_out.get(journal_a, 0),
        j_in.get(journal_a, 0),
        n_in.get(node_a['id'], 0),
        j_out.get(journal_b, 0),
        j_in.get(journal_b, 0),
        n_in.get(node_b['id'], 0),
    ]

    # Node-level walks a -> b.
    features += [
        matrix[idx_a, idx_b]
        for matrix in (
            node_d_adjacence_2, node_d_adjacence_3, node_d_adjacence_4,
            node_r_adjacence_2, node_r_adjacence_3, node_r_adjacence_4,
            node_u_adjacence_2, node_u_adjacence_3, node_u_adjacence_4,
        )
    ]

    # Author-level walks a -> b, summed over every author pair.
    features += [
        _author_pair_sum(matrix, authors_a, authors_b)
        for matrix in (
            author_d_adjacence_2, author_r_adjacence_2, author_u_adjacence_2,
            author_d_adjacence_3, author_r_adjacence_3, author_u_adjacence_3,
            author_d_adjacence_4, author_r_adjacence_4, author_u_adjacence_4,
        )
    ]

    features += [
        journal_d_adjacence_2[jint_a, jint_b],
        journal_r_adjacence_2[jint_a, jint_b],
        journal_u_adjacence_2[jint_a, jint_b],
    ]

    # Same quantities in the opposite direction (b -> a). The undirected
    # matrices are not repeated here, as in the original feature layout.
    features += [
        matrix[idx_b, idx_a]
        for matrix in (
            node_d_adjacence_2, node_d_adjacence_3, node_d_adjacence_4,
            node_r_adjacence_2, node_r_adjacence_3, node_r_adjacence_4,
        )
    ]
    features += [
        _author_pair_sum(matrix, authors_b, authors_a)
        for matrix in (
            author_d_adjacence_2, author_r_adjacence_2,
            author_d_adjacence_3, author_r_adjacence_3,
            author_d_adjacence_4, author_r_adjacence_4,
        )
    ]
    features += [
        journal_d_adjacence_2[jint_b, jint_a],
        journal_r_adjacence_2[jint_b, jint_a],
    ]

    features += _k_smallest_padded(
        a_distance[i][j] for i in authors_a for j in authors_b
    )
    features += _k_smallest_padded(
        a_distance[j][i] for i in authors_a for j in authors_b
    )
    return features

# Models

## Utilities

Three functions:

- `entry`: takes two nodes as an argument, returns layer-ready features
- `inputify`: creates a keras-ready matrix with the selected features
- `thresholdify`: given predictions and true values, computes an approximation of the optimal threshold for the f1_score

We also set our `TRAINING_TEST_RATIO`.

In [None]:
# Share of the labelled pairs used for training (given to train_test_split as
# train_size); the remainder is the held-out split.
TRAINING_TEST_RATIO = 0.9

def entry(node_a, node_b):
    """Build the flat feature vector for the pair (node_a, node_b).

    The vector is the concatenation, in this order, of: both years, the four
    title/abstract SVD dot products, the four title/abstract TF-IDF products,
    the graph features, both journal PCA vectors, the author-set comparison,
    the naive similarity, the raw title and abstract SVD vectors of both
    nodes, and the citation-list vectors from `vectorize_nodes`.
    """
    def tfidf_product(left, right):
        # Product of two sparse row vectors, read back as a scalar.
        return (left * right.T).A[0][0]

    title_a, title_b = node_a['title_svd'], node_b['title_svd']
    abstract_a, abstract_b = node_a['abstract_svd'], node_b['abstract_svd']

    pairwise_scalars = [
        node_a['year'],
        node_b['year'],
        np.dot(title_a, title_b),
        np.dot(abstract_a, title_b),
        np.dot(title_a, abstract_b),
        np.dot(abstract_a, abstract_b),
        tfidf_product(node_a['title_tfidf'], node_b['title_tfidf']),
        tfidf_product(node_a['abstract_tfidf'], node_b['abstract_tfidf']),
        tfidf_product(node_a['title_tfidf'], node_b['abstract_tfidf']),
        tfidf_product(node_a['abstract_tfidf'], node_b['title_tfidf']),
    ]

    parts = [
        pairwise_scalars,
        graph_features(node_a, node_b),
        node_a['journal_pca'],
        node_b['journal_pca'],
        author_set_comparison(node_a['authors_int'], node_b['authors_int']),
        naive_similarity(node_a, node_b),
        title_a,
        title_b,
        abstract_a,
        abstract_b,
        vectorize_nodes(node_a, node_b),
    ]
    return np.concatenate(parts)

def inputify(source: List[List]):
    """Return the feature matrix for a list of node-id pairs.

    Row ``i`` is ``entry(nodes_dict[source[i][0]], nodes_dict[source[i][1]])``.
    A dotted progress bar and the elapsed time are printed to stdout.

    Raises IndexError if `source` is empty (the feature width is probed on
    the first element).
    """
    start = time.time()
    # Probe the feature width once, using the first node paired with itself.
    probe_node = nodes_dict[source[0][0]]
    features = np.empty((len(source), len(list(entry(probe_node, probe_node)))))

    # One dot per ~1% of the rows. The step is clamped to 1 so that inputs
    # with fewer than 100 rows no longer raise ZeroDivisionError.
    step = max(1, len(source) // 100)

    print(':%s:\n:' % ('.' * 101), end='')
    for i, el in enumerate(source):
        if i % step == 0:
            print('.', end='')
        node_a = nodes_dict[el[0]]
        node_b = nodes_dict[el[1]]
        features[i] = entry(node_a, node_b)
    print(':\nFinished inputifying in %f secondes.' % (time.time() - start))
    return features

def thresholdify(predictions, y, a: float = 0.01, b: float = .99, partitions=8, depth=4) -> float:
    """Approximate the offset ``t`` maximising the F1 score of ``int(p + t)``.

    A prediction ``p`` is classified as ``int(p + t)``. The search evaluates
    ``partitions + 1`` evenly spaced offsets in ``[a, b]``, then narrows the
    interval to one step either side of the best offset, ``depth`` times.

    Parameters
    ----------
    predictions : iterable of scores.
    y : true labels, passed to `f1_score` alongside the thresholded scores.
    a, b : bounds of the initial search interval.
    partitions : number of steps the interval is divided into per pass.
    depth : number of refinement passes.

    Returns the midpoint of the final interval.

    Compared to the previous version, candidate offsets are computed as
    ``a + k * epsilon`` instead of by repeated addition, so the upper bound is
    always evaluated and boundary detection does not depend on float equality.
    If no offset yields a positive F1 score, refinement stops instead of
    failing on a missing best offset.
    """
    for _ in range(depth):
        epsilon = (b - a) / partitions
        best_f = 0
        best_k = None

        for k in range(partitions + 1):
            t = a + k * epsilon
            f = f1_score([int(el + t) for el in predictions], y)
            if f > best_f:
                best_f = f
                best_k = k

        if best_k is None:
            # Every candidate scored 0: nothing to refine towards.
            break

        best_t = a + best_k * epsilon
        # Keep a bound unchanged when the optimum sits on it; otherwise move
        # it to one step away from the optimum.
        lower = a if best_k == 0 else best_t - epsilon
        upper = b if best_k == partitions else best_t + epsilon
        a, b = lower, upper
    return (a + b) / 2

## Common features definition

If every feature is used, this can be long.

In [None]:
# COMPUTE_FEATURES: rebuild every feature matrix from scratch (slow) instead of
# loading the .npy files written by a previous run.
# SAVE_FEATURES: when rebuilding, also write the matrices to disk.
COMPUTE_FEATURES = True
SAVE_FEATURES = True

if not COMPUTE_FEATURES:
    # np.save appends ".npy" to the names used when saving, hence ".np.npy".
    training_set = np.load('trs.np.npy')
    testing_set = np.load('tes.np.npy')
    training_set_output = np.load('tso.np.npy')
    testing_set_output = np.load('teo.np.npy')
    training_indexes = np.load('tri.np.npy')
    testing_indexes = np.load('tei.np.npy')
    output_set = np.load('os.np.npy')
else:
    normaliser = StandardScaler()
    scaled_training_features = normaliser.fit_transform(inputify(training_input))

    # Split features, labels and the original id pairs with the same shuffle.
    (training_set,
     testing_set,
     training_set_output,
     testing_set_output,
     training_indexes,
     testing_indexes) = train_test_split(
        scaled_training_features,
        training_output,
        training_input,
        train_size=TRAINING_TEST_RATIO
    )

    # The pairs to predict are scaled with the statistics fitted above.
    output_set = normaliser.transform(inputify(to_predict_input))

    if SAVE_FEATURES:
        for filename, array in (
            ('os.np', output_set),
            ('trs.np', training_set),
            ('tes.np', testing_set),
            ('tso.np', training_set_output),
            ('teo.np', testing_set_output),
            ('tri.np', training_indexes),
            ('tei.np', testing_indexes),
        ):
            np.save(filename, array)

## Sklearn predictions

We create sklearn classifiers.

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Number of training rows given to each classifier.
SIZE = len(training_indexes)


def _fit_and_report(classifier, report_format, started_at):
    """Fit `classifier`, print its held-out accuracy and elapsed time, return it."""
    classifier.fit(training_set[:SIZE], training_set_output[:SIZE])
    print(report_format % (classifier.score(testing_set, testing_set_output),
                           time.time() - started_at))
    return classifier


t = time.time()
l_svc = _fit_and_report(LinearSVC(), "Linear SVC accuracy: %f in %fs", t)

t = time.time()
sgd = _fit_and_report(SGDClassifier(), "SGD accuracy: %f in %fs", t)

t = time.time()
forest = _fit_and_report(RandomForestClassifier(), "Random forest accuracy: %f in %fs", t)

t = time.time()
boost = _fit_and_report(AdaBoostClassifier(), "AdaBoost accuracy: %f in %fs", t)

t = time.time()
gnb = _fit_and_report(GaussianNB(), "Gaussian NB accuracy: %f in %fs", t)

In [None]:
# Random forest with 30 trees, scored on the held-out split.
t = time.time()
forest_2 = RandomForestClassifier(n_estimators=30)
forest_2.fit(training_set[:SIZE], training_set_output[:SIZE])
forest_2_accuracy = forest_2.score(testing_set, testing_set_output)
print("Random forest accuracy: %f in %fs" % (forest_2_accuracy, time.time() - t))


In [None]:
# Random forest with 100 trees, scored on the held-out split.
t = time.time()
forest_3 = RandomForestClassifier(n_estimators=100)
forest_3.fit(training_set[:SIZE], training_set_output[:SIZE])
forest_3_accuracy = forest_3.score(testing_set, testing_set_output)
print("Random forest accuracy: %f in %fs" % (forest_3_accuracy, time.time() - t))


In [None]:
# Tune the decision offset on forest_3's second-column probabilities, then
# display the resulting F1 score on the held-out split.
forest_3_scores = forest_3.predict_proba(testing_set)[:, 1]
t = thresholdify(forest_3_scores, testing_set_output)
pred = [int(t + score) for score in forest_3_scores]
f1_score(pred, testing_set_output)

In [None]:
# Random forest with 400 trees: accuracy first, then tuned-offset F1.
t = time.time()
forest_4 = RandomForestClassifier(n_estimators=400)
forest_4.fit(training_set[:SIZE], training_set_output[:SIZE])
forest_4_accuracy = forest_4.score(testing_set, testing_set_output)
print("Random forest accuracy: %f in %fs" % (forest_4_accuracy, time.time() - t))

forest_4_scores = forest_4.predict_proba(testing_set)[:, 1]
t = thresholdify(forest_4_scores, testing_set_output)
pred = [int(t + score) for score in forest_4_scores]
f1_score(pred, testing_set_output)

In [None]:
# Write a submission from forest_3.
# The offset is tuned on forest_3 itself: the previous code reused `t`, which
# at this point holds the offset tuned for forest_4 when the cells are run in
# order. It also applied the offset a second time to already-binarised values;
# that redundant pass is removed.
submission_threshold = thresholdify(
    forest_3.predict_proba(testing_set)[:, 1],
    testing_set_output
)
o_pred = [int(submission_threshold + el[1])
          for el in forest_3.predict_proba(output_set)]

with open("out.csv", "w+") as f:
    f.write("id,category\n")
    for i, v in enumerate(o_pred):
        f.write("%d,%d\n" % (i, v))

## Keras Composite Linear model

### Utilities

In [None]:
# Longest author list over all nodes; author inputs are padded to this length.
LEN_AUTHORS = max(len(n['authors_int']) for n in nodes)

def lm_content_generator(source, other_features, output, infinite=True):
    """Yield one sample tuple per element of `source`, in lockstep with
    `other_features` and `output`.

    Each tuple is ``(authors_a, authors_b, journal_a, journal_b, features,
    label)`` where the author/journal values come from the nodes referenced by
    the first two items of the `source` element.

    With ``infinite=True`` (the default) the data is cycled forever; otherwise
    the generator stops after one pass.
    """
    while True:
        for pair, features, label in zip(source, other_features, output):
            first = nodes_dict[pair[0]]
            second = nodes_dict[pair[1]]
            yield (
                first['authors_int'],
                second['authors_int'],
                first['journal_int'],
                second['journal_int'],
                features,
                label,
            )
        if not infinite:
            return


def lm_batchify(source, other_features, output, batch_size):
    """Yield ``(inputs, outputs)`` batches for the composite linear model.

    `inputs` is a dict keyed by the Keras input names ('authors_1',
    'authors_2', 'journal_1', 'journal_2', 'other_features'); `outputs` is the
    array of labels. Author rows are right-padded with ``len(author_to_int)``
    up to `LEN_AUTHORS`.

    Samples come from `lm_content_generator`. If that generator is exhausted
    in the middle of a batch, only the rows actually filled are yielded; the
    previous version yielded the full-size buffers, whose trailing rows were
    uninitialised memory.
    """
    generator = lm_content_generator(source, other_features, output)
    padding_index = len(author_to_int)
    n_features = len(other_features[0])

    while True:
        inp = {'authors_1': np.full((batch_size, LEN_AUTHORS), padding_index),
               'authors_2': np.full((batch_size, LEN_AUTHORS), padding_index),
               'journal_1': np.empty((batch_size,)),
               'journal_2': np.empty((batch_size,)),
               'other_features': np.empty((batch_size, n_features)),
               }
        out = np.empty((batch_size))
        filled = 0
        try:
            for i in range(batch_size):
                el = next(generator)
                inp['authors_1'][i][:len(el[0])] = el[0]
                inp['authors_2'][i][:len(el[1])] = el[1]
                inp['journal_1'][i] = el[2]
                inp['journal_2'][i] = el[3]
                inp['other_features'][i] = el[4]
                out[i] = el[5]
                filled = i + 1
        except StopIteration:
            # Source exhausted: emit what was filled (if anything) and stop.
            if filled:
                yield ({name: values[:filled] for name, values in inp.items()},
                       out[:filled])
            return
        yield inp, (out)

### Model definition

In [None]:
# Hyper-parameters of the composite linear model.
NOISE = 0  # not referenced anywhere in this cell
DROPOUT = .4  # dropout rate applied after each hidden dense layer
JOURNAL_DIM = 16  # size of the journal embedding
AUTHORS_DIM = 6  # size of the author embedding and of the LSTM state

# Padded author-index sequences of both papers.
authors_1 = Input(shape=(LEN_AUTHORS,), name='authors_1')
authors_2 = Input(shape=(LEN_AUTHORS,), name='authors_2')

journal_1 = Input(shape=(1,), name='journal_1')  # Journal index of paper 1
journal_2 = Input(shape=(1,), name='journal_2')  # Journal index of paper 2

# One extra row: lm_batchify pads author sequences with len(author_to_int).
embedding_authors = Embedding(input_dim=len(author_to_int) + 1,
                        output_dim=AUTHORS_DIM,
                        input_length=LEN_AUTHORS,
                        trainable=True,
                        )

# The same journal embedding layer is applied to both papers.
embedding_journal = Embedding(input_dim=len(journal_to_int),
                        output_dim=JOURNAL_DIM,
                        input_length=1,
                        trainable=True,
                        )

journal_1_output = Flatten()(embedding_journal(journal_1))
journal_2_output = Flatten()(embedding_journal(journal_2))

# The author embedding is shared, but each paper gets its own LSTM instance.
authors_1_output = LSTM(AUTHORS_DIM)(embedding_authors(authors_1))
authors_2_output = LSTM(AUTHORS_DIM)(embedding_authors(authors_2))

# Pre-computed feature vector, as wide as one row of training_set.
other_features = Input(shape=(len(training_set[0]),), name='other_features')

# Two ELU hidden layers with dropout, then a sigmoid output unit.
dense_input = concatenate_layers([authors_1_output, authors_2_output, journal_1_output, journal_2_output, other_features])
dense_1 = Dropout(DROPOUT)(Dense(256, activation='elu')(dense_input))
dense_2 = Dropout(DROPOUT)(Dense(128, activation='elu')(dense_1))
output = Dense(1, activation='sigmoid', name="output")(dense_2)

model = Model(inputs=[authors_1, authors_2, journal_1, journal_2, other_features], outputs=(output))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy']
              )

# Render the model graph inline.
display(SVG(model_to_dot(model).create(prog='dot', format='svg')))

### Model Training

In [None]:
BATCH_SIZE = 320
N_EPOCHS = 40

# Stop after 3 epochs without improvement of the validation accuracy.
early_stopping = EarlyStopping(
    monitor='val_acc', patience=3, mode='max', verbose=1)

# Keep the best weights on disk, tagged with their validation accuracy.
checkpointer = ModelCheckpoint(
    filepath="dense-{val_acc:.4f}.hdf5",
    monitor='val_acc',
    save_best_only=True,
    verbose=1)

# Cut the learning rate to a third as soon as validation accuracy stalls.
reduce_lr = ReduceLROnPlateau(
    monitor='val_acc', factor=0.33, patience=0, cooldown=1, verbose=True)

training_batches = lm_batchify(
    training_indexes, training_set, training_set_output, BATCH_SIZE)
validation_batches = lm_batchify(
    testing_indexes, testing_set, testing_set_output, BATCH_SIZE)

# One extra step per epoch so that the trailing, incomplete batch is covered.
training_steps = len(training_set) // BATCH_SIZE + 1
validation_steps = len(testing_set) // BATCH_SIZE + 1

model.fit_generator(
    training_batches,
    epochs=N_EPOCHS,
    steps_per_epoch=training_steps,
    validation_data=validation_batches,
    validation_steps=validation_steps,
    callbacks=[checkpointer, reduce_lr, early_stopping],
    use_multiprocessing=False
)

In [None]:
def _ensemble_scores(features, pair_ids):
    """Average, row by row, the last score of forest_3, forest_4 and the Keras model.

    The other classifiers (boost, forest, forest_2, l_svc, sgd, gnb) were
    tried here as well and are left out of the average.
    """
    forest_3_scores = forest_3.predict_proba(features)
    forest_4_scores = forest_4.predict_proba(features)
    # The whole set is fed as a single batch; labels are unused placeholders.
    keras_scores = model.predict_generator(
        lm_batchify(pair_ids, features, [0] * len(pair_ids), len(pair_ids)),
        steps=1
    )
    averaged = []
    for member_scores in zip(forest_3_scores, forest_4_scores, keras_scores):
        averaged.append(
            sum([scores[-1] for scores in member_scores]) / len(member_scores))
    return averaged


# Offset tuned for the Keras model alone, on the held-out split.
threshold = thresholdify(
    model.predict_generator(
        lm_batchify(
            testing_indexes,
            testing_set,
            testing_set_output,
            len(testing_indexes)
        ),
        steps=1
    ),
    testing_set_output
)

# Offset tuned for the ensemble, with its F1 score on the held-out split.
augmented_testing_set = _ensemble_scores(testing_set, testing_indexes)
t = thresholdify(augmented_testing_set, testing_set_output)
print(f1_score([int(el + t) for el in augmented_testing_set], testing_set_output))

# Ensemble scores for the pairs to predict, written as the submission file.
augmented_output_set = _ensemble_scores(output_set, to_predict_input)

with open("out.csv", "w+") as f:
    f.write("id,category\n")
    for i, v in enumerate([int(el + t) for el in augmented_output_set]):
        f.write("%d,%d\n" % (i, v))

## Final model - output

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

def _ensemble_scores(features, pair_ids):
    """Average, row by row, the last score of forest_3, forest_4 and the Keras model.

    The other classifiers (boost, forest, forest_2, l_svc, sgd, gnb) were
    tried here as well and are left out of the average.
    """
    forest_3_scores = forest_3.predict_proba(features)
    forest_4_scores = forest_4.predict_proba(features)
    # The whole set is fed as a single batch; labels are unused placeholders.
    keras_scores = model.predict_generator(
        lm_batchify(pair_ids, features, [0] * len(pair_ids), len(pair_ids)),
        steps=1
    )
    averaged = []
    for member_scores in zip(forest_3_scores, forest_4_scores, keras_scores):
        averaged.append(
            sum([scores[-1] for scores in member_scores]) / len(member_scores))
    return averaged


# Tune the ensemble offset on the held-out split and report its F1 score.
augmented_testing_set = _ensemble_scores(testing_set, testing_indexes)
t = thresholdify(augmented_testing_set, testing_set_output)
print(f1_score([int(el+t) for el in augmented_testing_set], testing_set_output))

# Score the pairs to predict and write the final submission.
augmented_output_set = _ensemble_scores(output_set, to_predict_input)

with open("out.csv", "w+") as f:
    f.write("id,category\n")
    for i, v in enumerate([int(el + t) for el in augmented_output_set]):
        f.write("%d,%d\n" % (i, v))

## CNN

This section is not to be used for the final output.

In [1]:
# stopping notebook run: this assertion always fails, so executing the cells
# in order halts here, before the CNN section that is not used for the final
# output.
assert 1 == 0

AssertionError: 

### Utilities

Two functions:

- `cnn_content_generator`: yields inputs and outputs, line by line
- `cnn_batchify`: keras-ready generator creating batches using `cnn_content_generator`

In [None]:
# Longest author list over all nodes; author inputs are padded to this length.
LEN_AUTHORS = max(len(n['authors_int']) for n in nodes)

def cnn_content_generator(source, other_features, output, infinite=True):
    """Yield one sample tuple per element of `source`, in lockstep with
    `other_features` and `output`.

    Each tuple is ``(authors_a, authors_b, journal_a, journal_b, title_a,
    title_b, abstract_a, abstract_b, features, label)``; the node-level values
    come from the nodes referenced by the first two items of the `source`
    element.

    With ``infinite=True`` (the default) the data is cycled forever; otherwise
    the generator stops after one pass.
    """
    node_fields = ('authors_int', 'journal_int', 'title_int', 'abstract_int')
    while True:
        for pair, features, label in zip(source, other_features, output):
            first = nodes_dict[pair[0]]
            second = nodes_dict[pair[1]]
            sample = []
            # Interleave the fields: a's value, then b's, for each field.
            for field in node_fields:
                sample.append(first[field])
                sample.append(second[field])
            sample.append(features)
            sample.append(label)
            yield tuple(sample)
        if not infinite:
            return


def cnn_batchify(source, other_features, output, batch_size):
    """Yield ``(inputs, outputs)`` batches for the CNN model.

    `inputs` is a dict keyed by the Keras input names; `outputs` maps both
    output layers ('output' and 'cnn_output') to the same label array. Author
    rows are right-padded with ``len(author_to_int)`` up to `LEN_AUTHORS`.

    Fixes over the previous version:

    * When the sample generator was exhausted, the final yield built
      ``{out, out}`` -- a *set* of arrays, which raises TypeError. It now
      yields the same output dict as regular batches.
    * That final batch is truncated to the rows actually filled instead of
      exposing uninitialised buffer rows.
    * The width of 'other_features' is taken from the `other_features`
      argument rather than from the global `training_set`, as in
      `lm_batchify`.
    """
    generator = cnn_content_generator(source, other_features, output)
    padding_index = len(author_to_int)
    n_features = len(other_features[0])
    # Input name for each position of the sample tuple (label excluded).
    plain_inputs = ((2, 'journal_1'), (3, 'journal_2'),
                    (4, 'titles_1'), (5, 'titles_2'),
                    (6, 'abstracts_1'), (7, 'abstracts_2'),
                    (8, 'other_features'))

    while True:
        inp = {'authors_1': np.full((batch_size, LEN_AUTHORS), padding_index),
               'authors_2': np.full((batch_size, LEN_AUTHORS), padding_index),
               'journal_1': np.empty((batch_size,)),
               'journal_2': np.empty((batch_size,)),
               'titles_1': np.empty((batch_size, LEN_TITLE)),
               'titles_2': np.empty((batch_size, LEN_TITLE)),
               'abstracts_1': np.empty((batch_size, LEN_ABSTRACT)),
               'abstracts_2': np.empty((batch_size, LEN_ABSTRACT)),
               'other_features': np.empty((batch_size, n_features)),
               }
        out = np.empty((batch_size))
        filled = 0
        try:
            for i in range(batch_size):
                el = next(generator)
                inp['authors_1'][i][:len(el[0])] = el[0]
                inp['authors_2'][i][:len(el[1])] = el[1]
                for position, name in plain_inputs:
                    inp[name][i] = el[position]
                out[i] = el[9]
                filled = i + 1
        except StopIteration:
            # Source exhausted: emit what was filled (if anything) and stop.
            if filled:
                partial = out[:filled]
                yield ({name: values[:filled] for name, values in inp.items()},
                       {'output': partial, 'cnn_output': partial})
            return
        yield inp, {'output': out, 'cnn_output': out}

### Model definition

In [None]:
# Hyper-parameters of the CNN model.
T_FEATURES = 96  # number of filters of each title convolution
A_FEATURES = 96  # number of filters of each abstract convolution
DROPOUT = 0.25  # dropout rate used inside the convolutional towers
TRAIN_EMBEDDING = False  # every embedding layer below is frozen
JOURNAL_DIM = 8  # size of the journal embedding
AUTHORS_DIM = 6  # size of the author embedding and of the LSTM state

# Padded author-index sequences of both papers.
authors_1 = Input(shape=(LEN_AUTHORS,), name='authors_1')
authors_2 = Input(shape=(LEN_AUTHORS,), name='authors_2')

journal_1 = Input(shape=(1,), name='journal_1')  # Journal index of paper 1
journal_2 = Input(shape=(1,), name='journal_2')  # Journal index of paper 2

# Title sequences of both papers, LEN_TITLE entries each.
title_1 = Input(shape=(LEN_TITLE,), name='titles_1')
title_2 = Input(shape=(LEN_TITLE,), name='titles_2')

abstract_1 = Input(shape=(LEN_ABSTRACT,), name='abstracts_1')  # Abstract 1
abstract_2 = Input(shape=(LEN_ABSTRACT,), name='abstracts_2')  # Abstract 2

# Pre-computed feature vector, as wide as one row of training_set.
other_features = Input(shape=(len(training_set[0]),), name='other_features')

# One extra row: cnn_batchify pads author sequences with len(author_to_int).
embedding_authors = Embedding(input_dim=len(author_to_int) + 1,
                        output_dim=AUTHORS_DIM,
                        input_length=LEN_AUTHORS,
                        trainable=TRAIN_EMBEDDING,
                        )

embedding_journal = Embedding(input_dim=len(journal_to_int),
                        output_dim=JOURNAL_DIM,
                        input_length=1,
                        trainable=TRAIN_EMBEDDING,
                        )

# Word embeddings initialised from the `embeddings` matrix.
# NOTE(review): presumably word2vec vectors, given W2V_VEC_DIM -- confirm.
embedding_abstract = Embedding(input_dim=len(word_to_int) + 1,
                        output_dim=W2V_VEC_DIM,
                        weights=[embeddings],
                        input_length=LEN_ABSTRACT,
                        trainable=TRAIN_EMBEDDING,
                        )

embedding_title = Embedding(input_dim=len(word_to_int) + 1,
                        output_dim=W2V_VEC_DIM,
                        weights=[embeddings],
                        input_length=LEN_TITLE,
                        trainable=TRAIN_EMBEDDING,
                        )

journal_1_output = Flatten()(embedding_journal(journal_1))
journal_2_output = Flatten()(embedding_journal(journal_2))

# The author embedding is shared, but each paper gets its own LSTM instance.
authors_1_output = LSTM(AUTHORS_DIM)(embedding_authors(authors_1))
authors_2_output = LSTM(AUTHORS_DIM)(embedding_authors(authors_2))

# Two stacked convolutions per text input; none of them share weights.
# Title convolutions use a kernel of width 3.
title_1_cnn_1 = Conv1D(T_FEATURES, 3,
               padding='same',
               strides=1,
               activation='elu',
               input_shape=(LEN_TITLE, W2V_VEC_DIM)
               )
title_1_cnn_2 = Conv1D(T_FEATURES, 3,
               strides=1,
               padding='same',
               activation='elu',
               input_shape=(LEN_TITLE, W2V_VEC_DIM)
               )

title_2_cnn_1 = Conv1D(T_FEATURES, 3,
               padding='same',
               strides=1,
               activation='elu',
               input_shape=(LEN_TITLE, W2V_VEC_DIM)
               )
title_2_cnn_2 = Conv1D(T_FEATURES, 3,
               strides=1,
               padding='same',
               activation='elu',
               input_shape=(LEN_TITLE, W2V_VEC_DIM)
               )

# NOTE(review): the first abstract uses kernels of width 7 while the second
# uses width 5 -- confirm this asymmetry is intended.
abstract_1_cnn_1 = Conv1D(A_FEATURES, 7,
                   strides=1,  # 1
                   padding='same',
                   activation='elu',
                   input_shape=(LEN_ABSTRACT, W2V_VEC_DIM)
                   )
abstract_1_cnn_2 = Conv1D(A_FEATURES, 7,
                   strides=1,  # 1
                   padding='same',
                   activation='elu',
                   input_shape=(LEN_ABSTRACT, W2V_VEC_DIM)
                   )

abstract_2_cnn_1 = Conv1D(A_FEATURES, 5,
                   strides=1,  # 1
                   padding='same',
                   activation='elu',
                   input_shape=(LEN_ABSTRACT, W2V_VEC_DIM)
                   )
abstract_2_cnn_2 = Conv1D(A_FEATURES, 5,
                   strides=1,  # 1
                   padding='same',
                   activation='elu',
                   input_shape=(LEN_ABSTRACT, W2V_VEC_DIM)
                   )

# Each tower reads, innermost call first:
# embedding -> conv 1 -> dropout -> max pooling -> conv 2 -> dropout
# -> global max pooling.
title_1_output = GlobalMaxPool1D()(
    Dropout(DROPOUT)(
        title_1_cnn_2(
            MaxPooling1D()(
                Dropout(DROPOUT)(
                    title_1_cnn_1(
                        embedding_title(title_1)
))))))
                                   
title_2_output = GlobalMaxPool1D()(
    Dropout(DROPOUT)(
        title_2_cnn_2(
            MaxPooling1D()(
                Dropout(DROPOUT)(
                    title_2_cnn_1(
                        embedding_title(title_2)
))))))
                                   
abstract_1_output = GlobalMaxPool1D()(
    Dropout(DROPOUT)(
        abstract_1_cnn_2(
            MaxPooling1D()(
                Dropout(DROPOUT)(
                    abstract_1_cnn_1(
                        embedding_abstract(abstract_1)
))))))
                                      
abstract_2_output = GlobalMaxPool1D()(
    Dropout(DROPOUT)(
        abstract_2_cnn_2(
            MaxPooling1D()(
                Dropout(DROPOUT)(
                    abstract_2_cnn_1(
                        embedding_abstract(abstract_2)
))))))

# Main head: every encoder output plus the pre-computed features.
dense_input = concatenate_layers([authors_1_output,
                                  authors_2_output,
                                  journal_1_output,
                                  journal_2_output,
                                  title_1_output, 
                                  title_2_output, 
                                  abstract_1_output, 
                                  abstract_2_output, 
                                  other_features])

dense_1 = Dropout(.40)(Dense(256, activation='elu')(dense_input))
dense_2 = Dropout(.40)(Dense(128, activation='elu')(dense_1))
output = Dense(1, activation='sigmoid', name="output")(dense_2)

# Second head fed by the four text towers only.
cnn_input = concatenate_layers([title_1_output, 
                                  title_2_output, 
                                  abstract_1_output, 
                                  abstract_2_output])

cnn_1 = Dense(128, activation='elu')(cnn_input)
cnn_output = Dense(1, activation='sigmoid', name='cnn_output')(cnn_1)


# Two-output model: 'output' (all inputs) and 'cnn_output' (text towers only).
model = Model(inputs=[authors_1, 
                      authors_2, 
                      journal_1, 
                      journal_2, 
                      title_1, 
                      title_2, 
                      abstract_1, 
                      abstract_2, 
                      other_features], outputs=(output, cnn_output))

# The same loss and metric are given for the whole model, i.e. both outputs.
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy']
              )

# Render the model graph inline, with tensor shapes.
display(SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg')))

### Model training

In [None]:
BATCH_SIZE = 128
N_EPOCHS = 20

# All callbacks follow the validation accuracy of the main 'output' head.
early_stopping = EarlyStopping(
    monitor='val_output_acc', patience=3, mode='max', verbose=1)

checkpointer = ModelCheckpoint(
    filepath="cnn-{val_output_acc:.4f}.hdf5",
    monitor='val_output_acc',
    save_best_only=True,
    verbose=1)

reduce_lr = ReduceLROnPlateau(
    monitor='val_output_acc', factor=0.33, patience=1, cooldown=1, verbose=True)

training_batches = cnn_batchify(
    training_indexes, training_set, training_set_output, BATCH_SIZE)
validation_batches = cnn_batchify(
    testing_indexes, testing_set, testing_set_output, BATCH_SIZE)

# One extra step per epoch so that the trailing, incomplete batch is covered.
training_steps = len(training_set) // BATCH_SIZE + 1
validation_steps = len(testing_set) // BATCH_SIZE + 1

model.fit_generator(
    training_batches,
    epochs=N_EPOCHS,
    steps_per_epoch=training_steps,
    validation_data=validation_batches,
    validation_steps=validation_steps,
    callbacks=[early_stopping, checkpointer, reduce_lr],
    use_multiprocessing=False
)

### Output generation

In [None]:
def inputify_gen(source: List[List], out, batch_size=512):
    """Endlessly yield normalised feature batches together with their labels.

    Each pass walks `source` in order. The first two items of every element
    are looked up in the global `nodes_dict`, the global `entry` helper
    turns the two nodes into one feature row, and rows are grouped into
    batches of `batch_size` that are scaled with the global `normaliser`.
    When `len(source)` is not a multiple of `batch_size`, a shorter final
    batch is yielded. The walk then restarts from the beginning, so the
    generator never terminates on its own.

    Args:
        source: Sequence of node pairs; `el[0]` and `el[1]` must be keys of
            `nodes_dict`.
        out: Sliceable sequence of labels aligned index-by-index with
            `source`.
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        Tuples `(features, labels)`: the result of `normaliser.transform`
        on the rows of the batch, and the slice of `out` covering exactly
        the same samples.

    Raises:
        IndexError: On the first `next()` call if `source` is an empty
            list (the feature width is probed from `source[0]`).
    """
    # The feature width is probed once by pairing the first node with
    # itself; only the length of the resulting row is used.
    probe_node = nodes_dict[source[0][0]]
    n_features = len(list(entry(probe_node, probe_node)))

    while True:
        # The buffer is reused for every batch of the pass. This relies on
        # `normaliser.transform` returning a new array rather than a view
        # (StandardScaler copies by default).
        features = np.empty((batch_size, n_features))
        batch_start = 0  # index in `source`/`out` of the first buffered row
        filled = 0       # number of valid rows currently in the buffer

        for i, el in enumerate(source):
            features[filled] = entry(nodes_dict[el[0]], nodes_dict[el[1]])
            filled += 1
            # Flush only once the buffer is full, *after* storing row i, so
            # that rows and labels always describe the same samples.
            if filled == batch_size:
                yield normaliser.transform(features), out[batch_start:i + 1]
                batch_start = i + 1
                filled = 0

        # Remaining samples of the pass; skipped when the last batch was full
        # so that no empty batch is ever produced.
        if filled:
            yield (normaliser.transform(features[:filled]),
                   out[batch_start:batch_start + filled])


# Fit the feature scaler on the rows produced by `inputify` for the first
# 10000 training pairs only.
normaliser = StandardScaler()
sample = list(inputify(training_input[:10000]))
normaliser.fit(sample)

# Split point: the first 90% of the pairs are used for training, the
# remainder for validation.
i = int(len(training_input) * .9)

# NOTE(review): `inputify_gen` yields one feature matrix per batch; confirm
# that `model` at this point accepts that input layout.
model.fit_generator(
    inputify_gen(training_input[:i], training_output[:i], 512),
    steps_per_epoch=i // 512,
    validation_data=inputify_gen(training_input[i:], training_output[i:], 512),
    validation_steps=(len(training_input) - i) // 512,
    epochs=N_EPOCHS,
    callbacks=[early_stopping, checkpointer, reduce_lr],
)

In [None]:
# Sample generator over the held-out part of the training data.
gen = content_generator(
    training_input[len(train_test_i):], test_test_i, test_test_o,
)

# One pre-allocated array per named model input, with one row per held-out
# sample. The width of 'other_features' is read from the first row of
# `train_test_i`.
inp = {
    name: np.empty((len(test_test_i), width))
    for name, width in (
        ('titles_1', LEN_TITLE),
        ('titles_2', LEN_TITLE),
        ('abstracts_1', LEN_ABSTRACT),
        ('abstracts_2', LEN_ABSTRACT),
        ('other_features', len(train_test_i[0])),
    )
}

# Zipping with `test_test_i` caps the number of items drawn from `gen` at
# one per held-out sample. Position 4 of each generated item is not used.
for i, (_, el) in enumerate(zip(test_test_i, gen)):
    inp['titles_1'][i], inp['titles_2'][i] = el[0], el[1]
    inp['abstracts_1'][i], inp['abstracts_2'][i] = el[2], el[3]
    inp['other_features'][i] = el[5]

predictions = model.predict(inp)

# First model output: pick a decision threshold, then report the mean F1.
threshold = thresholdify(predictions[0], test_test_o)
f1 = mean_f1(predictions[0], test_test_o, threshold)
print("Mean F1 : %f\nThreshold : %f" % (f1, threshold))

# Same report for the second model output.
threshold = thresholdify(predictions[1], test_test_o)
f1 = mean_f1(predictions[1], test_test_o, threshold)
print("Mean F1 : %f\nThreshold : %f" % (f1, threshold))

In [None]:
# Build and scale the features of the pairs to predict. The result of
# `inputify` is materialised with `list(...)` before being handed to the
# scaler, exactly as is done where the scaler is fitted.
testing_set = normaliser.transform(list(inputify(testing_input)))

# One pre-allocated array per named model input, one row per pair to
# predict. The width of 'other_features' is read from the first row of
# `train_test_i`.
inp = {'titles_1': np.empty((len(testing_set), LEN_TITLE)),
       'titles_2': np.empty((len(testing_set), LEN_TITLE)),
       'abstracts_1': np.empty((len(testing_set), LEN_ABSTRACT)),
       'abstracts_2': np.empty((len(testing_set), LEN_ABSTRACT)),
       'other_features': np.empty((len(testing_set), len(train_test_i[0]))),
       }

# No labels exist for the testing set, so placeholder outputs are passed.
# There must be one placeholder per testing sample; sizing the list from the
# held-out split (`test_test_o`) would mismatch the number of samples.
gen = content_generator(
    testing_input,
    testing_set,
    list(range(len(testing_set)))
)

# Zipping with `testing_set` caps the number of items drawn from `gen` at
# one per sample. Position 4 of each generated item is not used.
for i, (_, el) in enumerate(zip(testing_set, gen)):
    inp['titles_1'][i] = el[0]
    inp['titles_2'][i] = el[1]
    inp['abstracts_1'][i] = el[2]
    inp['abstracts_2'][i] = el[3]
    inp['other_features'][i] = el[5]

# Round the first model output at 0.5 (add .5, truncate). The predictions
# are flattened first so that no size-1 array is converted to a scalar,
# which recent NumPy versions deprecate.
testing_output = (np.ravel(model.predict(inp)[0]) + .5).astype(int).tolist()

# Write the submission file: a header, then one "id,category" line per pair.
with open("out.csv", "w") as f:
    f.write("id,category\n")
    for i, v in enumerate(testing_output):
        f.write("%d,%d\n" % (i, v))