## Imports

In [None]:

import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics import f1_score
from sklearn import preprocessing
import nltk
import csv
import re
from pprint import pprint

import networkx as nx

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore


In [None]:

# Fetch the NLTK resources used by this notebook.
nltk.download('punkt')      # tokenizer models
nltk.download('stopwords')  # stopword lists

# English stopwords extended with a few extra tokens to ignore.
stpwds = set(nltk.corpus.stopwords.words("english")) | {'from', 'subject', 're', 'edu', 'use'}

# Porter stemmer, used later when comparing title words.
stemmer = nltk.stem.PorterStemmer()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip install igraph



In [None]:
import igraph

## Google Drive connection

In [None]:
import os
import glob
from google.colab import drive

# Connect Google Drive to the notebook; its content appears under /content/drive.
# The result of mount() is intentionally not assigned to `drive`: doing so
# would replace the imported module with the call's return value.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder holding the input files and the saved features/predictions.
# The path is relative, so it resolves against the current working directory
# NOTE(review): presumably /content on Colab, next to the mount point; confirm.
project_folder = "drive/My Drive/MLNS"

## Data loading

In [None]:

# Every line is parsed as a CSV row; only its first field is kept and split on
# single spaces, giving one list of string fields per line.
with open(project_folder + "/training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set = [row[0].split(" ") for row in reader]


In [None]:

# Every line is parsed as a CSV row; only its first field is kept and split on
# single spaces, giving one list of string fields per line.
with open(project_folder + "/testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set = [row[0].split(" ") for row in reader]


In [None]:
# Preview the first five entries of the testing set.
testing_set[:5]

[['9807076', '9807139'],
 ['109162', '1182'],
 ['9702187', '9510135'],
 ['111048', '110115'],
 ['9910176', '9410073']]

In [None]:

# Read the node information file as CSV, one list of string fields per row.
with open(project_folder + "/node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info = [row for row in reader]

# The first field of every row is collected as that row's ID.
IDs = [row[0] for row in node_info]


In [None]:
# Show the raw fields of the first node_info row.
node_info[0]

['1001',
 '2000',
 'compactification geometry and duality',
 'Paul S. Aspinwall',
 '',
 'these are notes based on lectures given at tasi99 we review the geometry of the moduli space of n 2 theories in four dimensions from the point of view of superstring compactification the cases of a type iia or type iib string compactified on a calabi-yau threefold and the heterotic string compactified on k3xt2 are each considered in detail we pay specific attention to the differences between n 2 theories and n 2 theories the moduli spaces of vector multiplets and the moduli spaces of hypermultiplets are reviewed in the case of hypermultiplets this review is limited by the poor state of our current understanding some peculiarities such as mixed instantons and the non-existence of a universal hypermultiplet are discussed']

In [None]:

training_set_percentage = 0.2
# Fixed seed so the same subset is drawn on every run (keeps the saved
# features and the reported scores reproducible).
sampling_seed = 42

# Randomly select a percentage of the training set, without replacement.
# A dedicated Random instance is used so the global random state is untouched.
n_to_keep = int(round(len(training_set) * training_set_percentage))
to_keep = random.Random(sampling_seed).sample(range(len(training_set)), k=n_to_keep)
training_set_reduced = [training_set[i] for i in to_keep]


## Data visualization

In [None]:

# Tabular view of node_info: the column names below are assigned positionally
# to the fields of each row.
node_df = pd.DataFrame(
    node_info, 
    columns=["id", "publication_year", "title", 
             "authors", "journal_name", "abstract"]
)

# Display the first rows as a sanity check.
node_df.head()


Unnamed: 0,id,publication_year,title,authors,journal_name,abstract
0,1001,2000,compactification geometry and duality,Paul S. Aspinwall,,these are notes based on lectures given at tas...
1,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
2,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
3,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
4,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...


## Preprocessing

### Graph generation

In [None]:

# Undirected graph: one node per known ID, one edge per training pair whose
# third field is the string "1".
nodes = IDs
edges = [(pair[0], pair[1]) for pair in training_set if pair[2] == "1"]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)


In [None]:

# Degree centrality of every node of G; later indexed by node ID.
deg_centrality = nx.degree_centrality(G)

# Betweenness centrality is currently disabled: the computation is commented
# out and the variable is set to None, which makes preprocessing_graph skip the
# betweenness-based feature.
# betweeness_centrality = nx.betweenness_centrality(G)
betweeness_centrality = None


### Topic modeling

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize sentences with gensim's simple_preprocess.

    Each element of `sentences` is converted with str() and tokenized; one list
    of tokens is yielded per element. deacc=True asks simple_preprocess to
    strip accent marks from the tokens.
    """
    yield from map(lambda sentence: simple_preprocess(str(sentence), deacc=True), sentences)

def remove_stopwords(texts):
    """Tokenize every document and drop the tokens found in `stpwds`.

    Each element of `texts` goes through str() and simple_preprocess before
    filtering, so an element may be a raw string or an existing token list.
    Returns a list containing one list of kept tokens per document.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([token for token in tokens if token not in stpwds])
    return filtered_docs


In [None]:

def topic_modeling(text_df, num_topics=5, random_state=None):
    """Fit an LDA topic model on a collection of texts and return per-text topics.

    Parameters
    ----------
    text_df : pandas.Series of str
        One text per entry (it must support ``.map`` and ``.to_list``).
    num_topics : int, default 5
        Number of LDA topics to fit.
    random_state : int or None, default None
        Forwarded to ``LdaMulticore``; pass a fixed value to make the fitted
        topics reproducible between runs.

    Returns
    -------
    list
        Entry ``i`` is the model output for the ``i``-th text: a list of
        ``(topic_id, probability)`` pairs.

    Side effects
    ------------
    Pretty-prints the fitted topics.
    """
    # Remove punctuation (raw string: "\." is not a valid Python string escape)
    preprocessed_texts = text_df.map(lambda x: re.sub(r'[,\.!?]', '', x))
    # Convert the texts to lowercase
    preprocessed_texts = preprocessed_texts.map(lambda x: x.lower())

    # Convert to words, then remove stop words
    texts_words = list(sent_to_words(preprocessed_texts.to_list()))
    texts_words = remove_stopwords(texts_words)

    # Create the dictionary and the bag-of-words corpus (term document frequency)
    id2word = Dictionary(texts_words)
    corpus = [id2word.doc2bow(text) for text in texts_words]

    # Build LDA model
    lda_model = LdaMulticore(corpus=corpus, id2word=id2word,
                             num_topics=num_topics, random_state=random_state)

    pprint(lda_model.print_topics())

    # lda_model[corpus] is a lazy wrapper that re-runs inference each time a
    # document is accessed. Materialise it once so that later lookups by index
    # are plain list accesses returning a stable result.
    lda_topics = list(lda_model[corpus])

    return lda_topics


In [None]:

# Topic model over the paper titles (5 topics); one entry per row of node_df.
title_lda = topic_modeling(node_df["title"], num_topics=5)


[(0,
  '0.060*"theory" + 0.021*"yang" + 0.021*"mills" + 0.017*"gauge" + 0.017*"ads" '
  '+ 0.017*"string" + 0.017*"branes" + 0.012*"theories" + 0.011*"duality" + '
  '0.011*"supersymmetric"'),
 (1,
  '0.032*"dimensional" + 0.023*"black" + 0.017*"model" + 0.017*"two" + '
  '0.015*"quantum" + 0.015*"string" + 0.013*"holes" + 0.010*"theory" + '
  '0.010*"non" + 0.009*"hole"'),
 (2,
  '0.021*"field" + 0.019*"brane" + 0.016*"models" + 0.015*"non" + '
  '0.014*"quantum" + 0.014*"theory" + 0.012*"supersymmetric" + 0.011*"gravity" '
  '+ 0.009*"symmetry" + 0.007*"gauge"'),
 (3,
  '0.030*"theory" + 0.023*"gauge" + 0.023*"field" + 0.022*"theories" + '
  '0.020*"quantum" + 0.016*"string" + 0.011*"action" + 0.011*"space" + '
  '0.011*"effective" + 0.010*"model"'),
 (4,
  '0.021*"string" + 0.021*"field" + 0.017*"theory" + 0.014*"branes" + '
  '0.013*"theories" + 0.012*"conformal" + 0.012*"model" + 0.011*"quantum" + '
  '0.010*"gravity" + 0.010*"gauge"')]


In [None]:

# Topic model over the paper abstracts (10 topics); one entry per row of node_df.
abstract_lda = topic_modeling(node_df["abstract"], num_topics=10)


[(0,
  '0.023*"theory" + 0.014*"dimensional" + 0.013*"string" + 0.011*"gauge" + '
  '0.009*"type" + 0.009*"theories" + 0.009*"field" + 0.008*"two" + '
  '0.008*"model" + 0.007*"branes"'),
 (1,
  '0.022*"theory" + 0.011*"field" + 0.009*"gauge" + 0.009*"one" + 0.007*"yang" '
  '+ 0.006*"mills" + 0.006*"loop" + 0.006*"non" + 0.006*"order" + 0.006*"two"'),
 (2,
  '0.025*"theory" + 0.013*"field" + 0.013*"string" + 0.010*"quantum" + '
  '0.007*"space" + 0.007*"dimensional" + 0.006*"non" + 0.006*"one" + '
  '0.005*"energy" + 0.005*"algebra"'),
 (3,
  '0.021*"theory" + 0.007*"non" + 0.007*"theories" + 0.007*"action" + '
  '0.006*"string" + 0.006*"dimensional" + 0.006*"field" + 0.006*"models" + '
  '0.006*"gauge" + 0.006*"solutions"'),
 (4,
  '0.012*"model" + 0.010*"black" + 0.009*"brane" + 0.009*"quantum" + '
  '0.008*"two" + 0.008*"dimensional" + 0.008*"ads" + 0.008*"field" + '
  '0.007*"theory" + 0.007*"hole"'),
 (5,
  '0.016*"theory" + 0.012*"string" + 0.012*"field" + 0.011*"gauge" + '
  '0

### TF-IDF vectorization

In [None]:

# compute TFIDF vector of each paper
corpus = [element[5] for element in node_info]
vectorizer = TfidfVectorizer(stop_words="english")
# each row is a node in the order of node_info
features_TFIDF = vectorizer.fit_transform(corpus)
features_TFIDF_names = vectorizer.get_feature_names_out()


In [None]:
# Inspect the TF-IDF rows of the first five papers (shape and sparsity).
features_TFIDF[:5]

<5x25043 sparse matrix of type '<class 'numpy.float64'>'
	with 228 stored elements in Compressed Sparse Row format>

In [None]:

def tfidf_top_words(features_TFIDF, vectorizer, id, k=30):
    """Return the (at most) k terms with the highest TF-IDF weight in one document.

    Parameters
    ----------
    features_TFIDF : scipy sparse matrix
        TF-IDF matrix, one row per document and one column per term.
    vectorizer : object exposing ``get_feature_names_out()``
        The vectorizer that produced ``features_TFIDF``; it provides the term
        associated with each column.
    id : int
        Row index of the document.
    k : int, default 30
        Maximum number of terms to return.

    Returns
    -------
    list of str
        Terms sorted by decreasing weight. Only terms with a strictly positive
        weight are returned, so a document with fewer than k distinct terms
        yields a shorter list instead of being padded with zero-weight words
        (which would create spurious overlaps between short documents).
    """
    # Column -> term mapping, taken from the vectorizer that was passed in
    # rather than from a module-level array. It is memoised (one slot) because
    # this function is called once per document and building the array of
    # names may be costly.
    cached = _TFIDF_NAMES_CACHE.get("entry")
    if (cached is None or cached[0] is not vectorizer
            or len(cached[1]) != features_TFIDF.shape[1]):
        cached = (vectorizer, np.asarray(vectorizer.get_feature_names_out()))
        _TFIDF_NAMES_CACHE["entry"] = cached
    names = cached[1]

    # Work on the stored entries of the sparse row only; no dense copy needed.
    row = features_TFIDF[id].tocsr()
    weights, columns = row.data, row.indices
    positive = weights > 0  # ignore explicitly stored zeros, if any
    weights, columns = weights[positive], columns[positive]

    # Stable sort keeps the result deterministic when weights are tied.
    order = np.argsort(-weights, kind="stable")[:k]
    return list(names[columns[order]])


# One-slot memo used by tfidf_top_words: {"entry": (vectorizer, feature_names)}.
_TFIDF_NAMES_CACHE = {}


### Feature computation

In [None]:

def preprocessing_graph(graph, ds, node_df,
                        deg_centrality, betweeness_centrality=None,
                        train=True):
    """Build the scaled feature matrix (and the labels) for a list of node pairs.

    Besides its arguments, the function reads these notebook-level objects:
    ``IDs``, ``features_TFIDF``, ``vectorizer``, ``title_lda``,
    ``abstract_lda``, ``stpwds`` and ``stemmer``.

    Parameters
    ----------
    graph : networkx graph
        Graph on which the topological features are computed.
    ds : sequence
        Pairs to featurise: ``(source, target, label)`` triples when ``train``
        is true, ``(source, target)`` pairs otherwise.
    node_df : pandas.DataFrame
        Node attributes with the columns ``id``, ``publication_year``,
        ``title``, ``authors``, ``journal_name``.
    deg_centrality : mapping
        Degree centrality of each node ID.
    betweeness_centrality : mapping or None
        Betweenness centrality of each node ID; when None the ``diff_bt``
        column is omitted.
    train : bool
        Whether ``ds`` carries labels.

    Returns
    -------
    features : numpy.ndarray
        One row per pair, standardised column-wise with ``preprocessing.scale``.
        Column order: source degree centrality, target degree centrality,
        [betweenness difference], preferential attachment, Adamic-Adar index,
        Jaccard coefficient, number of common neighbours, title word overlap,
        publication year difference, number of common authors, number of
        common top TF-IDF words, abstract cosine similarity, same journal flag,
        same dominant title topic flag, number of common abstract topics.
    labels : numpy.ndarray
        Integer labels; only returned when ``train`` is true.

    Notes
    -----
    NOTE(review): scaling uses the statistics of the pairs passed to this
    call, so training and testing matrices are standardised independently of
    each other; confirm this is intended.
    NOTE(review): if ``graph`` was built from the very pairs being featurised,
    each positive pair's own edge contributes to its degree-based features;
    confirm this is intended.
    """
    # Position of the first occurrence of every ID, built once. This replaces
    # a linear scan of IDs / node_df for each pair.
    tfidf_row_of = {}
    for position, node_id in enumerate(IDs):
        tfidf_row_of.setdefault(node_id, position)
    df_row_of = {}
    for position, node_id in enumerate(node_df["id"]):
        df_row_of.setdefault(node_id, position)

    # Per-node data does not depend on the pair, so it is computed once per node.
    profiles = {}

    def profile_of(node_id):
        cached = profiles.get(node_id)
        if cached is not None:
            return cached
        index = tfidf_row_of[node_id]
        info = node_df.iloc[df_row_of[node_id]]
        profile = {
            "index": index,
            "title": _stemmed_title_tokens(info["title"]),
            "authors": _author_set(info["authors"]),
            "year": int(info["publication_year"]),
            "journal": info["journal_name"].lower().strip(),
            "top_words": set(tfidf_top_words(features_TFIDF, vectorizer, index)),
            "title_topic": _dominant_topic(title_lda[index]),
            "abstract_topics": {topic for topic, _score in abstract_lda[index]},
        }
        profiles[node_id] = profile
        return profile

    use_betweenness = betweeness_centrality is not None
    n_features = 15 if use_betweenness else 14

    rows, labels = [], []
    for counter, edge in enumerate(ds, start=1):
        if train:
            source, target, label = edge
            labels.append(int(label))
        else:
            source, target = edge

        src, tgt = profile_of(source), profile_of(target)
        pair = [(source, target)]

        # Degree centrality of both endpoints
        row = [deg_centrality[source], deg_centrality[target]]
        if use_betweenness:
            # Betweenness centrality difference
            row.append(betweeness_centrality[target] - betweeness_centrality[source])

        row += [
            # Preferential attachment, Adamic-Adar, Jaccard: each call yields
            # a single (u, v, score) triple for the single pair given.
            next(iter(nx.preferential_attachment(graph, pair)))[2],
            next(iter(nx.adamic_adar_index(graph, pair)))[2],
            next(iter(nx.jaccard_coefficient(graph, pair)))[2],
            # Number of common neighbors
            len(list(nx.common_neighbors(graph, source, target))),
            # Node information features
            len(src["title"] & tgt["title"]),
            src["year"] - tgt["year"],
            len(src["authors"] & tgt["authors"]),
            len(src["top_words"] & tgt["top_words"]),
            cosine_similarity(features_TFIDF[src["index"]],
                              features_TFIDF[tgt["index"]])[0][0],
            # An empty journal name never counts as a match.
            1 if len(src["journal"]) > 0 and src["journal"] == tgt["journal"] else 0,
            int(src["title_topic"] is not None
                and src["title_topic"] == tgt["title_topic"]),
            len(src["abstract_topics"] & tgt["abstract_topics"]),
        ]
        rows.append(row)

        if counter % 1000 == 0:
            print(f"{counter}/{len(ds)} examples processsed")

    # examples as rows, features as columns; the reshape keeps a 2-D array
    # even when ds is empty
    features = np.array(rows, dtype=float).reshape(len(rows), n_features)
    print(f"{features.shape[1]} features computed.")

    # scale (skipped for an empty input, which scale() would reject)
    if len(rows) > 0:
        features = preprocessing.scale(features)

    if train:
        return features, np.array(labels, dtype=int)
    else:
        return features


def _stemmed_title_tokens(title):
    """Return the set of stemmed, lower-cased, non-stopword tokens of a title."""
    # split() without argument drops empty tokens, so an empty title gives an
    # empty set instead of {''}.
    return {stemmer.stem(token) for token in title.lower().split()
            if token not in stpwds}


def _author_set(raw_authors):
    """Return the set of names in a comma-separated author string.

    Names are stripped of surrounding whitespace and empty entries are
    dropped, so that " H. Lu" matches "H. Lu" and two empty author fields
    share no author.
    """
    return {name.strip() for name in raw_authors.split(",") if name.strip()}


def _dominant_topic(topics):
    """Return the ID of the most probable topic in a list of (topic_id, probability) pairs, or None if the list is empty."""
    topics = list(topics)
    if not topics:
        return None
    return max(topics, key=lambda pair: pair[1])[0]


In [None]:

# Compute features and labels for the sampled training pairs. The graph G
# passed here is the one built from the whole training set.
training_features, training_labels = preprocessing_graph(
    G, training_set_reduced, node_df, 
    deg_centrality, betweeness_centrality, 
    train=True
)

# Persist the results so the model-fitting cells can reload them without
# recomputing the features.
np.save(project_folder + "/training_features.npy", training_features)
np.save(project_folder + "/training_labels.npy", training_labels)


1000/307756 examples processsed
2000/307756 examples processsed
3000/307756 examples processsed
4000/307756 examples processsed
5000/307756 examples processsed
6000/307756 examples processsed
7000/307756 examples processsed
8000/307756 examples processsed
9000/307756 examples processsed
10000/307756 examples processsed
11000/307756 examples processsed
12000/307756 examples processsed
13000/307756 examples processsed
14000/307756 examples processsed
15000/307756 examples processsed
16000/307756 examples processsed
17000/307756 examples processsed
18000/307756 examples processsed
19000/307756 examples processsed
20000/307756 examples processsed
21000/307756 examples processsed
22000/307756 examples processsed
23000/307756 examples processsed
24000/307756 examples processsed
25000/307756 examples processsed
26000/307756 examples processsed
27000/307756 examples processsed
28000/307756 examples processsed
29000/307756 examples processsed
30000/307756 examples processsed
31000/307756 exampl

In [None]:

# Compute features for the testing pairs (no labels, hence train=False).
testing_features = preprocessing_graph(
    G, testing_set, node_df, 
    deg_centrality, betweeness_centrality, 
    train=False
)

# Persist the results so the model-fitting cells can reload them.
np.save(project_folder + "/testing_features.npy", testing_features)


1000/32648 examples processsed
2000/32648 examples processsed
3000/32648 examples processsed
4000/32648 examples processsed
5000/32648 examples processsed
6000/32648 examples processsed
7000/32648 examples processsed
8000/32648 examples processsed
9000/32648 examples processsed
10000/32648 examples processsed
11000/32648 examples processsed
12000/32648 examples processsed
13000/32648 examples processsed
14000/32648 examples processsed
15000/32648 examples processsed
16000/32648 examples processsed
17000/32648 examples processsed
18000/32648 examples processsed
19000/32648 examples processsed
20000/32648 examples processsed
21000/32648 examples processsed
22000/32648 examples processsed
23000/32648 examples processsed
24000/32648 examples processsed
25000/32648 examples processsed
26000/32648 examples processsed
27000/32648 examples processsed
28000/32648 examples processsed
29000/32648 examples processsed
30000/32648 examples processsed
31000/32648 examples processsed
32000/32648 examp

## Model fitting & predictions

In [None]:

# Display names of the classifiers to evaluate. `names` and `classifiers` are
# parallel lists that are zipped together later, so entries must stay aligned
# (including the commented-out ones).
names = [
    "Nearest Neighbors",
    # "Linear SVM",
    # "RBF SVM",
    # "Gaussian Process",
    # "Decision Tree",
    "Random Forest",
    "Neural Net 2",
    "Neural Net 3",
    "AdaBoost",
    # "Naive Bayes",
    # "QDA",
]

# Classifier instances, in the same order as `names`.
classifiers = [
    KNeighborsClassifier(3),
    # SVC(kernel="linear", C=0.025),
    # SVC(gamma=2, C=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0)),
    # DecisionTreeClassifier(max_depth=10),
    RandomForestClassifier(max_depth=10, n_estimators=100),
    MLPClassifier(hidden_layer_sizes=(100, 100), alpha=1e-2, max_iter=1000),
    MLPClassifier(hidden_layer_sizes=(50, 50, 20), alpha=1e-2, max_iter=1000),
    AdaBoostClassifier(),
    # GaussianNB(),
    # QuadraticDiscriminantAnalysis(),
]


In [None]:

"""
# Neural Network layer size tuning

names = []
classifiers = []

for l1_size in [20, 50, 100]:
    for l2_size in [20, 50, 100]:
        names.append(f"Neural Net {l1_size} {l2_size}")
        classifiers.append(MLPClassifier(
            hidden_layer_sizes=(l1_size,l2_size, 20), alpha=1e-2, max_iter=1000))
"""


In [None]:

# Reload the precomputed features so this cell does not depend on the
# (long) feature computation having run in the current session.
training_features = np.load(project_folder + "/training_features.npy")
training_labels = np.load(project_folder + "/training_labels.npy")
testing_features = np.load(project_folder + "/testing_features.npy")

for name, classifier in zip(names, classifiers):
    print(f"Fitting '{name}' classifier...")

    # train
    classifier.fit(training_features, training_labels)

    # F1-score on the data the model was fitted on (not a generalisation estimate)
    print("Predicting labels on the training set...")
    training_predictions = classifier.predict(training_features)
    training_score = f1_score(training_labels, training_predictions)
    print("Training F1-Score:", training_score)

    print("Predicting labels on the testing set...")
    # issue predictions
    predictions = classifier.predict(testing_features)

    # Write one "id,category" row per testing pair, where id is the position of
    # the pair in the testing set (format expected by Kaggle).
    # newline="" lets the csv module control line endings itself, as its
    # documentation requires; otherwise extra blank lines can appear on
    # platforms that translate newlines.
    with open(project_folder + "/predictions_{}.csv".format(name), "w", newline="") as pred:
        csv_out = csv.writer(pred)
        csv_out.writerow(["id", "category"])
        csv_out.writerows(enumerate(predictions))


## Model performances on the test set


### First batch of submission

Test results for 10 features on 5% of the training data:

|Model name| F1-Score|
|------|------|
|Nearest Neighbors|**0.95497**|
|Linear SVM|0.95123|
|RBF SVM|**0.95950**|
|Gaussian Process|Not enough RAM|
|Decision Tree|0.93841|
|Random Forest|0.93517|
|Neural Net|**0.95499**|
|AdaBoost|**0.93997**|
|Naive Bayes|0.92907|
|QDA|0.80821|

Features:

*   **source_degree_centrality:** Degree centrality of the source node;
*   **target_degree_centrality:** Degree centrality of the target node;
*   **pref_attach:** Preferential attachment score of the two nodes;
*   **aai:** Adamic Adar index of the two nodes;
*   **jacard_coeff:** Jaccard coefficient of the two nodes;
*   **overlap_title:** Number of common words in the paper titles;
*   **temp_diff:** Difference in publication years of the two papers;
*   **comm_auth:** Number of common authors in the papers;
*   **comm_top_words:** Number of common words within the words with the 10 highest TF-IDF score (in the abstract);
*   **same_journal_name:** Boolean indicating if the papers were published in the same journal.



### Second batch of submission

Test results for 11 features on 20% of the training data:

|Model name| F1-Score|
|------|------|
|Nearest Neighbors|0.96126|
|RBF SVM|**0.96853**|
|Neural Net 1 hidden layer|0.96132|
|Neural Net 2 hidden layers|**0.97121**|
|AdaBoost|**0.96768**|

Additional feature (the other 10 features are the same as the first batch):

*   **diff_bt:** Difference between the betweenness centralities of the two nodes (computationally expensive).



### Third batch of submission

Test results for 14 features on 20% of the training data:

|Model name|Train F1-Score| Test F1-Score|
|------|------|------|
|Nearest Neighbors|0.97571|0.96243|
|RBF SVM|Prediction time out|0.96824|
|Random Forest|0.97609|0.97441|
|Neural Net 2 hidden layers|0.97513|0.97430|
|Neural Net 3 hidden layers|0.97500|**0.97535**|
|AdaBoost|0.97092|0.97127|

Additional features (the other 10 features are the same as the first batch):

*   **common_neigh:** Number of common neighbors between the two nodes;
*   **abstract_sim:** Cosine similarity between the TF-IDF vectors of the papers' abstracts;
*   **comm_title_topics:** Boolean indicating if the main topic detected in the paper titles is the same;
*   **comm_abstract_topics:** Number of common topics detected in the abstracts.




### Neural Network layer size tuning

Note that the third layer was fixed at 20 nodes and the maximum size tested was 100 because of computation time constraints.

|First layer size|Second layer size|Train F1-Score|Test F1-Score|
|------|------|------|------|
|20|20|0.97419|0.97477|
|20|50|0.97445|0.97444|
|20|100|0.97422|0.97449|
|50|20|0.97461|0.97407|
|50|50|0.97509|0.97457|
|50|100|0.97516|0.97421|
|100|20|0.97541|**0.9479**|
|100|50|0.97499|0.97478|
|100|100|**0.97592**|0.97477|