# Load Packages


In [67]:
import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn import preprocessing
import nltk
import csv
import re
import math
from pprint import pprint
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt


import networkx as nx
from networkx import betweenness_centrality
from networkx import edge_betweenness_centrality
from networkx import load_centrality
from networkx import eigenvector_centrality

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

import en_core_web_sm

%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Datasets

In [3]:
nodes_info_df = pd.read_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/cs-mlns-22/node_information.csv')
random_preds_df = pd.read_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/cs-mlns-22/random_predictions.csv') 
test_set = pd.read_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/cs-mlns-22/testing_set.txt', sep = ' ', header = None)
train_set = pd.read_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/cs-mlns-22/training_set.txt', sep = ' ', header = None)
test_set.columns = ['source_id', 'target_id']
train_set.columns = ['source_id', 'target_id', 'label']
nodes_info_df.columns = ['paper_id', 'publication_year', 'title', 'author', 'journal_name', 'abstract']

In [8]:
nodes_info_df.head()

Unnamed: 0,paper_id,publication_year,title,author,journal_name,abstract
0,1002,2000,domain walls and massive gauged supergravity p...,"M. Cvetic, H. Lu, C.N. Pope",Class.Quant.Grav.,we point out that massive gauged supergravity ...
1,1003,2000,comment on metric fluctuations in brane worlds,"Y.S. Myung, Gungwon Kang",,recently ivanov and volovich hep-th 9912242 cl...
2,1004,2000,moving mirrors and thermodynamic paradoxes,Adam D. Helfer,Phys.Rev.,quantum fields responding to moving mirrors ha...
3,1005,2000,bundles of chiral blocks and boundary conditio...,"J. Fuchs, C. Schweigert",,proceedings of lie iii clausthal july 1999 var...
4,1006,2000,questions in quantum physics,Rudolf Haag,,an assessment of the present status of the the...


In [6]:
train_set.head()

Unnamed: 0,source_id,target_id,label
0,9510123,9502114,1
1,9707075,9604178,1
2,9312155,9506142,0
3,9911255,302165,0
4,9701033,209076,0


# Process the Data

In [11]:
print(f'There are ', len(set(nodes_info_df['paper_id'])), ' Unique papers: ')
sym_diff = set(test_set['source_id'].append(test_set['target_id'])).symmetric_difference(set(nodes_info_df['paper_id']))
print('Unknown papers in test set (with nodes_info):', len(sym_diff))

There are  27769  Unique papers: 
Unknown papers in test set (with nodes_info): 4369


In [14]:
# Use Spacy
spacy_nlp = en_core_web_sm.load(disable=["tagger", 
                                         "parser",
                                         "ner",
                                         "entity_linker",
                                         "textcat",
                                         "entity_ruler",
                                         "sentencizer",
                                         "merge_noun_chunks",
                                         "merge_entities",
                                         "merge_subtokens"])

## Do some Features on Text

In [17]:
def isNaN(string):
    return string != string

def filter_bad(alphabet):
    if (alphabet in [',', None]):
        return False
    else:
        return True

In [30]:
a = ['J.-M. Chung', 'B. K. Chung']
' '.join(filter(lambda x: x not in [',', None], a))
# ' '.join(filter(filter_bad, a))

'J.-M. Chung B. K. Chung'

### Handle Author Names

In [23]:
nodes_info_df.author.sample(10)

27430                             J.-M. Chung, B. K. Chung
1862     Csaba Csaki, Michael L. Graesser, Graham D. Kribs
4864                                                   NaN
10023                                   Oscar Loaiza-Brito
15892    P.S. Howe, J.M. Izquierdo, G. Papadopoulos, P....
2258                      Ch, rashekar Devch, , Jean Nuyts
5846                  Grigorii B. Pivovarov, James P. Vary
15056                                                  NaN
2565      D.Z Freedman (MIT), P. Henry-Labordere (LPT-ENS)
7400                                     Joseph C. Varilly
Name: author, dtype: object

In [40]:
# 看了一眼，作者名字有很多类，大部分是通过,来区别的，但有时也会加入学校名字，有时会加first name，有时会是first name + some letter + last name

def author_normalisation(authors):
    # 先排除NaN的
    if not isNaN(authors) == False:
        return np.NaN

    authors = authors.lower()
    final_authors = []
    
    # 去掉学校
    if '(' in authors: # 学校的都是加在括号里的，但是这里我还没有考虑多个人全部加了学校的情况！！
        authors = re.sub(r'\(+.*\)', '', authors).strip() 
    
    # remove extra spaces
    authors = authors.split()
    authors = ' '.join(filter(lambda x: x not in [',', None], authors))
      
    # get all authors of one paper 
    for author in authors.split(', '): 
        author.strip()            
        # get the names of an author
        names = author.split(' ')
        author_names = list()        
        if len(names) == 2:
            # check if first element is 'letter.' format:
            if re.match('\w\.', names[0]):
                author_names.append(names[0])
            else:
                author_names.append(names[0][0] + '.')

        if len(names) == 3:
            if re.match('\w\.', names[0]):
                author_names.append(names[0])
            else:
                author_names.append(names[0][0] + '.')

            # skip the second middle name
            if re.match('\w\.', names[1]):
                pass
                #author_names.append(names[1])
            #else:
            #    author_names.append(names[1][0] + '.')

        author_names.append(names[-1])
        if len(author_names) > 1:
            author_names = ' '.join(author_names)
        else:
            author_names = author_names[0]
        # append last name
        final_authors.append(author_names)


    number_of_authors = len(final_authors)
    if number_of_authors == 0:
        return np.NaN

    return final_authors

In [41]:
a = ' Grigorii B. Pivovarov, James P. Vary'
author_normalisation(a)

['g. pivovarov', 'j. vary']

### Common Authors

In [42]:
def common_authors(string1, string2):
    # 仅仅是判断是否有重复的而已
    
    if isNaN(string1):
        return False
    if isNaN(string2):
        return False
    
    a_set = set(string1)
    b_set = set(string2)
    
    if (a_set & b_set): 
        return True 
    else: 
        return False

### Remove Special Characts

In [43]:
def remove_special_characters(string):
    string = re.sub("([^\w]|[\d_])+", " ",  string)
    return string

### Tokenize

In [47]:
def tokenize(string): 
    # spacy真的是好用       
    # Code to tokenize
    spacy_tokens = spacy_nlp(string)
    # Code to remove punctuation tokens and create string tokens
    string_tokens = [token.lemma_ for token in spacy_tokens if not token.is_punct if not token.is_stop]  
    return string_tokens  

In [48]:
def recombining_tokens_into_a_string(list_of_tokens):
    return " ".join(list_of_tokens)

### TF-IDF

In [50]:
def create_tf_idf(column, tf_idf):

    # if tf_idf doesn't exist
    if tf_idf == None:
        #create a TfidfVectorizer object
        tf_idf = TfidfVectorizer()
        #Vectorize the sample text
        X_tfidf_sample = tf_idf.fit_transform(column)

    # if tf_idf already exist use the same for the test
    else:
        X_tfidf_sample = tf_idf.transform(column)

    return X_tfidf_sample, tf_idf

In [53]:
def tf_idf_feature(column, dataset, tf_idf, author_or_not):
    # Remove special characters from the text
    dataset[column] = dataset[column].apply(lambda x: remove_special_characters(x))

    # if we deal with the column author
    if author_or_not == 1:
        # Remove strings of size less than two
        column_cleaned = dataset[column].str.findall('\w{2,}').str.join(' ')
    
    else:
        # Tokenize, extract lemmas and remove stop words
        tokenized = dataset[column].apply(lambda x: tokenize(x)) 
        # Recombine tokens into a string
        column_cleaned = tokenized.apply(recombining_tokens_into_a_string)
    # Create the tf_idf matrix 
    tf_idf_matrix, tf_idf = create_tf_idf(column_cleaned, tf_idf)
    return tf_idf_matrix, tf_idf

### Compute Similarity 

In [65]:
# 计算相似程度，column target and source
def compute_similarity(column, df_source, df_target, author_or_not):
    # Fill the Na's
    df_source[column].fillna("unknown", inplace=True)
    df_target[column].fillna("unknown", inplace=True)

    tf_idf = None
    # Create the tf_idf features
    tf_idf_title_source, tf_idf = tf_idf_feature(column, df_source, tf_idf, author_or_not)
    tf_idf_title_target, tf_idf = tf_idf_feature(column, df_target, tf_idf, author_or_not)

    # Calculate the similarities
    similarity = []
    for i in tqdm(range(tf_idf_title_source.shape[0])):
        cos_sim = cosine_similarity(tf_idf_title_source[i], tf_idf_title_target[i])
        similarity.append(cos_sim)

    # Convert the list as a DataFrame
    similarity_df = pd.DataFrame(np.vstack(similarity))

    return similarity_df

### Use PCA to reduce dimension

In [70]:
def reduce_matrix_width(source_df, target_df, n_components):
    # Apply a PCA to reduce the matrix width , we chose 15
    pca_train = PCA(n_components=n_components)

    # PCA on source feature
    pca_train.fit(source_df)
    matrix_source_reduced = pca_train.transform(source_df)
    print(sum(pca_train.explained_variance_ratio_))  # Percentage of initial matrix explained by reduced matrix

    # PCA on target feature
    pca_train.fit(target_df)
    matrix_target_reduced = pca_train.transform(target_df)
    
    print('pca explained variance ratio is: )', sum(pca_train.explained_variance_ratio_))  # Percentage of initial matrix explained by reduced matrix

    return matrix_source_reduced, matrix_target_reduced

### Handle Journal Name Feature

In [56]:
def journal_name_feature(train_source_info, train_target_info, test_source_info, test_target_info):
    # First merge train and test to avoid a different number of features when one-hot-encoding
    # To keep trace of the train and test dataset
    train_source_info['train_test'] = 1
    train_target_info['train_test'] = 1
    test_source_info['train_test'] = 0
    test_target_info['train_test'] = 0

    # merging the two datasets together
    combined_source = pd.concat([train_source_info, test_source_info], ignore_index=True)
    combined_target = pd.concat([train_target_info, test_target_info], ignore_index=True)

    # One hot encoding
    journal_name_encoded_source = pd.get_dummies(combined_source['journal_name'])
    journal_name_encoded_target = pd.get_dummies(combined_target['journal_name'])

    # Apply PCA to reduce matrix with 15 components
    journal_name_encoded_source_reduced, journal_name_encoded_target_reduced = reduce_matrix_width(
        journal_name_encoded_source, journal_name_encoded_target, 15)
    
    # Merge encoded dataset with the combine dataset
    combined_source = pd.concat([combined_source, pd.DataFrame(journal_name_encoded_source_reduced)], axis=1)
    combined_target = pd.concat([combined_target, pd.DataFrame(journal_name_encoded_target_reduced)], axis=1)

    # Separate train and test and keep only journal_name features
    train_source_journal = combined_source[combined_source["train_test"] == 1].drop(
        ['abstract', 'author', 'journal_name', 'label', 'paper_id', 'publication_year', 'source_id', 'target_id',
         'title', 'train_test'], axis=1)
    test_source_journal = combined_source[combined_source["train_test"] == 0].drop(
        ['abstract', 'author', 'journal_name', 'label', 'paper_id', 'publication_year', 'source_id', 'target_id',
         'title', 'train_test'], axis=1)
    train_target_journal = combined_target[combined_target["train_test"] == 1].drop(
        ['abstract', 'author', 'journal_name', 'label', 'paper_id', 'publication_year', 'source_id', 'target_id',
         'title', 'train_test'], axis=1)
    test_target_journal = combined_target[combined_target["train_test"] == 0].drop(
        ['abstract', 'author', 'journal_name', 'label', 'paper_id', 'publication_year', 'source_id', 'target_id',
         'title', 'train_test'], axis=1)
    
    # add prefix to columns names
    train_source_journal.columns = [str(col) + '_source' for col in train_source_journal.columns]
    test_source_journal.columns = [str(col) + '_source' for col in test_source_journal.columns]
    train_target_journal.columns = [str(col) + '_target' for col in train_target_journal.columns]
    test_target_journal.columns = [str(col) + '_target' for col in test_target_journal.columns]

    return train_source_journal, test_source_journal, train_target_journal, test_target_journal

# Start Pre working

## Process train and test DF

In [57]:
# reaye source and target info datasets
train_source_info = train_set.merge(nodes_info_df, left_on='source_id', right_on='paper_id',how="left")
train_target_info = train_set.merge(nodes_info_df, left_on='target_id', right_on='paper_id',how="left")

test_source_info = test_set.merge(nodes_info_df, left_on='source_id', right_on='paper_id',how="left")
test_target_info = test_set.merge(nodes_info_df, left_on='target_id', right_on='paper_id',how="left")

In [60]:
## apply the features to training set 
train_set['source_authors'] = train_source_info.author.apply(lambda x: author_normalisation(x))
train_set['target_authors'] = train_target_info.author.apply(lambda x: author_normalisation(x))

train_set['publication_year_diff'] = train_source_info.publication_year - train_target_info.publication_year

train_set['source_journal'] = train_source_info.journal_name
train_set['target_journal'] = train_target_info.journal_name
train_set['same_journal'] = train_set.apply(lambda x: int(x.source_journal == x.target_journal), axis=1)

## apply the features to test set
test_set['source_authors'] = test_source_info.author.apply(lambda x: author_normalisation(x))
test_set['target_authors'] = test_target_info.author.apply(lambda x: author_normalisation(x))

test_set['publication_year_diff'] = test_source_info.publication_year - test_target_info.publication_year

test_set['source_journal'] = test_source_info.journal_name
test_set['target_journal'] = test_target_info.journal_name
test_set['same_journal'] = test_set.apply(lambda x: int(x.source_journal == x.target_journal), axis=1)

In [62]:
train_set.head()

Unnamed: 0,source_id,target_id,label,source_authors,target_authors,publication_year_diff,source_journal,target_journal,same_journal
0,9510123,9502114,1,,"[w. kim, j. lee, y. park]",0.0,Phys.Lett.,Phys.Lett.,1
1,9707075,9604178,1,"[l.e.ibanez, a.m.uranga]","[a. dabholkar, j. park]",1.0,,Nucl.Phys.,0
2,9312155,9506142,0,"[p. kleban, i. vassileva]",[v. moretti],-2.0,Phys.Rev.Lett.,Class.Quant.Grav.,0
3,9911255,302165,0,"[p. argyres, s. pell,]",[t. hollowood],-4.0,JHEP,,0
4,9701033,209076,0,,"[e. gravanis, s. willison]",-5.0,Phys.Lett.,,0


## Add other features

This one takes about an hour

In [66]:
# other features this might take some times to run
## apply the features to training set
train_set['similarity_title'] = compute_similarity("title", train_source_info, train_target_info, 0)
train_set['similarity_abstract'] = compute_similarity("abstract", train_source_info, train_target_info, 0)
train_set['similarity_author'] = compute_similarity("author", train_source_info, train_target_info, 1)

## apply features to test set
test_set['similarity_title'] = compute_similarity("title", test_source_info, test_target_info, 0)
test_set['similarity_abstract'] = compute_similarity("abstract", test_source_info, test_target_info, 0)
test_set['similarity_author'] = compute_similarity("author", test_source_info, test_target_info, 1)

100%|██████████| 615512/615512 [12:26<00:00, 824.74it/s] 
100%|██████████| 615512/615512 [10:34<00:00, 970.06it/s] 
100%|██████████| 615512/615512 [09:02<00:00, 1135.13it/s]
100%|██████████| 32648/32648 [00:28<00:00, 1126.47it/s]
100%|██████████| 32648/32648 [00:29<00:00, 1118.74it/s]
100%|██████████| 32648/32648 [00:28<00:00, 1150.69it/s]


In [71]:
# journal_name feature
train_source_journal, test_source_journal, train_target_journal, test_target_journal = journal_name_feature(train_source_info, 
                                                                                                            train_target_info, 
                                                                                                            test_source_info, 
                                                                                                            test_target_info)

# Add journal_name to the train and test
train_set = pd.concat([train_set, train_source_journal], axis=1, )
train_set = pd.concat([train_set, train_target_journal], axis=1)
test_set = pd.concat([test_set, test_source_journal.reset_index().drop(["index"], axis=1)], axis=1)
test_set = pd.concat([test_set, test_target_journal.reset_index().drop(["index"], axis=1)], axis=1)

0.8870350870770787
pca explained variance ratio is: ) 0.899128602817173


In [72]:
train_set.to_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/temp_train_set_March_20.csv')
test_set.to_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/temp_test_set_March_20.csv')

In [73]:
train_set.head()

Unnamed: 0,source_id,target_id,label,source_authors,target_authors,publication_year_diff,source_journal,target_journal,same_journal,similarity_title,...,5_target,6_target,7_target,8_target,9_target,10_target,11_target,12_target,13_target,14_target
0,9510123,9502114,1,,"[w. kim, j. lee, y. park]",0.0,Phys.Lett.,Phys.Lett.,1,0.138488,...,0.001503,0.011897,0.003587,0.000744,-0.00117,0.006011,0.001121,-0.001973,0.003986,0.001878
1,9707075,9604178,1,"[l.e.ibanez, a.m.uranga]","[a. dabholkar, j. park]",1.0,,Nucl.Phys.,0,0.171045,...,0.00101,0.008056,0.002458,0.000512,-0.000806,0.004159,0.000779,-0.001373,0.002784,0.001315
2,9312155,9506142,0,"[p. kleban, i. vassileva]",[v. moretti],-2.0,Phys.Rev.Lett.,Class.Quant.Grav.,0,0.0,...,-0.025941,-0.35114,0.871733,0.044678,-0.058764,0.176609,0.022999,-0.036765,0.063519,0.02581
3,9911255,302165,0,"[p. argyres, s. pell,]",[t. hollowood],-4.0,JHEP,,0,0.0,...,-0.007274,-0.066396,-0.026406,-0.006121,0.009934,-0.059091,-0.013188,0.024863,-0.057807,-0.032618
4,9701033,209076,0,,"[e. gravanis, s. willison]",-5.0,Phys.Lett.,,0,0.0,...,-0.007274,-0.066396,-0.026406,-0.006121,0.009934,-0.059091,-0.013188,0.024863,-0.057807,-0.032618


In [None]:
# i dont understand why this file is 800M

In [74]:
train_set.memory_usage(index=True, deep=False)

Index                 128
source_id         4924096
target_id         4924096
label             4924096
source_authors    4924096
                   ...   
10_target         4924096
11_target         4924096
12_target         4924096
13_target         4924096
14_target         4924096
Length: 73, dtype: int64

## Add some graph features

In [75]:
# get some elements and then assign the attributes
# 从别的地方抄的

def shortest_path_info(some_graph, source, target):
    if source not in some_graph.nodes():
        return -1  # not known

    if target not in some_graph.nodes():
        return -1  # not known

    if nx.has_path(some_graph, source, target):
        return nx.dijkstra_path_length(some_graph, source=source, target=target)

    return -2  # no path

In [76]:
def degree_centrality(some_graph):
    degree_dict = dict(some_graph.degree(some_graph.nodes()))
    return degree_dict

In [77]:
def get_in_out_degree(some_graph):
    in_degree_dict = dict(some_graph.in_degree(some_graph.nodes()))
    out_degree_dict = dict(some_graph.out_degree(some_graph.nodes()))
    return in_degree_dict, out_degree_dict

In [78]:
def common_neighs(some_graph, x, y):
    if x not in some_graph.nodes():
        return 0, []  # not known 
    if y not in some_graph.nodes():
        return 0, []  # not known
    neighs = sorted(list(nx.common_neighbors(some_graph, x, y)))
    return len(neighs), neighs

In [79]:
def jac_index(g, x, y):
    if x not in g.nodes():
        return -1  # not known 
    if y not in g.nodes():
        return -1  # not known
    preds = nx.jaccard_coefficient(g, [(x, y)])
    jacc = 0

    for u, v, p in preds:
        jacc = p
    return jacc

In [80]:
def pref_attachement(g, x, y):
    if x not in g.nodes():
        return -1  # not known 
    if y not in g.nodes():
        return -1  # not known
    preds = nx.preferential_attachment(g, [(x, y)])
    pref = 0

    for u, v, p in preds:
        pref = p
    return pref

In [81]:
def aa_index(g, x, y):
    if x not in g.nodes():
        return -1  # not known 
    if y not in g.nodes():
        return -1  # not known
    preds = nx.adamic_adar_index(g, [(x, y)])
    aa = 0

    for u, v, p in preds:
        aa = p
    return aa 

### Network

In [83]:
# create the network 
# get network for when there is a connection in train set
# edges = list(zip(train_set.loc[train_set.label == 1].source_id, train_set.loc[train_set.label == 1].target_id))
# nodes = list(set(train_set.source_id + train_set.target_id))

# train_G = nx.DiGraph()
# train_G.add_nodes_from(nodes)
# train_G.add_edges_from(edges)

train_G = nx.from_pandas_edgelist(train_set, 
                                  source='source_id', 
                                  target='target_id', 
                                  edge_attr=None,
                                  create_using=nx.DiGraph())

# make sure you also have an undirected graph
train_G_ud = train_G.to_undirected()

# create some dictionaries to use later on
clustering_coeff_dict = nx.clustering(train_G_ud)
avg_neigh_degree_dict = nx.average_neighbor_degree(train_G)
out_degree_centrality = nx.out_degree_centrality(train_G)
in_degree_centrality = nx.in_degree_centrality(train_G)
page_rank = nx.pagerank(train_G)
hub_score, authority_score = nx.hits(train_G)

## 考虑singlty

In [84]:
# function to get features for graph of a single element
def get_features(directed_graph, ud_graph, source_id, target_id, label):
    # features for undirected graph
    jaccard_index = jac_index(ud_graph, source_id, target_id)
    preferencial_attachment = pref_attachement(ud_graph, source_id, target_id)
    number_common_neighbours, common_neighbours = common_neighs(ud_graph, source_id, target_id)
    adamic_adar_index = aa_index(ud_graph, source_id, target_id)
    #shortest_path = shortest_path_info(train_G, source_id, target_id)

    source_pr = page_rank[source_id]
    source_hub_score = hub_score[source_id]
    source_authority_score = authority_score[source_id]
    source_cluster_coeff = clustering_coeff_dict[source_id]
    source_out_centrality = out_degree_centrality[source_id]
    source_avg_neigh_degree = avg_neigh_degree_dict[source_id]
 
    target_pr = page_rank[target_id]
    target_hub_score = hub_score[target_id]
    target_authority_score = authority_score[target_id]
    target_cluster_coeff = clustering_coeff_dict[target_id]
    target_in_centrality = in_degree_centrality[target_id]
    target_avg_neigh_degree = avg_neigh_degree_dict[target_id]

    # no name feature but supposedly important 
    feature_n = source_out_centrality * target_in_centrality
     
    return [source_id, target_id, label, jaccard_index, preferencial_attachment, 
            number_common_neighbours, adamic_adar_index, source_pr, target_pr, 
            source_hub_score, target_hub_score, source_authority_score, 
            target_authority_score, source_cluster_coeff, target_cluster_coeff, 
            source_out_centrality, target_in_centrality, source_avg_neigh_degree, 
            target_avg_neigh_degree, feature_n]

In [85]:
# 这里，把上面def的return的值全部设置成新的columns
### add columns when you add Features
column_names = ['source_id', 
                'target_id', 
                'label', 
                'jaccard_index', 
                'preferential_attachement', 
                'number_common_neighbours',  
                'adamic_adar_index', 
                'source_pr',
                'target_pr', 
                'source_hub_score', 
                'target_hub_score', 
                'source_authority_score',
                'target_authority_score', 
                'source_cluster_coeff', 
                'target_cluster_coeff',
                'source_out_centrality', 
                'target_in_centrality', 
                'source_avg_neigh_degree', 
                'target_avg_neigh_degree',
                'feature_n']

final_train_set = pd.DataFrame([[np.nan] * len(column_names)] * train_set.shape[0], 
                               columns=column_names)
final_test_set = pd.DataFrame([[np.nan] * len(column_names)] * test_set.shape[0], 
                              columns=column_names)

## Feature for train and test and merge

In [87]:
# create the features for the train set
for idx, row in tqdm(train_set.iterrows()):
    features = get_features(train_G, train_G_ud, row.source_id, row.target_id, row.label)
    #update the features
    final_train_set.loc[idx] = features

#create the features for the test set
for idx, row in tqdm(test_set.iterrows()):
    features = get_features(train_G, train_G_ud, row.source_id, row.target_id, -1)
    #update the features
    final_test_set.loc[idx] = features

615512it [08:15, 1241.44it/s]
32648it [00:25, 1288.21it/s]


In [88]:
# merge graph and text features together 
train_set = train_set.merge(final_train_set, 
                            on=['source_id', 'target_id', 'label'], 
                            how='left') 

test_set = test_set.merge(final_test_set, 
                          on=['source_id', 'target_id'], 
                          how='left')

In [89]:
def graph_features(directed_graph, dataset):
    # betweenness
    print('begin betweeness')
    between_centrality = betweenness_centrality(directed_graph) # shortest-path betweenness centrality for nodes
    
    # load centrality
    print('begin centrality')
    ld_centrality = load_centrality(directed_graph) # load centrality of a node is the fraction of all shortest paths that pass through that node
    #eigenvector centrality
    eig_centrality = eigenvector_centrality(directed_graph)
    
    # save features to training set 
    dataset['betweeness_centrality'] = pd.DataFrame.from_dict(dict(eig_centrality), orient='index')
    dataset['load_centrality'] = pd.DataFrame.from_dict(dict(ld_centrality), orient='index')
    dataset['eigen_centrality'] = pd.DataFrame.from_dict(dict(eig_centrality), orient='index')

    return dataset

The following takes a long time, around 15 hours, and it greatly shows betweenness_centrality is a very slow process

In [None]:
train_set = graph_features(train_G, train_set)
train_set.betweeness_centrality.fillna(-1, inplace=True)
train_set.load_centrality.fillna(-1, inplace=True)
train_set.eigen_centrality.fillna(-1, inplace=True)

test_set = graph_features(train_G, test_set)
test_set.betweeness_centrality.fillna(-1, inplace=True)
test_set.load_centrality.fillna(-1, inplace=True)
test_set.eigen_centrality.fillna(-1, inplace=True)

# 再把nan去掉
train_set.publication_year_diff.fillna(-24, inplace=True) # 24 is for unknown (?)
train_set.fillna('unknown', inplace=True)
test_set.publication_year_diff.fillna(-24, inplace=True) # 24 is for unknown (?_)
test_set.fillna('unknown', inplace=True)

train_set.to_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/final_train_set_March_20.csv',index=False)
test_set.to_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/final_test_set_March_20.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


# Start Working

In [None]:
test_set = pd.read_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/final_train_set_March_20.csv')
train_set = pd.read_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/final_test_set_March_20.csv')

In [None]:
# 随意看一下
%matplotlib inline
## Most interesting correlation is with label
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(14,12))
sns.heatmap(train_set.corr(),
            vmax=0.5,
            square=True,
            annot=True)

## Training split

In [None]:
# separate features from labels:
X = train_set.loc[:, (train_set.columns != 'label') & 
                     (train_set.columns != 'common_authors') & 
                     (train_set.columns != 'source_authors') & 
                     (train_set.columns != 'target_authors') & 
                     (train_set.columns != 'source_journal') & 
                     (train_set.columns != 'target_journal')]
y = train_set['label']
y.astype(np.int)

In [None]:
# final feature correlation
ff = X.copy()
ff['label'] = y
plt.figure(figsize=(14,12))
sns.heatmap(X.corr(),
            vmax=0.5,
            square=True,
            annot=True)

In [None]:
# describe results from scores
from scipy import stats 
stats.describe(scores['test_score'])

## XGB + parameter tuning

In [None]:
!pip install xgboost=0.6a2

### xgb base model

In [None]:
# making sure the test and the train files have same sequence of columns

test = test[X.columns]


# defining the base model
xgb_model_base = XGBClassifier(n_estimators = 100)

# printing the cross validation scores for the classifier
scores = cross_validate(xgb_model_base, X, y.values.ravel(), scoring='f1', 
                        cv=3,n_jobs = -1 ) # n_jobs is the number of cpus to use -1 => all
scores


# fitting on the training data
xgb_model_base.fit(X, y.values.ravel())

# predicting the outcome from the final 
predictions = xgb_model_base.predict(test)

# write out
out_df = test_set.copy()
data = {'id': list(out_df.index), 'category': predictions}
final_df = pd.DataFrame(data)


# 3: write file out
final_df.to_csv('submission.csv',index=False, sep=',')

In [None]:
# defining the search grid
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from xgboost.sklearn import XGBClassifier

xgb_regressor = xgb.XGBRegressor()

random_grid = {
     "n_estimators"     : [int(x) for x in np.linspace(50, 600, num = 20)],
     "learning_rate"    : [0.01, 0.02, 0.05, 0.10 ] ,
     "max_depth"        : [ 6, 8, 10, 12, 15, 20],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.3, 0.4, 0.7, 0.9 ],
     "colsample_bytree" : [ 0.05, 0.1, 0.3, 0.4]}

# Use the random grid to search for best hyperparameters

# First create the base model to tune
xgb_model = XGBClassifier()

# Random search of parameters
xgb_random = RandomizedSearchCV(estimator = xgb_model, param_distributions = random_grid,
n_iter = 10, cv = 3, verbose=2, random_state=42 ,n_jobs = -1, scoring = 'f1_weighted')

optimised_xgb_random = xgb_random.best_estimator_



# printing the cross validation scores for the classifier
scores = cross_validate(optimised_xgb_random, X, y.values.ravel(), scoring='f1', 
                        cv=3,n_jobs = -1 ) # n_jobs is the number of cpus to use -1 => all
scores


# fitting on the training data
xgb_model_base.fit(X, y.values.ravel())

# predicting the outcome from the final 
optimised_xgb_random.predict(test)

# write out
out_df = test_set.copy()
data = {'id': list(out_df.index), 'category': predictions}
final_df = pd.DataFrame(data)


# 3: write file out
final_df.to_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/submission_XGB_March_20_1.csv', index=False, sep=',')

## SVM

In [None]:
from sklearn.svm import LinearSVC

# SVM has a zero tolerance towards null values, hence replacing them by 0

XVM = X.fillna(value=0)
test_SVM = test.fillna(value=0)

clf = LinearSVC( tol=1e-4)


# printing the cross validation scores for the classifier
scores = cross_validate(clf, XVM, y, scoring='f1', 
                        cv=10,n_jobs = -1 ) # n_jobs is the number of cpus to use -1 => all
scores


# fitting on the training data
clf.fit(XVM, y)

# predicting the outcome from the final 
prediction_clf = clf.predict(test_SVM)

# write out
out_df = test_set.copy()
data = {'id': list(out_df.index), 'category': predictions}
final_df = pd.DataFrame(data)


# 3: write file out
final_df.to_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/submission_SVM_March_20_1.csv',index=False, sep=',')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 1: retrain the complete model -> don't forget to change this to optimal one @ end
final_model = RandomForestClassifier()
final_model.fit(X, y)

In [None]:
# 2: predict on the test set
final_test_set = test_set.loc[:, (test_set.columns != 'source_authors') & 
                              (test_set.columns != 'common_authors') & 
                              (test_set.columns != 'target_authors') & 
                              (test_set.columns != 'label') & 
                              (test_set.columns != 'source_journal') & 
                              (test_set.columns != 'target_journal')]
predictions = final_model.predict(final_test_set)

# write out
out_df = test_set.copy()
data = {'id': list(out_df.index), 'category': predictions}
final_df = pd.DataFrame(data)


# 3: write file out
final_df.to_csv('/content/drive/MyDrive/DSBA M2/2 MLNS/Kaggle/my own work/submission_RF_March_20_1.csv',index=False, sep=',')

# plot the feature importance
feat_importances = pd.Series(final_model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()