# Assignment 1

In [None]:
import pandas as pd
import networkx as nx
import random
import sklearn
from tqdm import tqdm
import sklearn.utils
import itertools
import numpy as np

from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import cross_val_score

## 1. Load Training Graph

In [None]:
#loading train data
def loadTrainData(filename="train.txt"):
    """Read a text file and return its lines without the trailing newline.

    Args:
        filename: Path of the file to read. Defaults to ``"train.txt"``,
            the path that used to be hard-coded.

    Returns:
        list[str]: One entry per line, in file order.
    """
    # The context manager closes the handle deterministically; the previous
    # bare open() inside a comprehension never closed it explicitly.
    with open(filename) as handle:
        return [line.rstrip("\n") for line in handle]

def loadTrainDataAsUndirectedGraph(filename="train.txt"):
    """Build an undirected co-occurrence graph from a whitespace-separated file.

    Every line is split on whitespace into ids, and each pair of ids found
    on the same line is linked by an edge. The edge attribute ``frequency``
    counts how many times that pair has been seen across the file.

    Args:
        filename: Path of the file to read. Defaults to ``"train.txt"``,
            the path that used to be hard-coded.

    Returns:
        networkx.Graph: Graph whose nodes are the id strings. Ids that never
        share a line with another id are not added, because nodes are only
        created through ``add_edge``.
    """
    g = nx.Graph()
    # Stream the file inside a context manager so the handle is closed.
    with open(filename) as handle:
        for row in handle:
            authorIds = row.split()
            # combinations() yields the same (earlier, later) pairs as the
            # former enumerate/slice double loop.
            for author, coauthor in itertools.combinations(authorIds, 2):
                if g.has_edge(author, coauthor):
                    g[author][coauthor]['frequency'] += 1
                else:
                    g.add_edge(author, coauthor, frequency=1)
    return g

## 2. Create Balanced Training Data

In [None]:
# Create a balanced training set by adding sampled negative (label 0) instances.
def createBalancedTrainingData(graph, testDF):
    """Assemble a shuffled, labelled table of node pairs for training.

    Positive rows (label 1) are the edges of ``graph`` whose ``frequency``
    attribute is greater than 1. Negative rows (label 0) are sampled at
    random: half are pairs with no connecting path, half are pairs whose
    shortest path is longer than 4. Pairs listed in
    ``testDF["Source-Sink"]`` are never used as negatives.

    Args:
        graph: networkx graph whose edges carry a ``frequency`` attribute.
        testDF: DataFrame with a ``Source-Sink`` column of
            ``(source, sink)`` tuples using the same labels as the graph nodes.

    Returns:
        pandas.DataFrame: Columns ``Source``, ``Sink``, ``Label``, shuffled.

    Note:
        Sampling repeats until both negative halves are full, so the loop
        does not terminate if the graph has no disconnected pair or no pair
        more than 4 hops apart.
    """
    trueInstances = [
        [n1, n2, 1]
        for n1, n2, frequency in graph.edges(data='frequency')
        if frequency > 1
    ]

    # Build the exclusion set once for O(1) lookups. Both orientations are
    # stored because the graph is undirected.
    reserved = set()
    for source, sink in testDF["Source-Sink"]:
        reserved.add((source, sink))
        reserved.add((sink, source))

    # Sample real node labels. The previous version drew matrix indices and
    # used str(index) as a node name, which is only right when the labels
    # happen to be "0".."N-1"; it also compared int tuples against the
    # string tuples of the test set, so nothing was ever excluded.
    candidates = list(graph.nodes)
    half = len(trueInstances) // 2

    unreachable = []  # negatives with no path between the two nodes
    distant = []      # negatives whose shortest path is longer than 4
    while len(unreachable) < half or len(distant) < half:
        n1 = random.choice(candidates)
        n2 = random.choice(candidates)
        if (n1, n2) in reserved:
            continue
        try:
            hops = nx.shortest_path_length(graph, n1, n2)
        except nx.NetworkXNoPath:
            if len(unreachable) < half:
                unreachable.append([n1, n2, 0])
            continue
        if hops > 4 and len(distant) < half:
            distant.append([n1, n2, 0])

    print('len(f1):',len(unreachable))
    print('len(f2):',len(distant))
    print('len(trueInstances):',len(trueInstances))
    # combine and shuffle them into dataframe
    data = sklearn.utils.shuffle(unreachable + distant + trueInstances)
    return pd.DataFrame(data,columns=['Source','Sink','Label'])

In [None]:
# Preprocess the network graph and compute the per-node data the feature
# functions read back through node attributes.
import community as community_louvain
trainGraph = loadTrainDataAsUndirectedGraph() # create network graph

# one set of nodes per connected component
components = list(nx.connected_components(trainGraph))
# list of nodes in the network graph
nodes = list(trainGraph.nodes)
# Map each node to log(size of its connected component). Walking each
# component once replaces the former node x component scan, which copied
# every component to a list for every node.
component_dict = {}
for members in components:
    log_size = np.log(len(members))
    for member in members:
        component_dict[member] = log_size

# partition is used to calculate community-related features
partition = community_louvain.best_partition(trainGraph)
# betweenness and eigenvector centrality are two quantities which measure the importance of the nodes.
betweenness_dict = nx.betweenness_centrality(trainGraph)
eigenvector_dict = nx.eigenvector_centrality(trainGraph)

nx.set_node_attributes(trainGraph, component_dict, 'component')
nx.set_node_attributes(trainGraph, betweenness_dict, 'betweenness')
nx.set_node_attributes(trainGraph, eigenvector_dict, 'eigenvector')
nx.set_node_attributes(trainGraph, partition, 'community')

In [None]:
# Read the public test pairs; the converters keep Source and Sink as strings.
testDF = pd.read_csv('test-public.csv', converters = {'Source': str, 'Sink': str})
# Pair each row's Source and Sink into one tuple column; createBalancedTrainingData
# reads this column to know which pairs belong to the test set.
testDF['Source-Sink'] = list(zip(testDF['Source'], testDF['Sink']))
# Build the labelled training dataframe from the graph.
trainDF = createBalancedTrainingData(trainGraph, testDF)

## 3. Generate Features

In [None]:
# Feature-generating functions; detailed descriptions are in the report.
# Not all of these functions are used in training; some were discarded.
def shortestDistance(graph, n1, n2):
    """Return the reciprocal of the shortest-path length between two nodes.

    Returns 0 when no length can be turned into a reciprocal: the nodes are
    not connected, a node is not in the graph, or the length is 0.
    """
    try:
        return 1 / nx.shortest_path_length(graph, n1, n2)
    except (nx.NetworkXException, KeyError, ZeroDivisionError):
        # NetworkXException covers "no path" and "node not found";
        # ZeroDivisionError covers a path length of 0 (n1 == n2).
        return 0
    
def commonNeighbours(graph, n1, n2):
    """Return how many neighbours ``n1`` and ``n2`` share.

    Returns 0 when networkx cannot evaluate the pair (for example a node is
    not in the graph).
    """
    try:
        # Count lazily instead of building a throwaway list.
        return sum(1 for _ in nx.common_neighbors(graph, n1, n2))
    except (nx.NetworkXException, KeyError):
        return 0

def jaccard(graph, n1, n2):
    """Return the Jaccard coefficient of the pair ``(n1, n2)``.

    Returns 0 when networkx cannot evaluate the pair (for example a node is
    not in the graph).
    """
    try:
        # The generator yields a single (u, v, score) triple for the one pair.
        _, _, score = next(iter(nx.jaccard_coefficient(graph, [(n1, n2)])))
    except (nx.NetworkXException, KeyError, ZeroDivisionError):
        return 0
    return score
    
def adamicAdar(graph, n1, n2):
    """Return the Adamic-Adar index of the pair ``(n1, n2)``.

    Returns 0 when networkx cannot evaluate the pair (for example a node is
    not in the graph, or the index divides by log(1) == 0).
    """
    try:
        # The generator yields a single (u, v, score) triple for the one pair.
        _, _, score = next(iter(nx.adamic_adar_index(graph, [(n1, n2)])))
    except (nx.NetworkXException, KeyError, ZeroDivisionError):
        return 0
    return score

def preferentialAttachment(graph, n1, n2):
    """Return the preferential-attachment score of the pair ``(n1, n2)``.

    Returns 0 when networkx cannot evaluate the pair (for example a node is
    not in the graph).
    """
    try:
        # The generator yields a single (u, v, score) triple for the one pair.
        _, _, score = next(iter(nx.preferential_attachment(graph, [(n1, n2)])))
    except (nx.NetworkXException, KeyError):
        return 0
    return score

def resourceAllocation(graph, n1, n2):
    """Return the resource-allocation index of the pair ``(n1, n2)``.

    Returns 0 when networkx cannot evaluate the pair (for example a node is
    not in the graph).
    """
    try:
        # The generator yields a single (u, v, score) triple for the one pair.
        _, _, score = next(iter(nx.resource_allocation_index(graph, [(n1, n2)])))
    except (nx.NetworkXException, KeyError, ZeroDivisionError):
        return 0
    return score
    
def localPath(graph, n1, n2):
    """Return a weighted count of the short simple paths between two nodes.

    Simple paths with at most 3 edges are enumerated. A path listing 2 nodes
    counts 1, one listing 3 nodes counts 0.1 and one listing 4 nodes counts
    0.01. Returns 0 when networkx cannot enumerate paths for the pair (for
    example a node is not in the graph).
    """
    # Number of paths found, keyed by how many nodes the path lists.
    tally = {2: 0, 3: 0, 4: 0}
    try:
        # Consume the generator directly; only the path lengths are needed.
        for path in nx.all_simple_paths(graph, source=n1, target=n2, cutoff=3):
            size = len(path)
            if size in tally:
                tally[size] += 1
    except (nx.NetworkXException, KeyError):
        return 0
    # Multiply the counts (rather than summing weights per path) so the
    # floating-point result matches the original formula exactly.
    return float(tally[2]) + 0.1 * tally[3] + 0.01 * tally[4]

def Bet(graph, n1, n2):
    """Return the larger reciprocal ``betweenness`` of the two nodes.

    Reads the ``betweenness`` node attribute of ``n1`` and ``n2``. If either
    lookup fails or either value is zero, the result is 0 for the pair as a
    whole (this keeps the original all-or-nothing fallback).
    """
    try:
        b1 = 1 / graph.nodes[n1]['betweenness']
        b2 = 1 / graph.nodes[n2]['betweenness']
    except (KeyError, ZeroDivisionError):
        # KeyError: unknown node or missing attribute.
        # ZeroDivisionError: a betweenness of exactly 0.
        return 0
    return max(b1, b2)

def Eig(graph, n1, n2):
    """Return the larger reciprocal ``eigenvector`` value of the two nodes.

    Reads the ``eigenvector`` node attribute of ``n1`` and ``n2``. If either
    lookup fails or either value is zero, the result is 0 for the pair as a
    whole (this keeps the original all-or-nothing fallback).
    """
    try:
        e1 = 1 / graph.nodes[n1]['eigenvector']
        e2 = 1 / graph.nodes[n2]['eigenvector']
    except (KeyError, ZeroDivisionError):
        # KeyError: unknown node or missing attribute.
        # ZeroDivisionError: a centrality of exactly 0.
        return 0
    return max(e1, e2)

def com_ra(graph, n1, n2):
    """Return the Soundarajan-Hopcroft resource-allocation index of a pair.

    The index is computed on ``graph``; the previous version ignored this
    parameter and always read the global ``trainGraph``. Returns 0 when
    networkx cannot evaluate the pair (for example a node is not in the
    graph or community information is missing).
    """
    try:
        # The generator yields a single (u, v, score) triple for the one pair.
        _, _, score = next(iter(nx.ra_index_soundarajan_hopcroft(graph, [(n1, n2)])))
    except (nx.NetworkXException, KeyError, ZeroDivisionError):
        return 0
    return score

def component(graph, n1, n2):
    """Return the larger ``component`` node attribute of the two nodes.

    Returns 0 when either node, or its ``component`` attribute, is missing.
    """
    try:
        e1 = graph.nodes[n1]['component']
        e2 = graph.nodes[n2]['component']
    except KeyError:
        # Unknown node or attribute never set on that node.
        return 0
    return max(e1, e2)


In [None]:
def addFeaturesToDataframe(graph, data):
    """Add the selected feature columns to ``data`` in place.

    Each feature is computed row by row from the ``Source`` and ``Sink``
    columns and stored under its own column; a progress line is printed
    after each column. Nothing is returned.
    """
    # Features disabled in this run (helpers still exist in this notebook):
    # CommonNeighbours, Jaccard, PreferentialAttachment, ResourceAllocation
    # and Dist. Dist was dropped because a large sentinel distance would be
    # detrimental once the features are normalised.
    feature_table = (
        ('AdamicAdar', adamicAdar),
        ('CCN', com_ra),            # community common nodes
        ('Betweeness', Bet),        # betweenness centrality
        ('Eigenvector', Eig),       # eigenvector centrality
        ('Component', component),   # size of the component the nodes are in
        ('LP', localPath),          # weighted count of short paths
    )
    for column, compute in feature_table:
        data[column] = data.apply(
            lambda row, fn=compute: fn(graph, row.Source, row.Sink),
            axis=1,
        )
        print(f'Added "{column}" column')


In [None]:
# Compute the feature columns for both frames; addFeaturesToDataframe assigns
# the new columns on the frame it is given.
addFeaturesToDataframe(trainGraph, trainDF)
addFeaturesToDataframe(trainGraph, testDF)

In [None]:
# Display the column index of trainDF to check which feature columns exist.
trainDF.columns

In [None]:
# Model inputs: every trainDF column except the identifiers, the label and
# the explicitly excluded features.
# NOTE(review): 'RA', 'PA', 'JC', 'CN' and 'Frequency' are not created by the
# visible addFeaturesToDataframe — presumably leftovers from earlier runs.
FeatureColumns = [value for value in trainDF.columns if value not in ['Source', 'Sink', 'Label', 'RA', 'PA', 'JC', 'CN', 'Frequency', 'Component', 'CCN']] # 'SI', 'HPI', 'LP'

In [None]:
# Display the selected feature column names.
FeatureColumns

## 4. Visualisation

In [None]:
# Plot of feature distribution (comparing success vs failure scenarios)
import seaborn as sns
import matplotlib.pyplot as plt

sns.distributions.distplot(trainDF['Label']).set_title('Distribution of Label')
plt.show()

# Several of these features are disabled in addFeaturesToDataframe, so their
# columns may not exist. Skip the missing ones instead of failing with a
# KeyError on the first absent column.
for name in ['CommonNeighbours', 'Jaccard', 'AdamicAdar','PreferentialAttachment', 'ResourceAllocation', 'Dist', 'SI', 'HPI', 'LP']:
    if name not in trainDF.columns:
        print(f'Skipping "{name}": column not present in trainDF')
        continue
    for label in (0, 1):
        sns.distributions.distplot(trainDF[trainDF['Label']==label][name]).set_title(f'Distribution of {name} when label={label}')
        plt.show()

In [None]:
import community as community_louvain
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx

# NOTE(review): this recomputes the partition and rebinds `partition`; the
# 'community' node attribute set earlier came from a separate
# best_partition call — confirm the two are expected to agree.
partition = community_louvain.best_partition(trainGraph)
# draw network graph showing partitions as different subgraphs
pos = nx.spring_layout(trainGraph)
# color the nodes according to their partitions
# One colour per community id: ids run from 0 to max(partition.values()).
cmap = cm.get_cmap('viridis', max(partition.values()) + 1)
nx.draw_networkx_nodes(trainGraph, pos, partition.keys(), node_size=40,
                       cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(trainGraph, pos, alpha=0.5)
plt.show()

## 4. Feature Scaling

In [None]:
# Scale the features
from sklearn.preprocessing import StandardScaler

y_train = trainDF['Label']
scaler = StandardScaler()
# Build fresh frames from the scaled arrays instead of assigning through
# .loc on slices of trainDF/testDF. That avoids pandas' SettingWithCopy
# warning and guarantees the source frames keep their unscaled values,
# which the logistic-regression cell reads again later.
# The scaler is fitted on the training rows only and reused for the test rows.
X_train = pd.DataFrame(
    scaler.fit_transform(trainDF[FeatureColumns]),
    columns=FeatureColumns,
    index=trainDF.index,
)
X_test = pd.DataFrame(
    scaler.transform(testDF[FeatureColumns]),
    columns=FeatureColumns,
    index=testDF.index,
)

In [None]:
# Summary statistics of the scaled training features.
X_train.describe()

In [None]:
# First rows of the scaled test features.
X_test.head()

## 5. Feature Selection

In [None]:
# # ## Wrapper method
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression(class_weight="balanced")
# selector = RFE(lr, n_features_to_select=3, step=1)
# selector = selector.fit(X_train, y_train)
# allF = pd.DataFrame({'features': X_train.columns,'importance': selector.ranking_})
# importantFeatures = list(allF.query('importance==1')['features'])
# importantFeatures.sort()
# print('importantFeatures = ', importantFeatures)
# allF

In [None]:
## Filter method (LP, AdamicAdar)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# Score every feature with the ANOVA F-test (f_classif) against the label.
s = SelectKBest(f_classif, k=4)
s.fit(X_train, y_train)
# Rank the features by score, best first.
allF = pd.DataFrame({'features': X_train.columns,'scores': s.scores_, 'pvalue':s.pvalues_}).sort_values(by=['scores'],ascending=False)
# NOTE(review): the selector is configured with k=4 but its selection is never
# used; the list below keeps the top 7 ranked features instead — confirm
# which cut-off is intended.
importantFeatures = list(allF['features'])[:7]
# Sorting alphabetically discards the ranking order.
importantFeatures.sort()
print('importantFeatures = ', importantFeatures)
allF

## 6. Train

In [None]:
# Restrict both matrices to the selected features and keep the test row ids,
# which are written to the 'Id' column of every result file.
X_train = X_train[importantFeatures]
X_test = X_test[importantFeatures]
testIds = testDF['Id']

In [None]:
from sklearn.model_selection import cross_val_score

## Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier
# One hidden layer of 3 units. `(3)` is just the integer 3, so the
# one-element tuple is spelled out. The former `momentum=True` is dropped:
# momentum is a float in [0, 1] that only the 'sgd' solver reads, and this
# model uses the default solver.
neuralNetworkUndirected = MLPClassifier(hidden_layer_sizes=(3,), max_iter=300)
# 5-fold cross-validated ROC AUC on the training data.
NN_output = cross_val_score(neuralNetworkUndirected, X_train, y_train, scoring='roc_auc', cv=5)
print('Undirected Scores: ',NN_output)

In [None]:
# Fit the network on the full training set.
neuralNetworkUndirected.fit(X_train, y_train)

In [None]:
# Class probabilities for the test pairs; column index 1 is stored as the
# predicted score for each test Id.
NN_pred = neuralNetworkUndirected.predict_proba(X_test)
neuralNetworkResultUndirected =  pd.DataFrame({'Id': testIds,'Predicted': NN_pred[:,1]})

In [None]:
# Write the MLP predictions to disk.
# NOTE(review): assumes the results/ directory already exists.
neuralNetworkResultUndirected.to_csv('results/neuralNetwork.csv', index=False)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.neural_network import MLPClassifier
# parameters = {
#     'solver': ['lbfgs'],
#     'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ],
#     'alpha': 10.0 ** -np.arange(1, 10),
#     'hidden_layer_sizes':np.arange(1, 10)
# }
# cv = GridSearchCV(MLPClassifier(), parameters, scoring='roc_auc')
# cv.fit(X_train, y_train)

# print("tuned hpyerparameters :(best parameters) ",cv.best_params_)
# print("accuracy :",cv.best_score_)

## SVM

In [None]:
from sklearn import svm
# RBF-kernel SVC; probability=True is what makes predict_proba available
# for the prediction cell that follows.
svmclf = svm.SVC(C = 1, kernel = 'rbf', gamma = 20, decision_function_shape = 'ovo', probability=True)
# 5-fold cross-validated ROC AUC on the training data.
cross_val_score(svmclf, X_train, y_train, scoring='roc_auc', cv = 5)

In [None]:
# Fit the SVM on the full training set.
svmclf.fit(X_train, y_train)

In [None]:
# Class probabilities for the test pairs; column index 1 is stored as the
# predicted score for each test Id.
svm_pred = svmclf.predict_proba(X_test)
svmResult = pd.DataFrame({'Id': testIds,'Predicted': svm_pred[:,1]})

In [None]:
# Write the SVM predictions to disk.
# NOTE(review): assumes the results/ directory already exists.
svmResult.to_csv('results/svm.csv', index=False)

In [None]:
# Display the intercept term of the fitted SVM decision function.
svmclf.intercept_

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with library-default hyperparameters.
rfclf = RandomForestClassifier()
# 5-fold cross-validated ROC AUC on the training data.
cross_val_score(rfclf, X_train, y_train, scoring='roc_auc', cv = 5)

In [None]:
# Fit the random forest on the full training set.
rfclf.fit(X_train, y_train)

In [None]:
# Class probabilities for the test pairs; column index 1 is stored as the
# predicted score for each test Id.
rf_pred = rfclf.predict_proba(X_test)
rfResult = pd.DataFrame({'Id': testIds,'Predicted': rf_pred[:,1]})

In [None]:
# Write the random-forest predictions to disk.
# NOTE(review): assumes the results/ directory already exists.
rfResult.to_csv('results/rf.csv', index=False) # author's note: predictions came out as all zeros and ones

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with library-default hyperparameters.
dtclf = DecisionTreeClassifier()
# 5-fold cross-validated ROC AUC on the training data.
cross_val_score(dtclf, X_train, y_train, scoring='roc_auc', cv = 5)

In [None]:
# Fit the decision tree on the full training set.
dtclf.fit(X_train, y_train)

In [None]:
# Class probabilities for the test pairs; column index 1 is stored as the
# predicted score for each test Id.
dt_pred = dtclf.predict_proba(X_test)
dtResult = pd.DataFrame({'Id': testIds,'Predicted': dt_pred[:,1]})

In [None]:
# Write the decision-tree predictions to disk.
# NOTE(review): assumes the results/ directory already exists.
dtResult.to_csv('results/dt.csv', index=False) # author's note: predictions came out as all zeros and ones

## GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with library-default hyperparameters.
nbclf = GaussianNB()
# 5-fold cross-validated ROC AUC on the training data.
cross_val_score(nbclf, X_train, y_train, scoring='roc_auc', cv = 5)

In [None]:
# Fit the naive Bayes model on the full training set.
nbclf.fit(X_train, y_train)

In [None]:
# Class probabilities for the test pairs; column index 1 is stored as the
# predicted score for each test Id.
nb_pred = nbclf.predict_proba(X_test)
nbResult = pd.DataFrame({'Id': testIds,'Predicted': nb_pred[:,1]})

In [None]:
# Write the naive Bayes predictions to disk.
# NOTE(review): assumes the results/ directory already exists.
nbResult.to_csv('results/nb.csv', index=False) 

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Hand-picked feature subset; lr.coef_ follows this column order.
features = ['LP','Betweeness', 'Eigenvector', 'AdamicAdar']
#features = ['LP','AdamicAdar']
# Start again from the unscaled columns of trainDF/testDF.
X_train1 = trainDF[features]
y_train = trainDF['Label']
X_test1 = testDF[features]
testIds = testDF['Id']
# Standardise with statistics fitted on the training rows only.
# NOTE: this rebinds X_train and X_test to the scaled arrays of these four
# features, so every later cell that reads X_train/X_test uses them.
sc = StandardScaler()
X_train = sc.fit_transform(X_train1)
X_test = sc.transform(X_test1)
lr = LogisticRegression(class_weight="balanced")
# 10-fold cross-validated ROC AUC, then a fit on the full training set.
print(cross_val_score(lr, X_train, y_train, scoring='roc_auc', cv=10))
lr.fit(X_train, y_train)
predictions = lr.predict_proba(X_test)

In [None]:
# Despite the names, X_pred holds the training labels and y_pred the
# predicted test scores (column index 1 of predict_proba).
X_pred = list(trainDF['Label'])
y_pred = list(predictions[:, 1])
# Distribution of the training labels.
sns.distplot(X_pred)

In [None]:
# Distribution of the predicted test scores, for comparison with the labels.
sns.distplot(y_pred)

In [None]:
# Store column index 1 of the logistic-regression probabilities per test Id.
# NOTE(review): assumes the results/ directory already exists.
final_result = pd.DataFrame(data={'Id': testIds, 'Predicted': predictions[:,1]})
final_result.to_csv('results/LogisticRegression.csv', index=False)

In [None]:
# Bar chart of the logistic-regression coefficients.
# lr was fitted on the columns listed in `features`, in that order, so the
# coefficients must be labelled with `features`. `importantFeatures` is
# sorted alphabetically and would attach each bar to the wrong name.
l = pd.DataFrame({
    'features': features,
    'importance': lr.coef_[0]
})
l.plot.bar(x='features', rot=90).set_title('Feature importance')

In [None]:
# Parameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
logreg_cv=GridSearchCV(
    # The grid contains the l1 penalty, which the default solver rejects:
    # those candidates failed to fit and were effectively never searched.
    # liblinear supports both l1 and l2.
    LogisticRegression(class_weight="balanced", solver="liblinear"),
    {
        "C":np.logspace(-3,3,7),
        "penalty":["l1", "l2"],
    },
    cv=10,
    scoring='roc_auc'
)
logreg_cv.fit(X_train, y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
# best_score_ is the mean cross-validated ROC AUC (see `scoring`), not accuracy.
print("best CV roc_auc :",logreg_cv.best_score_)

## XGBoost

In [None]:
import xgboost as xgb
# Train a gradient-boosted model on the current X_train/y_train and write
# its predicted scores for the test pairs.
dtrain=xgb.DMatrix(X_train,label=y_train)
dtest=xgb.DMatrix(X_test)
num_round=50
# 'eta' and 'learning_rate' are two names for the same parameter; the
# original passed both (1 and 0.05), leaving the effective step size
# ambiguous. 0.05 is kept to match the LightGBM cell.
# 'silent' is obsolete; 'verbosity': 0 is its replacement.
parameters={
    'max_depth':7,
    'learning_rate':.05,
    'verbosity':0,
    'objective':'binary:logistic',
    'eval_metric':'auc',
}
xg=xgb.train(parameters,dtrain,num_round)
xgbPredictions=xg.predict(dtest)
xgbResult = pd.DataFrame(data={'Id': testIds, 'Predicted': xgbPredictions})
# NOTE(review): assumes the results/ directory already exists.
xgbResult.to_csv('results/XGBoost.csv', index=False)

## LightGBM

In [None]:
import lightgbm as lgb
# Train a LightGBM binary classifier on the current X_train/y_train and
# write its predicted scores for the test pairs.
train_data=lgb.Dataset(X_train,label=y_train)
num_round=50
# 'num_threads' was listed twice with the same value; the duplicate key is
# removed (a dict literal silently keeps only the last occurrence).
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate':0.05,
    'num_leaves': 40,
    'num_threads': 2,
    'seed': 90051
}
lgbm=lgb.train(parameters,train_data,num_round)
lgbmPredictions=lgbm.predict(X_test)
lgbmResult = pd.DataFrame(data={'Id': testIds, 'Predicted': lgbmPredictions})
# NOTE(review): assumes the results/ directory already exists.
lgbmResult.to_csv('results/lightGBM.csv', index=False)

In [None]:

# 5-fold cross-validation of the LightGBM parameters defined above, scored by AUC.
# NOTE(review): early_stopping_rounds equals num_boost_round (both 10), so
# early stopping has little room to trigger — confirm this is intended.
# NOTE(review): stratified=False means folds are not balanced by label — confirm.
print('cv results = ', lgb.cv(
        parameters,
        train_data,
        num_boost_round=10,
        nfold=5,
        metrics='auc',
        early_stopping_rounds=10,
        stratified=False
        ))


## Stacking

In [None]:
# Stack three base models (multi-layer perceptron, SVM, logistic regression)
# under a logistic-regression meta-learner.
from sklearn.ensemble import StackingClassifier
estimators = [
    # `(3)` is just the integer 3, so the one-element tuple is spelled out.
    # `momentum=True` is dropped: momentum is a float in [0, 1] that only
    # the 'sgd' solver reads, and this model uses the default solver.
    ('MLP', MLPClassifier(hidden_layer_sizes=(3,), max_iter=300)),
    ('SVM', svm.SVC(C = 1, kernel = 'rbf', gamma = 20, decision_function_shape = 'ovo', probability=True)),
    ('LR', LogisticRegression(class_weight="balanced"))
]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
# 5-fold cross-validated ROC AUC on the training data.
cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv = 5)

In [None]:
# Fit the stacked ensemble on the full training set and predict with it.
# The original predicted with `lr`, so the file saved as the stacked result
# actually contained the plain logistic-regression scores.
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)

In [None]:
# Store column index 1 of `predictions` as the predicted score per test Id.
final_result = pd.DataFrame(data={'Id': testIds, 'Predicted': predictions[:,1]})

In [None]:
# Write the final predictions to disk.
# NOTE(review): assumes the results/ directory already exists.
final_result.to_csv('results/StackedClassifiers11.csv', index=False)

In [None]:
# Sign-off banner. A raw string keeps the backslash literal; in a normal
# string `\_` is an invalid escape sequence and triggers a warning on
# recent Python versions. The printed text is unchanged.
print(r"""
(\_/)
( •o•)
/>  > 
"""
)