In [None]:
import warnings
import csv
warnings.filterwarnings('ignore')
import numpy as np
import pickle
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler

# UTILITY FUNCTIONS

In [None]:
# Save an object to disk
def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` (relative to the working directory).

    Args:
        obj: Any picklable Python object.
        name: File stem; the object is written to ``'obj/' + name + '.pkl'``.

    Returns:
        None.
    """
    import os  # local import so the helper works even if only this cell is run

    path = 'obj/' + name + '.pkl'
    # Create the target directory on first use instead of failing with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

# Load a previously saved object file
def load_obj(name):
    """Return the object unpickled from ``obj/<name>.pkl``.

    Args:
        name: File stem; the file read is ``'obj/' + name + '.pkl'``.

    Returns:
        Whatever object the pickle file contains.
    """
    source_path = ''.join(('obj/', name, '.pkl'))
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # were produced by a trusted source.
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
    
# Accuracy report
def check_results(predictions, classifications):
    """Print the accuracy and the per-class classification report.

    Args:
        predictions: Predicted labels.
        classifications: True labels (passed as ``y_true`` to the metrics).
    """
    print("Accuracy:")
    accuracy = accuracy_score(y_true=classifications, y_pred=predictions)
    print(accuracy)
    report = classification_report(y_true=classifications, y_pred=predictions)
    print(report)

# LOAD DATA

In [None]:
# Load the three pickled frames, in the same order as before.
df_train_pos, df_train_neg, df_test = (
    load_obj(stem) for stem in ('df_train_pos', 'df_train_neg', 'df_test')
)

# Label convention used throughout: 1 = positive training rows,
# 0 = negative training rows, -1 = test rows.
for frame, label_value in ((df_train_pos, 1), (df_train_neg, 0), (df_test, -1)):
    frame['label'] = label_value

print(df_train_pos.shape, df_train_neg.shape, df_test.shape)

# CALCULATE ADDITIONAL FEATURES

In [None]:
# Replace the value -123.0 in 'aa' by .001 in every frame.
# NOTE(review): presumably -123.0 is a "not computable" sentinel -- confirm.
for frame in (df_train_neg, df_train_pos, df_test):
    frame['aa'] = frame['aa'].replace(-123.0, .001)

# Drop the identifier columns; only the training frames carry an 'index' column.
df_train_pos = df_train_pos.drop(['source', 'sink', 'index'], axis=1)
df_train_neg = df_train_neg.drop(['source', 'sink', 'index'], axis=1)
df_test = df_test.drop(['source', 'sink'], axis=1)

# Re-apply the labels (1 = positive, 0 = negative, -1 = test).
df_train_pos['label'] = 1
df_train_neg['label'] = 0
df_test['label'] = -1

# Sanity check: does any frame contain NaN?
tuple(frame.isnull().values.any() for frame in (df_train_pos, df_train_neg, df_test))

## In-degree ratio

In [None]:
# inDegreeRatio = sink in-degree / source in-degree, added to every frame.
# NOTE(review): a source in-degree of 0 would produce inf (or NaN for 0/0);
# the check below only reports NaN, not inf -- confirm the data excludes zeros.
for frame in (df_train_pos, df_train_neg, df_test):
    frame['inDegreeRatio'] = frame.sink_inDegree / frame.source_inDegree

# Sanity check: does any frame contain NaN?
tuple(frame.isnull().values.any() for frame in (df_train_pos, df_train_neg, df_test))

## Common neighbors

In [None]:
# common_neighbors = common followees + common followers, added to every frame.
for frame in (df_train_pos, df_train_neg, df_test):
    frame['common_neighbors'] = frame.common_followees + frame.common_followers

# Sanity check: does any frame contain NaN?
tuple(frame.isnull().values.any() for frame in (df_train_pos, df_train_neg, df_test))

## Jaccard IN 

In [None]:
# jaccard_in = common followers / (source in-degree + sink in-degree).
# NOTE(review): the denominator is the plain sum of the two in-degrees, not the
# union size (sum minus common_followers) of the textbook Jaccard index, and it
# yields NaN/inf when both in-degrees are 0 -- confirm both are intended.
for frame in (df_train_pos, df_train_neg, df_test):
    in_degree_total = frame.source_inDegree + frame.sink_inDegree
    frame['jaccard_in'] = frame.common_followers / in_degree_total

# Sanity check: does any frame contain NaN?
tuple(frame.isnull().values.any() for frame in (df_train_pos, df_train_neg, df_test))

## Preferential Attachment

In [None]:
# preferential_attachment_out = source out-degree * sink in-degree.
for frame in (df_train_pos, df_train_neg, df_test):
    frame['preferential_attachment_out'] = frame.source_outDegree * frame.sink_inDegree

# Sanity check: does any frame contain NaN?
tuple(frame.isnull().values.any() for frame in (df_train_pos, df_train_neg, df_test))

In [None]:
# preferential_attachment_in = source in-degree * sink out-degree.
for frame in (df_train_pos, df_train_neg, df_test):
    frame['preferential_attachment_in'] = frame.source_inDegree * frame.sink_outDegree

# Sanity check: does any frame contain NaN?
tuple(frame.isnull().values.any() for frame in (df_train_pos, df_train_neg, df_test))

# MODELING

### Create train and test data

In [None]:
# Stack the positive rows on top of the negative rows to form the labelled set.
df_pos, df_neg = df_train_pos, df_train_neg
df_training = pd.concat([df_pos, df_neg])
print(df_training.shape)

# Split the target column from the feature columns.
df_training_y = df_training['label']
df_training_x = df_training.drop(columns=['label'])

In [None]:
# Fraction of the labelled rows held out for evaluation.
# 0.0 keeps every labelled row for training and leaves the hold-out set empty.
# Set it to e.g. 0.2 to get a non-empty X_test / y_test.
HOLDOUT_FRACTION = 0.0
SPLIT_SEED = 42

if HOLDOUT_FRACTION > 0:
    X_train, X_test, y_train, y_test = train_test_split(
        df_training_x, df_training_y,
        test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED)
else:
    # Current scikit-learn releases reject test_size=0.0, so the "no hold-out"
    # split is built by hand: all rows, shuffled with the same seed, plus an
    # empty hold-out that keeps the column layout.
    shuffled_rows = np.random.RandomState(SPLIT_SEED).permutation(len(df_training_x))
    X_train, y_train = df_training_x.iloc[shuffled_rows], df_training_y.iloc[shuffled_rows]
    X_test, y_test = df_training_x.iloc[:0], df_training_y.iloc[:0]

# Columns to exclude from the feature matrix (none at the moment); the same
# list is applied to the train, hold-out and full training feature frames.
droppingcolumns = []
X_train = X_train.drop(droppingcolumns, axis=1)
X_test = X_test.drop(droppingcolumns, axis=1)
df_training_x = df_training_x.drop(droppingcolumns, axis=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### SCALING

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# then applied unchanged to the hold-out rows and to the test frame.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# The hold-out rows must go through the same scaling as the training rows,
# otherwise any evaluation on X_test feeds unscaled features to a model that
# was trained on scaled ones. Skipped when the hold-out set is empty, because
# the scaler cannot transform zero rows.
if len(X_test) > 0:
    X_test = scaler.transform(X_test)

# The test frame additionally carries the placeholder 'label' column.
droppingcolumnsFeatures = droppingcolumns + ['label']
test_data = df_test.drop(droppingcolumnsFeatures, axis=1)
test_data = scaler.transform(test_data)
print(test_data.shape)

### Try models

In [None]:
import itertools

# Candidate architectures: every 4-hidden-layer MLP whose layer widths are
# drawn from {23, 24, 25, 26} (4**4 = 256 combinations).
# Other grids tried earlier: itertools.product((7, 8, 9), repeat=4), and the
# shrinking pyramid (17,), (17, 16), (17, 16, 15), ..., (17, 16, ..., 2, 1).
tuples = list(itertools.product((23, 24, 25, 26), repeat=4))
print(len(tuples))

if len(X_test) == 0:
    # Every metric below is computed on the hold-out set; without one the
    # sweep would fail right after the first (expensive) model fit.
    print("Hold-out set is empty; skipping the architecture sweep.")
else:
    for tup in tuples:
        model = MLPClassifier(hidden_layer_sizes=tup, max_iter=1500, random_state=42)
        model.fit(X_train, y_train)

        # Predict once and reuse the result for every metric.
        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)[:, 1]

        # labels=[0, 1] keeps the matrix 2x2 even if one class is absent.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
        print(tup, ', Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)), "AUC: ",
              roc_auc_score(y_test, y_score), ", Predicted:", model.predict(test_data).sum(),
              ", TP:", tp, ", FP:", fp, ", TN:", tn, ", FN:", fn)

In [None]:
# Final classifier: an MLP with a single hidden layer of 17 units.
# `(17,)` is a one-element tuple; the previous `(17)` was just the integer 17.
model = MLPClassifier(hidden_layer_sizes=(17,), max_iter=1500, random_state=42)
# model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Hold-out evaluation, currently disabled. Un-comment to report accuracy, AUC
# and the confusion matrix on X_test / y_test:
#   y_pred = model.predict(X_test)
#   y_pred_proba = model.predict_proba(X_test)[:,1]
#   print('Accuracy of classifier on test set: {:.6f}'.format(model.score(X_test, y_test)))
#   print('AUC of classifier PROB on test set: {:.6f}'.format(roc_auc_score(y_test, model.predict_proba(X_test)[:,1])))
#   print('AUC of classifier LABEL on test set: {:.6f}'.format(roc_auc_score(y_test, model.predict(X_test))))
#   tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
#   print(" TP:",tp,", FP:",fp,", TN:",tn,", FN:",fn)
#   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
#   print("AUC score of classifier on test set: ", auc(false_positive_rate, true_positive_rate))
#   check_results(y_pred,y_test)

# Score the test rows: probability of the positive class and the hard label.
predictions_proba = model.predict_proba(test_data)[:, 1]
predictions = model.predict(test_data)

# Number of predicted positives, and the sum of the positive-class probabilities.
print(predictions.sum(), predictions_proba.sum())

In [None]:
# Write the positive-class probabilities to output.csv, one row per test
# example, with Ids numbered from 1 in prediction order.
myFields = ['Id', 'Prediction']
with open("output.csv", 'w', newline='') as resultFile:
    writer = csv.DictWriter(resultFile, fieldnames=myFields)
    writer.writeheader()
    writer.writerows(
        {'Id': row_id, 'Prediction': probability}
        for row_id, probability in enumerate(predictions_proba, start=1)
    )
print("done")

### Feature selection 

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV


# Recursive feature elimination with cross-validation: one feature is removed
# per step and each subset is scored by accuracy with 2-fold stratified CV.
# NOTE: despite its name, `svc` is a LogisticRegression estimator.
svc = LogisticRegression(C=15, max_iter=500, tol=.0000001, random_state=42)
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy', verbose=2, n_jobs=-1)
rfecv.fit(X_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was removed from scikit-learn; newer releases expose the mean
# CV score per feature subset in `cv_results_`. Fall back to the old attribute
# so the cell keeps working on older installations.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

# Plot number of features vs. cross-validation score
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

In [None]:
# Display the feature ranking from the RFECV fit (rank 1 marks a selected feature).
rfecv.ranking_

In [None]:
# Preview the first rows of the combined training frame (features plus 'label').
df_training.head()

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each feature and the label, computed on the
# training data. (This cell previously referenced `kepler_X`, `kepler_y` and
# `kepler`, which are not defined anywhere in this notebook.)
feature_names = list(df_training_x.columns)
assert len(feature_names) == X_train.shape[1], "feature names do not match X_train"
mutual_information = mutual_info_classif(X_train, y_train, random_state=42)

# One-row heatmap: one annotated cell per feature.
plt.subplots(1, figsize=(26, 1))
sns.heatmap(mutual_information[np.newaxis, :], cmap='Blues', cbar=False, linewidths=1, annot=True)
plt.yticks([], [])
plt.gca().set_xticklabels(feature_names, rotation=45, ha='right', fontsize=12)
plt.suptitle("Variable Importance (mutual_info_classif)", fontsize=18, y=1.2)
plt.gcf().subplots_adjust(wspace=0.2)
plt.show()

### Try different models

In [None]:
# Candidate classifiers keyed by display name (insertion order preserved).
models = {
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),
    'Logistic Regression': LogisticRegression(),
    'Multilayer Perceptron': MLPClassifier(),
    'SVM': svm.SVC(),
    'Gaussian NB': GaussianNB(),
    'Adaboost Classifier': AdaBoostClassifier(),
    'KNN': KNeighborsClassifier(3),
    'Gaussian Process Classifier': GaussianProcessClassifier(1.0 * RBF(1.0)),
    'Decision Tree Classifier': DecisionTreeClassifier(max_depth=5),
}

### Grid Search 

In [None]:
# Exhaustive search over 2 x 3 x 2 = 12 random-forest configurations.
param_grid = dict(
    n_estimators=[200, 500],
    max_depth=[4, 6, 8],
    criterion=['gini', 'entropy'],
)
clf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    n_jobs=-1,
    verbose=2,
)
# NOTE: fit() returns the search object itself, so `best_model` is the fitted
# GridSearchCV (the same object as `clf`), not the best estimator.
best_model = clf.fit(X_train, y_train)
print("Best score: {:0.4f}".format(clf.best_score_))
print(clf.best_params_)

### Plotting

In [None]:
# ROC curve of the current `model` on the hold-out set.
if len(X_test) == 0:
    # Nothing to evaluate on; predict/roc_curve would fail on zero rows.
    print("Hold-out set is empty; skipping the ROC plot.")
else:
    # Use the positive-class probabilities for both the curve and the area:
    # computing AUC from hard 0/1 predictions collapses the curve to a single
    # threshold and does not match the plotted line.
    y_score = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_score)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)

    plt.figure()
    # Label the curve with the actual estimator class instead of a fixed name.
    plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (type(model).__name__, roc_auc))
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Pair plot of a feature group, coloured by label (1 = positive, 0 = negative,
# -1 = test). Swap `pairplot_columns` for one of the alternatives to inspect
# another group:
#   ['sink_inDegree', 'sink_outDegree', 'label']
#   ['source_inDegree', 'source_outDegree', 'label']
#   ['sink_pr', 'source_pr', 'label']
#   ['aa', 'outDegreeRatio', 'outInRatio', 'label']
#   ['common_neighbors', 'jaccard_in', 'jaccard_out', 'preferential_attachment_out', 'preferential_attachment_in', 'label']
# Alternative row sets tried earlier:
#   pd.concat([df_train_pos_sample, df_train_neg_sample, df_test])
#   pd.concat([df_test, df_train_pos_sample])
pairplot_columns = ['common_followers', 'common_followees', 'triadic_closure', 'followback', 'label']
df = pd.concat([df_train_pos, df_train_neg, df_test])[pairplot_columns]
sns.pairplot(df.astype(float), height=4.5, hue='label')