## Load necessary lib

In [1]:
import pandas as pd 
import numpy as np 
import itertools
import networkx as nx
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix,accuracy_score
import statsmodels.api as sm

## Define function

In [2]:
#loading train data
def loadTrainData():
    filename = "train.txt"
    return [line.rstrip("\n") for line in open(filename)]

# this is your graph function, mainly use this data structure to generate psuedo false example
def createUndirectedGraph(dataRow):
    g = nx.Graph()
    for row in dataRow:
        authorIds = row.split()
        for i, author in enumerate(authorIds):
            for coauthor in authorIds[i+1:]:
                if g.has_edge(author, coauthor):
                    g[author][coauthor]['frequency'] += 1 # TODO: train this (e.g. noise in the training data)
                else:
                    g.add_edge(author, coauthor, frequency=1)
    return g

# preprocessing train data
def pre_row(trainRow):
    txt = [list(map(int, i.split())) for i in trainRow]
    trainGraph = createUndirectedGraph(trainRow)
    g1 = {}
    converted_txt = []
    tmp = []
    for link in txt:
        for subset in itertools.permutations(link, 2):
            tmp.append(subset[0])
            tmp.append(subset[1])
            converted_txt.append(subset)
    train1 = pd.DataFrame(converted_txt, columns=["srce", "dest"])
    freq = train1.groupby(["srce", "dest"]).size().values
    #train1['freq'] = freq
    txt_1 = sorted(set(converted_txt))
    df1 = pd.DataFrame(txt_1, columns=["srce", "dest"])
    #df1['freq'] = freq
    df1['score'] = 1
    for points, f in zip(txt_1, freq):
        g1[points[0]] = g1.setdefault(points[0], [])
        g1[points[0]].append((points[1], f))
    V = list(set(tmp))
    return g1, V, df1, txt_1

#functions that generating features 
def fu(g1, u):
    return 1

def getNodeScore(g1, Nu):
    AA = 0
    RA = 0
    CCN = len(Nu)
    CRA = 0
    if len(Nu) == 0:
        return 0, 0, 0, 0
    for u in Nu:
        AA += 1/np.log(len(g1[u]))
        RA += 1/len(g1[u])
        CCN += fu(g1, u)
        CRA += fu(g1, u)/len(g1[u])
    return AA, RA, CCN, CRA

def Pxy(g1, x, y):
    try:
        Gx = g1[x]
        Gy = g1[y]
    except:
        return 0, 0, 0, 0, 0, 0, 0, 0
    Nx = [i[0] for i in Gx]
    Ny = [i[0] for i in Gy]
    NxINy = list(set(Nx) & set(Ny))
    NxUNy = list(set(Nx + Ny))
    AA, RA, CCN, CRA = getNodeScore(g1, NxINy)
    cardNx = len(Nx)
    cardNy = len(Ny)
    PA = cardNx * cardNy
    JC = len(NxINy)/len(NxUNy)
    HPI = len(NxINy)/min(cardNx, cardNy)
    HDI = len(NxINy)/max(cardNx, cardNy)
    return AA, len(NxINy), len(NxINy)/np.sqrt(cardNx*cardNy), JC, HPI, HDI, PA, RA

In [3]:
#loading data 
test = pd.read_csv("test-public.csv")
train1 = loadTrainData()

# an attempt to split the data into train and dev set
random.shuffle(train1)
m = int(len(train1)*0.9)
trainRow = train1[0:m]
devRow = train1[m:]

In [4]:
#only use train set to construct the model
g1, V, df1, txt_1 = pre_row(trainRow)

In [5]:
#this structure use the whole dataset in order to evaluate the model
trainGraph  = createUndirectedGraph(train1)

In [6]:
#generate psudo-false-example
psuedo = []
i = 0
while len(psuedo) < len(txt_1):
    random.seed(i)
    nodes = random.sample(V, 2)
    n1, n2 = nodes[0], nodes[1]
    try:
        path = nx.dijkstra_path(trainGraph, source = n1, target = n2)
    except:
        if [n1, n2, 0] not in psuedo:
            psuedo.append([n1, n2, 0])
    i += 1

In [7]:
#add psudo example into train data set
df2 = pd.DataFrame(psuedo, columns = ['srce', 'dest', 'score'])
df3 = df1.append(df2)

In [8]:
#generate features
AAs, CNs, SIs, JCs, HPIs, HDIs, PAs, RAs = [],[],[],[],[],[],[],[]
for idx, i in df3.iterrows():
    n1 = i[0]
    n2 = i[1]
    AA, CN, SI, JC, HPI, HDI, PA, RA = Pxy(g1, n1, n2)
    AAs.append(AA)
    CNs.append(CN)
    SIs.append(SI)
    JCs.append(JC)
    HPIs.append(HPI)
    HDIs.append(HDI)
    PAs.append(PA)
    RAs.append(RA)
    
df3['AA'] = AAs
df3['CN'] = CNs
df3['SI'] = SIs
df3['JC'] = JCs
df3['HPI'] = HPIs
df3['HDI'] = HDIs
df3['PA'] = PAs
df3['RA'] = RAs

In [9]:
# train_test_split
X = df3[['AA', 'CN', 'SI', 'JC', 'HPI', 'HDI', 'PA', 'RA']]
y = df3['score']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

## model1

In [10]:
# model1 GaussianNB, not good, currently discard
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train1 = sc.fit_transform(X_train)
X_test1 = sc.transform(X_test)

In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score
classifier = GaussianNB()
classifier.fit(X_train1, y_train)
y_pred = classifier.predict(X_test1)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test,y_pred)
print(acc)
print(cm)

0.9424924332148967
[[7560  117]
 [ 757 6764]]


## model2

In [12]:
# model2, logistic regression, not good, currently discard
features = ['AA', 'CN', 'SI', 'JC', 'HPI', 'HDI', 'PA', 'RA']
X_train2 = X_train[features]
X_test2 = X_test[features]
X_test2['intercept'] = 1.0
X_train2['intercept'] = 1.0
logit = sm.Logit(y_train, X_train2)
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.176453
         Iterations 12


In [13]:
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                  score   No. Observations:                45594
Model:                          Logit   Df Residuals:                    45585
Method:                           MLE   Df Model:                            8
Date:                Fri, 09 Apr 2021   Pseudo R-squ.:                  0.7454
Time:                        22:13:46   Log-Likelihood:                -8045.2
converged:                       True   LL-Null:                       -31603.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
AA            25.4031      2.956      8.593      0.000      19.609      31.197
CN            -5.3453      0.596     -8.964      0.000      -6.514      -4.177
SI            45.0351      3.323     13.552      0.0

In [14]:
# predicting probability and use threshold of 0.5 to determine labels
tmp = X_test2.dot(result.params)
y_pred = 1/(1+np.exp(-tmp))
y_pred1 = []
for i in y_pred:
    if i > 0.5:
        y_pred1.append(1)
    else:
        y_pred1.append(0)

In [15]:
cm = confusion_matrix(y_test, y_pred1)
ac = accuracy_score(y_test,y_pred1)
print(ac)
print(cm)

0.9496644295302014
[[7544  133]
 [ 632 6889]]


In [16]:
#preparing data structure for AUC and precision evaulation for logistic regression
X_psuedo = []
for i in psuedo:
    n1 = i[0]
    n2 = i[1]
    AA, CN, SI, JC, HPI, HDI, PA, RA = Pxy(g1, n1, n2)
    X_psuedo.append([AA, CN, SI, JC, HPI, HDI, PA, RA, 1])
new = np.array(X_psuedo)
tmp1 = new.dot(result.params)
y_psuedo = 1/(1+np.exp(-tmp1))

In [17]:
X_dev = []
g_dev, V_dev, df_dev, txt_dev = pre_row(devRow) 
for i in txt_dev:
    n1 = i[0]
    n2 = i[1]
    AA, CN, SI, JC, HPI, HDI, PA, RA = Pxy(g1, n1, n2)
    X_dev.append([AA, CN, SI, JC, HPI, HDI, PA, RA, 1])
new = np.array(X_dev)
tmp1 = new.dot(result.params)
y_dev = 1/(1+np.exp(-tmp1))

In [18]:
# evalution function implemented, mentioned in paper https://arxiv.org/pdf/1010.0725.pdf
def AUC(R, M):
    nume = 0
    deno = 0
    idx = 0
    for i in R:
        deno += len(M)
        nume += sum(i > j for j in M)
        nume += 0.5*list(M).count(i)
        if idx%100 == 0:
            pass
            #print(nume, deno, nume/deno)
        idx += 1
    return nume/deno

def precision(R, M, thres):
    s1 = sum(i > thres for i in R)
    s2 = sum(j > thres for j in M)
    return (s1)/(s1+s2)

In [19]:
# randomly pick 1000 instances to test, otherwise it takes too long to calculate
n_dev = random.sample(list(y_dev), 1000)
n_psuedo = random.sample(list(y_psuedo), 1000)
print(AUC(n_dev, n_psuedo))
print(precision(y_dev, y_psuedo, 0.5))

0.824787
0.8455821635012386


In [20]:
# try to draw something
import matplotlib.pyplot as plt
thres = np.array(list(range(0,11)))/10
prec_list = []
for i in thres:
    prec_list.append(precision(y_dev, y_psuedo, i))
plt.plot(thres, prec_list)
plt.xlabel('threshold')
plt.ylabel('precision')
plt.show()



<Figure size 640x480 with 1 Axes>

In [21]:
# trying to predict test and output to csv file, currently discard
pre = []
for idx, i in test.iterrows():
    n1 = i[1]
    n2 = i[2]
    AA, CN, SI, JC, HPI, HDI, PA, RA  = Pxy(g1, n1, n2)
    pre.append([AA, CN, SI, JC, HPI, PA, 1])
new = np.array(pre)
tmp1 = new.dot(result.params)
y_pred = 1/(1+np.exp(-tmp1))
test['Predicted'] = y_pred
test[['Id', 'Predicted']].to_csv('results/EdgeAddClassifier.csv', index=False)

ValueError: shapes (2000,7) and (9,) not aligned: 7 (dim 1) != 9 (dim 0)

In [22]:
# k-fold attempt, not really implemented here, whole loop is in ass1.py
kf = KFold(n_splits=5,shuffle=False)
traindf = np.array(trainRow)
for train_index, dev_index in kf.split(trainRow):
    X_train5, X_dev5 = traindf[train_index], traindf[dev_index]

In [23]:
g_dev1, V_dev1, df_dev1, txt_dev1 = pre_row(X_train5)

## Model 3

In [24]:
# svm model
from sklearn import svm
clf1 = svm.SVC(C = 1, kernel = 'rbf', gamma = 20, decision_function_shape = 'ovo')
clf1.fit(X_train2, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma=20, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [25]:
print(clf1.score(X_train2, y_train))


0.9586568408123876


In [26]:
print(clf1.score(X_test2, y_test))

0.9480852743782077


In [27]:
# predicting dev set
X_dev = []
g_dev, V_dev, df_dev, txt_dev = pre_row(devRow) 
for i in txt_dev:
    n1 = i[0]
    n2 = i[1]
    AA, CN, SI, JC, HPI, HDI, PA, RA  = Pxy(g1, n1, n2)
    X_dev.append([AA, CN, SI, JC, HPI, HDI, PA, RA, 1])
y_dev = clf1.predict(X_dev)

In [28]:
clf1.score(X_dev, y_dev)

1.0

In [29]:
# predicting test set
pre = []
for idx, i in test.iterrows():
    n1 = i[1]
    n2 = i[2]
    AA, CN, SI, JC, HPI, HDI, PA, RA  = Pxy(g1, n1, n2)
    pre.append([AA, CN, SI, JC, HPI, HDI, PA, RA, 1])
y_pred = clf1.predict(pre)

In [30]:
# predicting psuedo set
X_psuedo = []
for i in psuedo:
    n1 = i[0]
    n2 = i[1]
    AA, CN, SI, JC, HPI, HDI, PA, RA = Pxy(g1, n1, n2)
    X_psuedo.append([AA, CN, SI, JC, HPI, HDI, PA, RA, 1])


In [31]:
# generating probability for each instance in dev set and test set
decision_f = clf1.decision_function(pre)
decision_f1 = clf1.decision_function(X_dev)

In [32]:
# generate probability for each instance in psuedo set
decision_f2 = clf1.decision_function(X_psuedo)
# normalize probability
map_psuedo = []
max_d = max(decision_f2)
min_d = min(decision_f2)
for i in decision_f2:
    map_psuedo.append((i-min_d)/(max_d - min_d))

In [33]:
# normalize prob.
map_dev = []
max_d = max(decision_f1)
min_d = min(decision_f1)
for i in decision_f1:
    map_dev.append((i-min_d)/(max_d - min_d))

In [34]:
# normalize prob.
map_pred = []
max_d = max(decision_f)
min_d = min(decision_f)
for i in decision_f:
    map_pred.append((i-min_d)/(max_d - min_d))

In [35]:
# evaluate
n_dev = random.sample(list(map_dev), 1000)
n_psuedo = random.sample(list(map_psuedo), 1000)
print(AUC(n_dev, n_psuedo))
print(precision(map_dev, map_psuedo, 0.5))

0.813581
0.8536863729742068


In [81]:
# this output get kaggle score above 0.8
test['Predicted'] = map_pred
test[['Id', 'Predicted']].to_csv('results/EdgeAddClassifier.csv', index=False)