In [1]:
import pandas as pd
import numpy as np
import preprocess
from sklearn.model_selection import train_test_split
from iterative_classifier import IterativeClassifier
from sklearn.metrics import classification_report, confusion_matrix

### Extract numerical features from the csv file

In [2]:
# Load the per-node table and discard the 'confessed_assignments' column.
df = pd.read_csv('../data/imputed_unified_node_data.csv').drop(columns=['confessed_assignments'])
names = df['name'].values.tolist()

# Binarize one column in place: every strictly positive value becomes 1, and
# anything else (zero, negative, NaN) is left exactly as it was.
# NOTE(review): position 21 is presumably 'num_confessed_assignments', the
# column later cells downsample on -- confirm against the CSV header.
BINARIZED_COL_IDX = 21
binarized_col = df.columns[BINARIZED_COL_IDX]
df[binarized_col] = df[binarized_col].mask(df[binarized_col] > 0, 1)

### Build the edge-weight lookup (avg/max/min)

In [3]:
# keep_default_na=False stops pandas from turning strings such as "NA" or
# "null" into NaN, so every cell is kept as written in the file.
avg_edge_weights = pd.read_csv('../data/avg_edge_weights.csv', keep_default_na=False).to_numpy()
max_edge_weights = pd.read_csv('../data/max_edge_weights.csv', keep_default_na=False).to_numpy()
min_edge_weights = pd.read_csv('../data/min_edge_weights.csv', keep_default_na=False).to_numpy()

# Adjacency lookup:
#   key   - value found in column 0 or column 1 of an edge row
#   value - list of [other endpoint, avg, max, min], one entry per edge row
# Each row is registered under both of its endpoints.
# NOTE(review): the max/min tables are indexed by the row position of the avg
# table, so the three files are assumed to list edges in the same order.
edge_weight = {}
for row_idx, avg_row in enumerate(avg_edge_weights):
    node_a = avg_row[0]
    node_b = avg_row[1]
    neighbours_a = edge_weight.setdefault(node_a, [])
    neighbours_b = edge_weight.setdefault(node_b, [])

    avg_w = avg_row[2]
    max_w = max_edge_weights[row_idx][2]
    min_w = min_edge_weights[row_idx][2]
    neighbours_a.append([node_b, avg_w, max_w, min_w])
    neighbours_b.append([node_a, avg_w, max_w, min_w])

### Split the dataset into training and test sets

In [4]:
X_train, X_val, X_test, y_train, y_val, y_test = preprocess.stratified_train_val_test_split(df)

# Re-attach the target to the training features so the training rows can be
# downsampled on 'num_confessed_assignments'; only the training portion is
# downsampled.
train_df = pd.concat([X_train, y_train], axis='columns')
train_df = preprocess.downsample(train_df, 'num_confessed_assignments', random_state=0)

# After the concat above the target is the last column: everything before it
# is the feature matrix, the last column is the flat label list.
X_train = train_df.iloc[:, :-1].values.tolist()
y_train = train_df.iloc[:, -1:].values.flatten().tolist()

# The validation and test portions are merged into a single evaluation set.
# y_test is flattened the same way as y_train so both are flat lists of
# scalars regardless of whether the split returns Series or one-column frames.
X_test = pd.concat([X_val, X_test]).values.tolist()
y_test = pd.concat([y_val, y_test]).values.flatten().tolist()

In [None]:
# Neighbour count handed to the classifier's n_neighbors argument.
N_NEIGHBORS = 9

# Build the classifier and fit it on the downsampled training rows, passing
# the edge-weight lookup alongside the features and labels.
clf = IterativeClassifier(n_neighbors=N_NEIGHBORS)
clf.fit(X_train, edge_weight, y_train)

In [None]:
# Hard class predictions for the evaluation rows.
y_predict = clf.predict(X_test, edge_weight, max_iter=1000)
print(y_predict)

# Ground truth comes from y_test (the same labels the metrics cells use),
# flattened so each entry is a scalar. It is NOT read from the last element of
# the X_test row: X_test is built from the X_* outputs of the split, so that
# element is presumably a feature rather than the label.
# TODO(review): confirm against preprocess.stratified_train_val_test_split.
true_labels = np.asarray(y_test).ravel().tolist()

# Element 0 of every feature row is used as the row label of the output table.
names = [row[0] for row in X_test]
results = [[truth, predicted] for truth, predicted in zip(true_labels, y_predict)]

df = pd.DataFrame(results, index=names, columns=['num_confessed_assignments', 'predict_confess'])
df.to_csv('result.csv')

In [7]:
# Per-class precision / recall / F1 of the hard predictions against y_test.
report_text = classification_report(y_test, y_predict)
print(report_text)

              precision    recall  f1-score   support

           0       0.86      0.19      0.31       264
           1       0.14      0.82      0.24        44

    accuracy                           0.28       308
   macro avg       0.50      0.50      0.27       308
weighted avg       0.76      0.28      0.30       308



In [8]:
# Confusion matrix of the hard predictions: rows are true classes, columns
# are predicted classes.
conf_matrix = confusion_matrix(y_test, y_predict)
print(conf_matrix)

[[ 49 215]
 [  8  36]]


In [None]:
# Per-row probabilities; entry [0] / [1] of each row is compared below as the
# score for label 0 / label 1.
y_predict = clf.predict_proba(X_test, edge_weight, max_iter=10000)

# Flat scalar labels, so the comparisons below work whether y_test is a flat
# list or a list of one-element lists.
true_labels = np.asarray(y_test).ravel().tolist()

# Number of rows whose more probable class equals the true label. Exact ties
# between the two probabilities are not counted as correct.
# Not written to the CSV; kept for interactive inspection.
correct = sum(
    1
    for proba, truth in zip(y_predict, true_labels)
    if (proba[0] > proba[1] and truth == 0) or (proba[0] < proba[1] and truth == 1)
)

# Sort name, label and probability together so each name stays attached to
# its own row. Sorting the rows alone and then indexing them with a name list
# in original order would label every row with the wrong node.
# The ground truth is taken from y_test rather than from the last element of
# the X_test row, which is presumably a feature, not the label.
# TODO(review): confirm against preprocess.stratified_train_val_test_split.
ranked = sorted(
    ([row[0], truth, proba[1]] for row, truth, proba in zip(X_test, true_labels, y_predict)),
    key=lambda rec: rec[2],
    reverse=True,
)
ranked_names = [rec[0] for rec in ranked]
results = [[rec[1], rec[2]] for rec in ranked]

df = pd.DataFrame(results, index=ranked_names, columns=['num_confessed_assignments', 'predict_confess_prob'])
df.to_csv('result_prob.csv')