In [1]:
import pandas as pd
import numpy as np
import preprocess
from sklearn.model_selection import train_test_split
from iterative_classifier import IterativeClassifier
from sklearn.metrics import classification_report

### Extract numerical features from the csv file

In [2]:
df = pd.read_csv('../data/imputed_unified_node_data.csv').drop(columns=['confessed_assignments'])
names = df['name'].values.tolist()

### Generate edge weight list (avg/max/min)

In [3]:
avg_edge_weights = pd.read_csv('../data/avg_edge_weights.csv', keep_default_na=False).to_numpy()
max_edge_weights = pd.read_csv('../data/max_edge_weights.csv', keep_default_na=False).to_numpy()
min_edge_weights = pd.read_csv('../data/min_edge_weights.csv', keep_default_na=False).to_numpy()

# dict: 
# key - source
# value - [destination, avg, max, min]
edge_weight = {}
for i in range(len(avg_edge_weights)):
    source = avg_edge_weights[i][0]
    if not source in edge_weight:
        edge_weight[source] = []
    edge_weight[source].append([avg_edge_weights[i][1], avg_edge_weights[i][2], max_edge_weights[i][2], min_edge_weights[i][2]])

### Split the dataset into training and test set

In [4]:
X_train, X_val, X_test, y_train, y_val, y_test = preprocess.stratified_train_val_test_split(df)
X_train = X_train.values.tolist()
y_train = y_train.values.tolist()
X_test = pd.concat([X_val, X_test]).values.tolist()
y_test = pd.concat([y_val, y_test]).values.tolist()

for i in range(len(y_train)):
    if y_train[i] > 0:
        y_train[i] = 1
for i in range(len(y_test)):
    if y_test[i] > 0:
        y_test[i] = 1

In [5]:
clf = IterativeClassifier(n_neighbors=9)
clf.fit(X_train, edge_weight, y_train)

no source jessica_mack
0 jessica_mack
0 jessica_mack
0 sharon_maddox
no neighbor carol_harris
no neighbor christopher_anderson
no neighbor douglas_luna
no neighbor sierra_montoya
no neighbor james_smith
0 willie_marshall
0 willie_marshall
0 deborah_baker
no source jose_gray
0 jose_gray
0 jose_gray
0 pamela_gibbs
no source karen_harris
0 karen_harris
0 karen_harris
no neighbor michael_price
no neighbor matthew_barnes
no neighbor brian_elliott
no neighbor brian_hamilton
0 harry_mcguire
0 harry_mcguire
no source rachel_jones
0 rachel_jones
0 rachel_jones
no neighbor jennifer_washington
no neighbor mrs._mary_hammond
no neighbor stacy_brown
0 tanya_walters
0 charles_compton
no source thomas_barnes
0 thomas_barnes
0 thomas_barnes
no neighbor destiny_martinez
no neighbor jeffery_clark
no neighbor mrs._mary_hammond
0 adrian_pineda
no neighbor dr._michael_garza_jr.
0 jessica_zuniga
0 jessica_zuniga
no neighbor laura_waller
no neighbor jose_kennedy
no neighbor mark_christian
no neighbor jennifer

no source wesley_baker
0 wesley_baker
0 wesley_baker
no neighbor christopher_marquez
no neighbor eugene_nelson
0 teresa_cardenas
0 teresa_cardenas
no neighbor debra_mendoza
0 jennifer_summers
no neighbor thomas_harris
no neighbor anne_thompson
no neighbor margaret_santiago
0 lynn_hall
no neighbor chelsea_jimenez
0 melissa_keller
no source amber_hernandez
0 amber_hernandez
0 amber_hernandez
no source jacob_torres
0 jacob_torres
0 jacob_torres
no neighbor carol_harris
0 bill_woodard
0 bill_woodard
no neighbor jeffery_clark
0 andrew_terry
0 john_stewart
no source angelica_cohen
0 angelica_cohen
0 angelica_cohen
no source victoria_williams
0 victoria_williams
0 victoria_williams
no neighbor kurt_lopez
no neighbor stephanie_martin
0 kimberly_russell
no neighbor jeremiah_castro
0 michael_romero
0 michael_romero
no neighbor destiny_martinez
0 stephen_townsend
0 bryce_cain
no neighbor kimberly_thomas
0 mary_dalton
no neighbor alexandria_jones
no neighbor jasmin_castro
no neighbor robert_myers


In [6]:
y_predict = clf.predict(X_test, edge_weight)
print(classification_report(y_test.values.tolist(), y_predict.tolist()))
        
results = []
names = []
for i in range(len(X_test)):
    names.append(X_test[i][0])
    num_confessed_assignments = X_test[i][len(X_test[i])-1]
    predict_confess = y_predict[i]
    results.append([num_confessed_assignments, predict_confess])

df = pd.DataFrame(results, index=names,columns=['num_confessed_assignments', 'predict_confess'])
df.to_csv('result.csv')

AttributeError: 'list' object has no attribute 'values'

In [None]:
y_predict = clf.predict_proba(X_test, edge_weight)
# print(classification_report(y_test, y_predict))
correct = 0
for i in range(len(y_predict)):
    if y_predict[i][0] > y_predict[i][1] and y_test[i]==0:
        correct += 1
    elif y_predict[i][0] < y_predict[i][1] and y_test[i]==1:
        correct += 1
        
results = []        
for i in range(len(X_test)):
    num_confessed_assignments = X_test[i][len(X_test[i])-1]
    predict_prob_confess = y_predict[i][1]
    results.append([num_confessed_assignments, predict_prob_confess])
    
results = sorted(results, key=lambda x: x[1], reverse=True)
df = pd.DataFrame(results, index=names,columns=['num_confessed_assignments', 'predict_confess_prob'])
df.to_csv('result_prob.csv')