In [1]:
import pandas as pd
import numpy as np
import preprocess
from sklearn.model_selection import train_test_split
from iterative_classifier import IterativeClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

### Extract numerical features from the CSV file

In [2]:
# Load the imputed per-node table and discard the 'confessed_assignments'
# column before anything else touches the frame.
df = (
    pd.read_csv('../data/imputed_unified_node_data.csv')
    .drop(columns=['confessed_assignments'])
)
# Plain Python list of the values in the 'name' column, in file order.
names = df['name'].tolist()

### Generate edge weight list (avg/max/min)

In [3]:
# Load the three edge-weight tables. keep_default_na=False stops pandas from
# turning strings such as "NA" or empty cells into NaN.
avg_edge_weights = pd.read_csv('../data/avg_edge_weights.csv', keep_default_na=False).to_numpy()
max_edge_weights = pd.read_csv('../data/max_edge_weights.csv', keep_default_na=False).to_numpy()
min_edge_weights = pd.read_csv('../data/min_edge_weights.csv', keep_default_na=False).to_numpy()

# The tables are combined row-by-row below, so they must list the same
# (source, destination) pairs in the same order. Without this check a
# reordered or truncated file would silently attach the wrong max/min weight
# to an edge (or fail with an opaque IndexError).
if not (len(avg_edge_weights) == len(max_edge_weights) == len(min_edge_weights)):
    raise ValueError(
        'edge weight files differ in length: avg=%d, max=%d, min=%d'
        % (len(avg_edge_weights), len(max_edge_weights), len(min_edge_weights))
    )
for other_label, other_weights in (('max', max_edge_weights), ('min', min_edge_weights)):
    if not np.array_equal(avg_edge_weights[:, :2], other_weights[:, :2]):
        raise ValueError(
            '%s_edge_weights.csv does not list the same (source, destination) '
            'rows in the same order as avg_edge_weights.csv' % other_label
        )

# edge_weight maps each source (column 0) to a list with one entry per
# outgoing row: [destination, avg, max, min], where destination and avg come
# from columns 1 and 2 of the avg table and max/min from column 2 of the
# other two tables.
edge_weight = {}
for avg_row, max_row, min_row in zip(avg_edge_weights, max_edge_weights, min_edge_weights):
    edge_weight.setdefault(avg_row[0], []).append(
        [avg_row[1], avg_row[2], max_row[2], min_row[2]]
    )

### Split the dataset into training and test set

In [4]:
# Split via the project-local helper; it returns six pieces in this order.
X_train, X_val, X_test, y_train, y_val, y_test = preprocess.stratified_train_val_test_split(df)

# The validation and test portions are merged into a single evaluation set,
# and everything is converted to plain Python lists.
X_test = pd.concat([X_val, X_test]).values.tolist()
y_test = pd.concat([y_val, y_test]).values.tolist()
X_train = X_train.values.tolist()
y_train = y_train.values.tolist()

# Binarise the labels: any value greater than 0 becomes 1, everything else is
# kept exactly as it was.
y_train = [1 if label > 0 else label for label in y_train]
y_test = [1 if label > 0 else label for label in y_test]

In [5]:
# Create the project-local IterativeClassifier with n_neighbors=9 and fit it on
# the training rows, the source -> edges dictionary and the binarised labels.
# NOTE(review): the classifier's internals are not visible from this notebook;
# the long "no source" / "no neighbor" / "0 <name>" output below appears to be
# printed from inside fit() -- confirm in iterative_classifier.py.
clf = IterativeClassifier(n_neighbors=9)
clf.fit(X_train, edge_weight, y_train)

no source jessica_mack
0 jessica_mack
0 jessica_mack
0 sharon_maddox
no neighbor carol_harris
no neighbor christopher_anderson
no neighbor douglas_luna
no neighbor sierra_montoya
no neighbor james_smith
0 willie_marshall
0 willie_marshall
0 deborah_baker
no source jose_gray
0 jose_gray
0 jose_gray
0 pamela_gibbs
no source karen_harris
0 karen_harris
0 karen_harris
no neighbor michael_price
no neighbor matthew_barnes
no neighbor brian_elliott
no neighbor brian_hamilton
0 harry_mcguire
0 harry_mcguire
no source rachel_jones
0 rachel_jones
0 rachel_jones
no neighbor jennifer_washington
no neighbor mrs._mary_hammond
no neighbor stacy_brown
0 tanya_walters
0 charles_compton
no source thomas_barnes
0 thomas_barnes
0 thomas_barnes
no neighbor destiny_martinez
no neighbor jeffery_clark
no neighbor mrs._mary_hammond
0 adrian_pineda
no neighbor dr._michael_garza_jr.
0 jessica_zuniga
0 jessica_zuniga
no neighbor laura_waller
no neighbor jose_kennedy
no neighbor mark_christian
no neighbor jennifer

no source wesley_baker
0 wesley_baker
0 wesley_baker
no neighbor christopher_marquez
no neighbor eugene_nelson
0 teresa_cardenas
0 teresa_cardenas
no neighbor debra_mendoza
0 jennifer_summers
no neighbor thomas_harris
no neighbor anne_thompson
no neighbor margaret_santiago
0 lynn_hall
no neighbor chelsea_jimenez
0 melissa_keller
no source amber_hernandez
0 amber_hernandez
0 amber_hernandez
no source jacob_torres
0 jacob_torres
0 jacob_torres
no neighbor carol_harris
0 bill_woodard
0 bill_woodard
no neighbor jeffery_clark
0 andrew_terry
0 john_stewart
no source angelica_cohen
0 angelica_cohen
0 angelica_cohen
no source victoria_williams
0 victoria_williams
0 victoria_williams
no neighbor kurt_lopez
no neighbor stephanie_martin
0 kimberly_russell
no neighbor jeremiah_castro
0 michael_romero
0 michael_romero
no neighbor destiny_martinez
0 stephen_townsend
0 bryce_cain
no neighbor kimberly_thomas
0 mary_dalton
no neighbor alexandria_jones
no neighbor jasmin_castro
no neighbor robert_myers


In [6]:
# Predict a label for every evaluation row and print the per-class metrics.
y_predict = clf.predict(X_test, edge_weight)
print(classification_report(y_test, y_predict))

# First element of each row is used as the row label in the exported table.
# NOTE: this rebinds `names` (previously every name in the node table) and,
# further down, `df` (previously the node table itself).
names = [row[0] for row in X_test]

# Pair the last element of each feature row with the predicted label.
results = [[row[-1], y_predict[idx]] for idx, row in enumerate(X_test)]

df = pd.DataFrame(
    results,
    index=names,
    columns=['num_confessed_assignments', 'predict_confess'],
)
df.to_csv('result.csv')

no neighbor phillip_hicks
0 jennifer_singh
0 jennifer_singh
no neighbor kevin_kelly
no neighbor mason_deleon
0 michael_price
no neighbor bradley_rush
no neighbor franklin_carlson
no neighbor michael_marsh
no neighbor elizabeth_vaughn
0 brandy_garner
0 brandy_garner
no source susan_barrett
0 susan_barrett
0 susan_barrett
no neighbor nicole_finley
0 james_carter
0 megan_coleman
no neighbor mary_parker
no neighbor tim_hoffman
no neighbor edward_collier
no neighbor shannon_ford
no neighbor leslie_moreno
no neighbor vanessa_camacho
no neighbor traci_rowland
no neighbor luis_armstrong
no neighbor christopher_gilmore
no neighbor paula_guerrero
no neighbor tiffany_perry
no neighbor jesse_perry
no neighbor sonya_bowers
no neighbor laura_salazar
0 carlos_terry
no neighbor dale_fox
0 michael_fitzgerald
no neighbor barbara_nunez
no neighbor jasmine_smith
0 amanda_spears
0 amanda_spears
no neighbor robert_barton
0 cassandra_hernandez
no neighbor deanna_perry
0 destiny_martinez
0 destiny_martinez
no

0 jessica_coleman
no neighbor richard_mendoza
no neighbor kevin_wade
0 jessica_bryant
0 jessica_bryant
no neighbor darrell_roberson
no neighbor angelica_cohen
0 gina_roberts
0 gina_roberts
no neighbor donald_williams
no neighbor kristina_wilcox
no neighbor mrs._mary_johnson
no neighbor darrell_roberson
0 mrs._mary_hammond
no neighbor jacqueline_middleton
no neighbor samantha_wang
0 nancy_sanchez
no neighbor stephanie_mosley
no neighbor andrea_robinson
no neighbor kenneth_williams
no neighbor eugene_ibarra
no neighbor ryan_gross
no neighbor rebecca_russo
no neighbor tina_spears
0 justin_burton
no neighbor ivan_kramer
0 jon_taylor
0 jon_taylor
no neighbor jennifer_obrien
no neighbor john_cuevas_md
no neighbor jessica_smith
0 gregory_foster
0 gregory_foster
no neighbor jordan_alexander
0 samuel_cooley
0 samuel_cooley
no neighbor ashley_rogers
no neighbor deanna_perry
no neighbor lori_bennett
no neighbor kristina_wilcox
no neighbor leslie_moreno
no neighbor carol_garrett
no neighbor britta

no neighbor amber_hernandez
no neighbor thomas_martinez
0 lee_bass
0 lee_bass
no source tony_cook
0 tony_cook
0 tony_cook
no neighbor kevin_kelly
no neighbor amanda_white
no neighbor ruben_mccoy_phd
0 miguel_reynolds
no neighbor jennifer_miller
no neighbor pamela_gibbs
no neighbor danielle_marsh
0 norman_gonzalez
no source edwin_williams
0 edwin_williams
0 edwin_williams
no source alexis_phillips
0 alexis_phillips
0 alexis_phillips
no neighbor timothy_tucker
0 michael_alexander
no source angela_marshall
0 angela_marshall
0 angela_marshall
no neighbor rebecca_russo
0 diana_alexander
0 diana_alexander
no neighbor jessica_wade
0 sarah_stewart
0 sarah_stewart
no neighbor vanessa_camacho
no neighbor edward_collier
no neighbor laura_salazar
0 walter_johnson
no neighbor robin_coleman
no neighbor hailey_solomon
0 april_turner
no source gary_evans
0 gary_evans
0 gary_evans
no neighbor karen_harris
0 james_romero
0 james_romero
no neighbor tina_spears
no neighbor martha_ramirez
0 karen_walker
no

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       264
           1       0.00      0.00      0.00        44

    accuracy                           0.85       308
   macro avg       0.43      0.50      0.46       308
weighted avg       0.73      0.85      0.79       308



In [7]:
# Per-class precision / recall / F1 of the predicted labels against y_test.
report_text = classification_report(y_test, y_predict)
print(report_text)

              precision    recall  f1-score   support

           0       0.86      1.00      0.92       264
           1       0.00      0.00      0.00        44

    accuracy                           0.85       308
   macro avg       0.43      0.50      0.46       308
weighted avg       0.73      0.85      0.79       308



In [8]:
# Confusion matrix of y_test (rows) against the predicted labels (columns).
confusion = confusion_matrix(y_test, y_predict)
print(confusion)

[[263   1]
 [ 44   0]]


In [9]:
# Per-class probabilities for every evaluation row; column 0 is compared with
# label 0 and column 1 with label 1 below.
# NOTE: this rebinds `y_predict` from hard labels to probability pairs, so the
# classification_report / confusion_matrix cells must not be re-run after it.
y_predict = clf.predict_proba(X_test, edge_weight)

# Count rows whose more probable class matches the true label. Exact ties
# (both probabilities equal) are never counted as correct.
correct = 0
for i in range(len(y_predict)):
    if y_predict[i][0] > y_predict[i][1] and y_test[i] == 0:
        correct += 1
    elif y_predict[i][0] < y_predict[i][1] and y_test[i] == 1:
        correct += 1

# Keep each row's name (first element of the feature row) together with its
# values while sorting. Previously only the values were sorted and then
# labelled with a name list still in the original X_test order, so every row
# of the exported file carried the wrong name.
ranked_rows = sorted(
    ([row[0], row[-1], proba[1]] for row, proba in zip(X_test, y_predict)),
    key=lambda entry: entry[2],
    reverse=True,
)
ranked_names = [entry[0] for entry in ranked_rows]
# `results` keeps its original shape: [num_confessed_assignments, probability],
# ordered from highest to lowest probability of class 1.
results = [entry[1:] for entry in ranked_rows]

df = pd.DataFrame(
    results,
    index=ranked_names,
    columns=['num_confessed_assignments', 'predict_confess_prob'],
)
df.to_csv('result_prob.csv')

no neighbor phillip_hicks
0 jennifer_singh
0 jennifer_singh
no neighbor brian_elliott
no neighbor kevin_kelly
no neighbor alexandra_chavez
no neighbor mason_deleon
0 michael_price
0 michael_price
no neighbor bradley_rush
no neighbor franklin_carlson
no neighbor michael_marsh
no neighbor elizabeth_vaughn
0 brandy_garner
0 brandy_garner
no source susan_barrett
0 susan_barrett
0 susan_barrett
no neighbor shari_butler
no neighbor nicole_finley
0 james_carter
0 james_carter
no neighbor stacy_dixon
no neighbor trevor_ford
no neighbor jerry_garcia
0 megan_coleman
0 megan_coleman
no neighbor anne_thompson
no neighbor mary_parker
no neighbor tim_hoffman
no neighbor edward_collier
no neighbor david_washington
no neighbor shannon_ford
no neighbor leslie_moreno
no neighbor vanessa_camacho
no neighbor traci_rowland
no neighbor luis_armstrong
no neighbor christopher_gilmore
no neighbor paula_guerrero
no neighbor walter_johnson
no neighbor tiffany_perry
no neighbor jesse_perry
no neighbor sonya_bower

0 margaret_bailey
no neighbor christopher_marquez
no neighbor margaret_santiago
no neighbor eugene_ibarra
no neighbor rebecca_russo
no neighbor justin_burton
no neighbor michael_marsh
no neighbor michael_stewart
0 thomas_harris
0 thomas_harris
no neighbor debbie_fernandez
no neighbor elizabeth_smith
no neighbor william_moreno
no neighbor keith_morales
no neighbor christopher_medina
no neighbor robin_coleman
no neighbor edward_gonzales
no neighbor michael_solomon
0 connie_fisher
0 connie_fisher
no neighbor matthew_rojas
no neighbor michael_stewart
no neighbor janet_foster
no neighbor colleen_mcmillan
0 christopher_marquez
0 christopher_marquez
no neighbor andrew_black
no neighbor harry_mcguire
no neighbor erica_thomas
no neighbor jennifer_cox
no neighbor brandi_meyer
0 benjamin_jones
0 benjamin_jones
no neighbor felicia_walker
no neighbor gary_barrett
no neighbor jimmy_moore
no neighbor julie_mendoza
no neighbor elizabeth_vaughn
no neighbor jake_garcia
no neighbor jessica_coleman
no nei