In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from iterative_classifier import IterativeClassifier
from sklearn.metrics import classification_report

### Extract numerical features from the csv file

In [2]:
# Load the per-node table and discard the two columns that are not used below:
# the 'Unnamed: 0' column and the 'confessed_assignments' column.
NODE_DATA_CSV = '../data/imputed_unified_node_data.csv'
UNUSED_COLUMNS = ['Unnamed: 0', 'confessed_assignments']

node_table = pd.read_csv(NODE_DATA_CSV)
df = node_table.drop(columns=UNUSED_COLUMNS)

# Plain Python list of the values in the 'name' column, in row order.
names = df['name'].values.tolist()

### Build the edge weight lookup (avg/max/min per source node)

In [3]:
def _read_edge_table(csv_path):
    """Read one edge-weight CSV and return its rows as a NumPy array.

    keep_default_na=False stops pandas from converting its default NA
    strings (e.g. "NA", "nan", empty cells) into NaN, so cell text is kept as
    read -- presumably to protect node names that look like NA markers.
    """
    return pd.read_csv(csv_path, keep_default_na=False).to_numpy()


avg_edge_weights = _read_edge_table('../data/avg_edge_weights.csv')
max_edge_weights = _read_edge_table('../data/max_edge_weights.csv')
min_edge_weights = _read_edge_table('../data/min_edge_weights.csv')

# edge_weight maps each source node to a list of its outgoing edges:
#   key   - source (column 0 of the avg table)
#   value - list of [destination, avg, max, min] entries
# NOTE(review): max/min weights are picked by row position, so this assumes the
# three CSV files list the same edges in the same order -- confirm upstream.
edge_weight = {}
for row_idx, avg_row in enumerate(avg_edge_weights):
    edge_record = [
        avg_row[1],                    # destination
        avg_row[2],                    # average weight
        max_edge_weights[row_idx][2],  # maximum weight
        min_edge_weights[row_idx][2],  # minimum weight
    ]
    edge_weight.setdefault(avg_row[0], []).append(edge_record)

### Split the dataset into training and test sets

In [8]:
# Binary target: 1 when 'num_confessed_assignments' is positive, otherwise 0.
confessed_counts = df['num_confessed_assignments'].values.tolist()
y = [1 if count > 0 else 0 for count in confessed_counts]

# Each sample is a full row of df, converted to a plain Python list.
# NOTE(review): these rows still contain 'num_confessed_assignments', the very
# column y is derived from -- confirm IterativeClassifier does not consume it
# as a feature, otherwise the labels leak into the inputs.
X = df.values.tolist()

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [5]:
# Neighbour count passed to the classifier's n_neighbors argument; named here
# so it is easy to find and tune.
N_NEIGHBORS = 9

# Fit on the training rows; the edge-weight lookup is passed alongside the
# samples and labels.
clf = IterativeClassifier(n_neighbors=N_NEIGHBORS)
clf.fit(X_train, edge_weight, y_train)

no neighbor erika_hill
no neighbor hannah_miller
no neighbor victoria_hubbard
0 nicholas_bates
no neighbor angelica_cohen
no neighbor jamie_bennett
0 marcus_moore
0 mackenzie_chapman
no neighbor brian_elliott
no neighbor kayla_nolan
no neighbor patricia_reeves
no neighbor sherry_cardenas
0 kimberly_smith
no neighbor kristi_stevens
no neighbor donald_williams
no neighbor kristina_wilcox
no neighbor rachel_branch
0 jackie_martin_dvm
no neighbor larry_villarreal
no neighbor stacy_brown
0 theresa_smith
no neighbor megan_lopez
no neighbor mr._seth_cook
0 makayla_mitchell
no source jennifer_nguyen
0 jennifer_nguyen
0 jennifer_nguyen
0 tyler_hart
no neighbor veronica_collier
0 michelle_stephens
no neighbor brian_elliott
0 keith_morales
no neighbor paula_thompson
0 tonya_ellison
0 tonya_ellison
0 kelsey_brewer
no neighbor pamela_howard
0 edward_hall
0 edward_hall
0 ivan_kramer
no neighbor megan_lopez
0 stephanie_johnson
no neighbor kristina_wilcox
0 brandy_sims
0 brandy_sims
no neighbor brian_

In [6]:
# Predict a class label for every held-out row and print the standard
# precision / recall / F1 summary.
y_predict = clf.predict(X_test, edge_weight)
print(classification_report(y_test, y_predict))

# Column 0 of each test row is used as the row label (presumably the 'name'
# column -- verify against the CSV column order).
names = [sample[0] for sample in X_test]

# Pair the last column of each row (treated as num_confessed_assignments) with
# the label predicted for that row.
results = [
    [sample[-1], y_predict[position]]
    for position, sample in enumerate(X_test)
]

# Persist one line per test row, indexed by the row label.
# Note: this rebinds df, replacing the node table loaded earlier.
df = pd.DataFrame(
    results,
    index=names,
    columns=['num_confessed_assignments', 'predict_confess'],
)
df.to_csv('result.csv')

no source alexa_pruitt
0 alexa_pruitt
0 alexa_pruitt
no neighbor robin_coleman
no neighbor jennifer_cox
no neighbor alexander_wells
no neighbor jennifer_lynch
no neighbor ryan_gross
no neighbor natalie_young
no neighbor john_rowe
no neighbor curtis_richmond
0 julie_gibson
0 julie_gibson
no neighbor jeremiah_castro
0 michael_romero
0 michael_romero
no neighbor gregory_foster
no neighbor michael_joseph
0 brandon_meyer
0 brandon_meyer
no neighbor anne_thompson
no neighbor edward_collier
no neighbor laura_salazar
no neighbor shannon_ford
no neighbor christopher_gilmore
no neighbor vanessa_camacho
no neighbor sonya_bowers
0 tim_hoffman
no neighbor carolyn_campbell
no neighbor beverly_gay
no neighbor james_webb
0 carolyn_barnes
0 carolyn_barnes
no neighbor darrell_roberson
0 gina_roberts
no neighbor george_wade
0 lisa_hays
0 lisa_hays
no source robert_steele
0 robert_steele
0 robert_steele
no neighbor jason_thomas
0 sherry_cardenas
0 sherry_cardenas
no neighbor cynthia_pena
no neighbor timot

In [7]:
# Per-row class probabilities: element [0] is compared against label 0 and
# element [1] against label 1 below.
y_predict = clf.predict_proba(X_test, edge_weight)

# Number of test rows whose more probable class matches the true label.
# Rows where both probabilities are equal are not counted as correct.
correct = sum(
    1
    for proba, label in zip(y_predict, y_test)
    if (proba[0] > proba[1] and label == 0)
    or (proba[0] < proba[1] and label == 1)
)

# Keep each row's label (column 0 of the test row) together with its values
# BEFORE sorting. Previously the rows were sorted by probability while the
# index list stayed in X_test order, so every line of result_prob.csv was
# labelled with the wrong name. Taking the label from X_test here also removes
# the dependency on `names` left behind by an earlier cell.
labelled_rows = [
    (sample[0], sample[-1], y_predict[position][1])
    for position, sample in enumerate(X_test)
]

# Highest probability of class 1 first; list.sort is stable, so ties keep
# their X_test order.
labelled_rows.sort(key=lambda row: row[2], reverse=True)

# Last column of each row (treated as num_confessed_assignments) paired with
# the class-1 probability, in sorted order.
results = [[num_confessed, prob] for _, num_confessed, prob in labelled_rows]
sorted_names = [name for name, _, _ in labelled_rows]

# Note: this rebinds df, replacing whatever frame the name held before.
df = pd.DataFrame(
    results,
    index=sorted_names,
    columns=['num_confessed_assignments', 'predict_confess_prob'],
)
df.to_csv('result_prob.csv')

no source alexa_pruitt
0 alexa_pruitt
0 alexa_pruitt
no neighbor robin_coleman
no neighbor jennifer_cox
no neighbor alexander_wells
no neighbor jennifer_lynch
no neighbor ryan_gross
no neighbor natalie_young
no neighbor john_rowe
no neighbor curtis_richmond
0 julie_gibson
0 julie_gibson
no neighbor jeremiah_castro
0 michael_romero
0 michael_romero
no neighbor gregory_foster
no neighbor michael_joseph
0 brandon_meyer
0 brandon_meyer
no neighbor anne_thompson
no neighbor edward_collier
no neighbor laura_salazar
no neighbor shannon_ford
no neighbor walter_johnson
no neighbor christopher_gilmore
no neighbor vanessa_camacho
no neighbor sonya_bowers
no neighbor david_washington
0 tim_hoffman
0 tim_hoffman
no neighbor carolyn_campbell
no neighbor beverly_gay
no neighbor james_webb
0 carolyn_barnes
0 carolyn_barnes
no neighbor darrell_roberson
no neighbor angelica_cohen
0 gina_roberts
0 gina_roberts
no neighbor george_wade
0 lisa_hays
0 lisa_hays
no source robert_steele
0 robert_steele
0 rober