In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from iterative_classifier import IterativeClassifier
from sklearn.metrics import classification_report

### Extract numerical features from the CSV file

In [2]:
# Columns kept from the unified node table; 'name' becomes the row index and
# every remaining column is used as a numeric feature.
col = [
    'name', 'admit_year', 'participation', 'pe', 'finals', 'total',
    'percentile', 'afast', 'pe_percent', 'finals_percent', 'midterms',
    'midterms_percent', 'level', 'level_min_max', 'exp', 'exp_min_max',
    'num_videos', 'avg_videos_completion',
    't01_exp', 't02_exp', 't03_exp', 't04_exp', 't05_exp',
    't06_exp', 't07_exp', 't08_exp', 't09_exp', 't10_exp',
    'num_confessed_assignments',
]

# keep_default_na=False keeps blank cells as '' so they can be mapped to 0 below
# instead of turning into NaN.
df = pd.read_csv('../data/unified_node_data.csv', keep_default_na=False)
df = df[col].set_index('name')

# Map the textual placeholders to numbers (applied in this order) so that the
# whole frame can be cast to float.
for placeholder, numeric in (('-', 0), ('', 0), ('True', 1), ('False', 0)):
    df = df.replace(placeholder, numeric)
df = df.astype(float)

# Row labels in frame order; used later to tag each feature row with its name.
names = df.index.to_numpy()

### Generate edge weight list (avg/max/min)

In [3]:
# Load the three edge tables as plain arrays. The loop below reads column 0 as
# the source, column 1 as the destination and column 2 as the weight.
avg_edge_weights = pd.read_csv(
    '../data/avg_edge_weights.csv', keep_default_na=False
).to_numpy()
max_edge_weights = pd.read_csv(
    '../data/max_edge_weights.csv', keep_default_na=False
).to_numpy()
min_edge_weights = pd.read_csv(
    '../data/min_edge_weights.csv', keep_default_na=False
).to_numpy()

# edge_weight maps: source -> list of [destination, avg, max, min]
# (one inner list per outgoing edge of that source).
# NOTE(review): the max/min weights are taken from the row with the same
# position as the avg row, so the three files are assumed to list the same
# edges in the same order - confirm.
edge_weight = {}
for i, avg_row in enumerate(avg_edge_weights):
    source = avg_row[0]
    outgoing = edge_weight.setdefault(source, [])
    outgoing.append([
        avg_row[1],
        avg_row[2],
        max_edge_weights[i][2],
        min_edge_weights[i][2],
    ])

### Split the dataset into training and test sets

In [4]:
# One plain Python list of floats per row of the feature frame.
X = df.values.tolist()

# Binary target: 1 when the last column (num_confessed_assignments) is
# positive, otherwise 0.
# NOTE(review): that column also stays inside every feature row handed to the
# classifier - confirm this does not leak the target.
y = [1 if row[-1] > 0 else 0 for row in X]

# Prepend the row's name so each sample looks like [name, feature..., label column].
for i, row in enumerate(X):
    row.insert(0, names[i])

# Fixed seed keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Build the project-local IterativeClassifier with n_neighbors=9 and fit it on
# the training rows. Each row is [name, feature..., num_confessed_assignments];
# the edge-weight dict (source -> list of [destination, avg, max, min]) is
# passed between the rows and the labels.
# NOTE(review): the captured output of this cell shows fit() printing
# diagnostic lines such as "no neighbor <name>" / "no source <name>"; their
# exact meaning is defined inside iterative_classifier - confirm there.
clf = IterativeClassifier(n_neighbors=9)
clf.fit(X_train, edge_weight, y_train)

no neighbor bryan_george
0 amanda_smith
no source shannon_carpenter
0 shannon_carpenter
0 shannon_carpenter
0 jacqueline_middleton
no source brandon_cruz
0 brandon_cruz
0 brandon_cruz
no neighbor jonathan_martin
0 susan_short
0 colleen_davis
0 tiffany_bailey
no source david_harris
0 david_harris
0 david_harris
0 michael_fitzgerald
no neighbor corey_burns
no neighbor dr._samuel_sharp_dds
no neighbor jordan_alexander
no neighbor erin_ortiz
no neighbor kristina_wilcox
no neighbor deanna_roberts
no neighbor steven_adams
0 stacy_brown
0 michael_stewart
no source jennifer_stewart
0 jennifer_stewart
0 jennifer_stewart
no neighbor samuel_hayes
0 adrian_king
no neighbor matthew_lang
0 adrian_barnes_dds
no neighbor jennifer_torres
0 laura_perry
0 frank_williams
no neighbor natalie_young
no neighbor cynthia_pena
0 keith_garcia
no neighbor brian_jenkins
no source mason_herrera
0 mason_herrera
0 mason_herrera
0 ashley_johnson
0 shelley_wells
no source lindsay_rodriguez
0 lindsay_rodriguez
0 lindsay

no source rachel_jones
0 rachel_jones
0 rachel_jones
no source william_griffin
0 william_griffin
0 william_griffin
0 ivan_kramer
no source jimmy_moore
0 jimmy_moore
0 jimmy_moore
0 wesley_molina
no neighbor jack_lopez
no neighbor jose_kennedy
no neighbor john_stewart
no neighbor grace_roberts
no neighbor david_washington
no neighbor dr._michael_garza_jr.
0 jessica_zuniga
0 jessica_zuniga
no neighbor david_washington
no neighbor frederick_garza
0 russell_sutton
no source tracy_morrison
0 tracy_morrison
0 tracy_morrison
no neighbor hayley_moore
0 diana_lee
0 diana_lee
0 joseph_garcia
no neighbor samuel_hayes
0 crystal_smith
0 jennifer_wilkerson
0 peter_hester
no neighbor david_washington
no neighbor carlos_terry
no neighbor jesse_perry
no neighbor frank_burns
0 john_rowe
0 mrs._amy_white_phd
no source joe_henderson
0 joe_henderson
0 joe_henderson
0 larry_franklin
no neighbor jennifer_torres
0 sean_romero
no source timothy_francis
0 timothy_francis
0 timothy_francis
0 jeffrey_ruiz
no sour

In [6]:
# Hard 0/1 predictions for the held-out rows, followed by per-class metrics.
y_predict = clf.predict(X_test, edge_weight)
print(classification_report(y_test, y_predict))

# Every test row is [name, feature..., num_confessed_assignments]: the first
# element labels the output row, the last one is the observed count.
names = [row[0] for row in X_test]
results = [[row[-1], y_predict[i]] for i, row in enumerate(X_test)]

# Observed count next to the predicted label, one line per test row.
df = pd.DataFrame(
    results,
    index=names,
    columns=['num_confessed_assignments', 'predict_confess'],
)
df.to_csv('result.csv')

no source cody_morris
0 cody_morris
0 cody_morris
no neighbor meagan_boyle
no neighbor shelby_woods
0 dana_clark
no neighbor jeffery_knight
no neighbor emily_thompson
no neighbor jose_hicks
no neighbor gary_snyder
no neighbor stacy_brown
no neighbor cody_johnson
no neighbor richard_morales
0 mitchell_bennett
no neighbor gina_roberts
0 meredith_rios
no neighbor timothy_tucker
no neighbor christian_jackson
no neighbor jill_ferguson
0 michael_alexander
0 michael_alexander
no neighbor cesar_elliott
0 corey_burns
0 corey_burns
no neighbor kenneth_allen
no neighbor kristi_stevens
no neighbor brandi_duncan
no neighbor donald_williams
no neighbor mrs._mary_hammond
no neighbor james_long
no neighbor deanna_perry
0 heather_clark
no neighbor robert_jones
no neighbor matthew_graves
no neighbor debra_mendoza
no neighbor jennifer_summers
0 vincent_adams
no neighbor kenneth_allen
no neighbor lindsay_moore
no neighbor carol_harris
0 barbara_nunez
no source lori_rodriguez
0 lori_rodriguez
0 lori_rodrig

0 deborah_williams
no source alexis_phillips
0 alexis_phillips
0 alexis_phillips
no source kimberly_thomas
0 kimberly_thomas
0 kimberly_thomas
no source gary_evans
0 gary_evans
0 gary_evans
no neighbor sonya_bowers
no neighbor laura_salazar
no neighbor ashley_arnold_md
no neighbor shannon_ford
no neighbor paula_guerrero
no neighbor edward_collier
no neighbor david_decker
no neighbor kenneth_gross
no neighbor traci_rowland
no neighbor tim_hoffman
no neighbor luis_armstrong
no neighbor walter_johnson
no neighbor matthew_martinez
no neighbor christopher_gilmore
no neighbor anne_thompson
0 jesse_perry
no neighbor justin_young
0 brian_cortez
0 brian_cortez
no source karen_houston
0 karen_houston
0 karen_houston
no neighbor jessica_zuniga
0 justin_zimmerman
no source jennifer_nguyen
0 jennifer_nguyen
0 jennifer_nguyen
no neighbor angela_marshall
no neighbor sarah_stark
no neighbor jennifer_miller
0 reginald_garrison
0 reginald_garrison
no neighbor tara_bonilla
no neighbor julie_gibson
no nei

In [12]:
# Per-class scores for the held-out rows. The code below reads column 0 as the
# score for label 0 and column 1 as the score for label 1.
y_predict = clf.predict_proba(X_test, edge_weight)

# Number of test rows whose higher-scoring class equals the true label.
# Rows where both scores are exactly equal are not counted as correct.
correct = 0
for i in range(len(y_predict)):
    prob_no, prob_yes = y_predict[i][0], y_predict[i][1]
    if (prob_no > prob_yes and y_test[i] == 0) or (prob_no < prob_yes and y_test[i] == 1):
        correct += 1

# Keep each row's name attached to its record BEFORE sorting. Previously the
# records were sorted by probability while the index still used the unsorted
# name list, so every line of result_prob.csv carried the wrong name.
ranked = sorted(
    (
        (row[0], row[-1], y_predict[i][1])  # (name, observed count, P(label 1))
        for i, row in enumerate(X_test)
    ),
    key=lambda record: record[2],
    reverse=True,
)
ranked_names = [record[0] for record in ranked]
results = [[record[1], record[2]] for record in ranked]

# Highest predicted probability first, indexed by the matching name.
df = pd.DataFrame(
    results,
    index=ranked_names,
    columns=['num_confessed_assignments', 'predict_confess_prob'],
)
df.to_csv('result_prob.csv')

no source cody_morris
0 cody_morris
0 cody_morris
no neighbor meagan_boyle
no neighbor joseph_martin_dvm
no neighbor shelby_woods
0 dana_clark
0 dana_clark
no neighbor jeffery_knight
no neighbor emily_thompson
no neighbor jose_hicks
no neighbor erin_ortiz
no neighbor steven_adams
no neighbor dr._samuel_sharp_dds
no neighbor gary_snyder
no neighbor stacy_brown
no neighbor cody_johnson
no neighbor richard_morales
0 mitchell_bennett
0 mitchell_bennett
no neighbor angelica_cohen
no neighbor christopher_lewis
no neighbor gina_roberts
0 meredith_rios
0 meredith_rios
no neighbor timothy_tucker
no neighbor christian_jackson
no neighbor jill_ferguson
0 michael_alexander
0 michael_alexander
no neighbor cesar_elliott
0 corey_burns
0 corey_burns
no neighbor kenneth_allen
no neighbor kristi_stevens
no neighbor brandi_duncan
no neighbor donald_williams
no neighbor mrs._mary_hammond
no neighbor james_long
no neighbor kristina_wilcox
no neighbor deanna_perry
0 heather_clark
0 heather_clark
no neighbor