In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from iterative_classifier import IterativeClassifier

### Extract numerical features from the CSV file

In [2]:
# Columns pulled from the node table. 'name' becomes the index; every other
# column is cast to float further down.
col = [
    'name', 'admit_year', 'participation', 'pe', 'finals', 'total', 'percentile',
    'afast', 'pe_percent', 'finals_percent', 'midterms', 'midterms_percent', 'level',
    'level_min_max', 'exp', 'exp_min_max', 'num_videos', 'avg_videos_completion',
    't01_exp', 't02_exp', 't03_exp', 't04_exp', 't05_exp', 't06_exp', 't07_exp',
    't08_exp', 't09_exp', 't10_exp', 'num_confessed_assignments',
]

# keep_default_na=False stops pandas from turning blank / "NA"-like cells into
# NaN, so the placeholders can be mapped to numbers explicitly below.
df = pd.read_csv('../data/unified_node_data.csv', keep_default_na=False)
df = df[col].set_index('name')

# Substitute placeholder and boolean-like strings one at a time, in this order:
# '-' and '' become 0, 'True' becomes 1, 'False' becomes 0.
for placeholder, number in (('-', 0), ('', 0), ('True', 1), ('False', 0)):
    df = df.replace(placeholder, number)
df = df.astype(float)

# Row labels (the 'name' index) as an array, in the same order as df's rows.
names = df.index.to_numpy()

### Build the edge-weight lookup (avg/max/min per edge)

In [3]:
# Load the three edge-weight tables (average, maximum, minimum) as plain arrays.
avg_edge_weights, max_edge_weights, min_edge_weights = (
    pd.read_csv(csv_path, keep_default_na=False).to_numpy()
    for csv_path in (
        '../data/avg_edge_weights.csv',
        '../data/max_edge_weights.csv',
        '../data/min_edge_weights.csv',
    )
)

# edge_weight layout:
#   key   - source
#   value - list of [destination, avg, max, min], one entry per outgoing edge
#
# NOTE(review): the max/min weights are looked up by the row position taken
# from the avg table, so this assumes all three files list the same edges in
# the same order - confirm against however the CSVs are generated.
edge_weight = {}
for row_idx, avg_row in enumerate(avg_edge_weights):
    outgoing = edge_weight.setdefault(avg_row[0], [])
    outgoing.append([
        avg_row[1],                    # destination
        avg_row[2],                    # average weight
        max_edge_weights[row_idx][2],  # maximum weight
        min_edge_weights[row_idx][2],  # minimum weight
    ])

### Split the dataset into training and test sets

In [4]:
# Build the samples and labels.
#   X row: [name, feature_1, ..., feature_n]  (name is prepended to the df row)
#   y    : 1 when the last column of df is positive for that row, else 0
#
# NOTE(review): the column that defines y is still present in every X row.
# Confirm that IterativeClassifier does not use it as a predictive feature,
# otherwise the label leaks into the inputs.
feature_rows = df.values.tolist()
y = [1 if row[-1] > 0 else 0 for row in feature_rows]
X = [[names[k]] + row for k, row in enumerate(feature_rows)]

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Train the classifier on the training split; it receives the edge-weight
# lookup in addition to the samples and labels.
N_NEIGHBORS = 9
clf = IterativeClassifier(n_neighbors=N_NEIGHBORS)
clf.fit(X_train, edge_weight, y_train)

no neighbor amy_miller
0 richard_nelson
no neighbor allen_bailey
0 amanda_thompson
0 amanda_thompson
no source dustin_guerra
no neighbor allen_bailey
0 dustin_guerra
0 dustin_guerra
no source pamela_colon
no neighbor allen_bailey
0 pamela_colon
0 pamela_colon
0 raymond_guerra
0 monica_carr
no neighbor justin_edwards
0 lisa_johnson
0 timothy_tucker
no source mrs._laura_johnston_dds
0 mrs._laura_johnston_dds
0 lindsay_arnold
no source jeremiah_oconnell
0 jeremiah_oconnell
0 jonathan_daniels
0 jacqueline_middleton
0 elizabeth_smith
no neighbor ryan_hartman
0 jessica_simon
0 kim_jones
no neighbor martin_payne
no neighbor christine_young
no neighbor dr._samuel_sharp_dds
no neighbor justin_edwards
no neighbor richard_morales
0 stacy_brown
no neighbor adrian_pineda
no source lindsay_rodriguez
no neighbor adrian_pineda
0 april_soto
no neighbor charles_mccoy
0 connie_anderson
no neighbor patricia_brown
no source anthony_everett
no neighbor patricia_brown
0 pamela_gibbs
no neighbor patricia_brow

no source teresa_bird
0 teresa_bird
no neighbor samantha_duncan
0 jeffrey_ruiz
0 kimberly_lam
0 jordan_alexander
0 mr._jacob_flynn_jr.
no neighbor donald_williams
no neighbor frank_martin
no neighbor kristi_stevens
0 jody_martinez
0 samantha_wang
no source brandon_burgess
0 brandon_burgess
no neighbor daniel_graham
no neighbor michelle_smith
0 sean_park_dds
0 christopher_medina
no neighbor holly_crane
no neighbor jason_kelly
no neighbor kara_medina
no neighbor kara_medina
no source monica_brown
no neighbor kara_medina
no neighbor mark_young
0 melissa_barnes
no source sean_foster
no neighbor mark_young
0 sean_foster
no neighbor kenneth_gross
0 jason_haynes
no source craig_brooks
no neighbor kenneth_gross
0 craig_brooks
no source ryan_gross
no neighbor kenneth_gross
0 ryan_gross
no neighbor eric_stone
no neighbor luis_armstrong
0 phillip_hicks
0 phillip_hicks
0 mark_bell
no neighbor jason_thomas
0 sherry_cardenas
0 sherry_cardenas
no neighbor trevor_ford
0 pamela_kelley
0 michael_jordan


In [10]:
# Predict labels for the held-out samples, then count how many predictions
# agree with the true labels (compared position by position).
y_predict = clf.predict(X_test, edge_weight)
correct = sum(1 for k in range(len(y_test)) if y_test[k] == y_predict[k])

no neighbor sierra_montoya
0 kerry_johnson
0 kerry_johnson
no source julia_hernandez
no neighbor sierra_montoya
0 julia_hernandez
0 julia_hernandez
no source jimmy_bennett
no neighbor sierra_montoya
0 jimmy_bennett
0 jimmy_bennett
no source robert_rogers
no neighbor sierra_montoya
0 robert_rogers
0 robert_rogers
0 ryan_moore
no neighbor curtis_richmond
no neighbor erin_ortiz
no neighbor gary_snyder
no neighbor jared_johnson
no neighbor jennifer_wilkerson
no neighbor justin_riley
no neighbor keith_morales
no neighbor lauren_ramirez
no neighbor sabrina_clark
0 dr._samuel_sharp_dds
no neighbor andrew_sanders
0 kevin_wade
0 kevin_wade
no neighbor joshua_smith
0 amber_wright
no neighbor jody_martinez
0 ashley_kemp
0 ashley_kemp
no source holly_crane
no neighbor jody_martinez
0 holly_crane
0 holly_crane
no neighbor brandi_duncan
no neighbor jessica_bryant
no neighbor richard_mendoza
0 ryan_hartman
no neighbor debra_aguilar
no neighbor frank_burns
no neighbor hannah_anderson
no neighbor jerem

In [11]:
# Accuracy: fraction of test samples whose prediction matched the true label.
accuracy = correct / len(y_test)
print(accuracy)

0.8690476190476191


In [8]:
# Dump the raw predicted labels held in y_predict.
# NOTE(review): this cell's execution count (8) is lower than that of the cell
# which assigns y_predict (10), so the saved output may come from an earlier
# run - re-run the notebook top to bottom to confirm it.
print(y_predict)

[1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0]
