In [1]:
import pandas as pd
import numpy as np
import os

DATA_ROOT = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), 'data')
NODES = 'nodes_all_raw.csv'
EDGES = os.path.join(DATA_ROOT, 'uniq_lines_edge_weights.csv')

process all nodes

In [2]:
# from src import preprocess
# nodes = preprocess.nodes1(pd.read_csv(NODES, keep_default_na=False))
# print(nodes.isnull().sum())
# nodes.to_csv('nodes_all_raw.csv', index=False)

In [3]:
edges = pd.read_csv(EDGES)

In [4]:
unique_nodes = pd.concat([edges['NodeID1'], edges['NodeID2']], names=['node_id']).drop_duplicates()
unique_nodes = pd.DataFrame(unique_nodes, columns=['name'])
print(unique_nodes.shape)
nodes = pd.read_csv(NODES)
raw_node_data = nodes.merge(unique_nodes, 'inner', 'name')
# unique_nodes_data

(1024, 1)


In [5]:
raw_node_data['node_name'] = raw_node_data['name']

In [6]:
# unique_nodes_data.isnull().sum()

In [7]:
# edges[(edges['NodeID1']=='brian_olsen') | (edges['NodeID2']=='brian_olsen')]

Final features we will be using:
1. What is your major, if outside of Science, use Faculty column. (1-hot encoding)
2. Year of Study (based on admit year, do smart extrapolation)
3. Participation
4. PE_percent (special value -1 for missing values)
5. Finals_percent  (special value -1 for missing values)
6. midterms_percent  (special value -1 for missing values)
7. AFAST (binary)
8. Level_Min_Max
9. EXP_Min_Max
10. Tutorial EXP 9 columns
11. Num videos (Jon says take it with pinch of salt, varies based on whether sem was remote)
12. Avg_videos_completed (Jon says take it with pinch of salt, varies based on whether sem was remote)
13. confessed assignments
14. num_confessed_assignments
15. Batch (one-hot encode)

In [8]:
features = ['year_of_study', 'participation', 
            'pe_percent', 'finals_percent', 'midterms_percent', 
            'afast', 'level_min_max', 'exp_min_max', 
            't01_exp', 't02_exp', 't03_exp', 't04_exp', 't05_exp', 
            't06_exp', 't07_exp', 't08_exp', 't09_exp', 
            'num_videos', 'avg_videos_completion',
            'batch_1821', 'batch_1935', 'batch_2023']

In [9]:
node_data = raw_node_data.fillna(raw_node_data.mean()).sample(frac=1).reset_index(drop=True).set_index('name')
node_data['y'] = node_data['num_confessed_assignments'].apply(lambda x: 1 if x > 0 else 0)

In [29]:
node_data

Unnamed: 0_level_0,year_of_study,participation,pe_percent,finals_percent,midterms_percent,afast,level_min_max,exp_min_max,t01_exp,t02_exp,...,major_NUS Business School,major_Pharmacy,major_Physics,major_Quantitative Finance,major_School of Computing,major_School of Design & Environment,major_Statistics,major_Yong Loo Lin School (Medicine),node_name,y
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gina_roberts,1,3.500000,0.950000,0.880000,0.820000,0.000000,1.050000,1.057083,225.0,225.0,...,0,0,0,0,0,0,0,0,gina_roberts,0
jordan_green,1,3.179201,0.581597,0.489771,0.500000,0.230317,0.916667,0.847881,275.0,250.0,...,0,0,1,0,0,0,0,0,jordan_green,0
jake_garcia,1,3.000000,0.450000,0.466667,0.280000,0.000000,0.800000,0.714487,250.0,225.0,...,0,0,0,0,0,0,0,0,jake_garcia,0
jose_kennedy,2,3.179201,0.581597,0.489771,0.350000,0.230317,0.777778,0.646764,275.0,225.0,...,0,0,0,0,0,0,0,0,jose_kennedy,1
gina_holloway,1,3.000000,0.550000,0.553333,0.360000,0.000000,0.550000,0.461128,200.0,250.0,...,0,0,0,0,0,0,0,0,gina_holloway,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
april_soto,1,4.000000,0.533333,0.310000,0.499269,1.000000,0.960000,0.941851,240.0,240.0,...,0,0,0,0,0,0,0,0,april_soto,1
nicole_ross,2,3.500000,0.300000,0.230000,0.499269,1.000000,0.840000,0.777223,260.0,220.0,...,0,0,0,0,0,0,0,0,nicole_ross,1
eric_odom,1,2.500000,0.000000,0.640000,0.240000,0.000000,0.700000,0.564097,250.0,0.0,...,0,0,0,0,0,0,0,0,eric_odom,0
shawn_melendez,1,3.000000,0.400000,0.210000,0.499269,0.000000,0.720000,0.604286,240.0,240.0,...,0,0,0,0,0,0,0,0,shawn_melendez,1


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [39]:
node_data_true = node_data[node_data['y'] == 1]
print(node_data_true.shape[0])
node_data_false = node_data[node_data['y'] == 0].sample(n=node_data_true.shape[0])
balanced_node_data = pd.concat([node_data_true, node_data_false]).sample(frac=1)
X, y = balanced_node_data[features], balanced_node_data['y']
print(X.shape, y.shape, y.mean())

146
(292, 22) (292,) 0.5


## Split train/test

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_test.shape, y_test.shape, y_test[y_test == 1].shape, y_test.mean())

(97, 22) (97,) (60,) 0.6185567010309279


## Normalization

In [13]:
from sklearn.preprocessing import MinMaxScaler
normalized = lambda df: pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns, index=df.index)
X_train_normalized = normalized(X_train)
X_test_normalized = normalized(X_test)

## Node Potential
### SVM

In [14]:
from sklearn.svm import SVC

In [15]:
svm_1 = SVC(kernel='linear', C=1, random_state=42, probability=True)
svm_scores_1 = cross_val_score(svm_1, X_train, y_train, cv=5)
print('svm_1', svm_scores_1)

svm_1 [0.79487179 0.64102564 0.56410256 0.48717949 0.53846154]


### GP

In [16]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [17]:
kernel = 1.0 * RBF(1.0)
gpc_1 = GaussianProcessClassifier(kernel=kernel, random_state=42)
gpc_scores = cross_val_score(gpc_1, X_train_normalized, y_train, cv=5)
print('gpc_1', gpc_scores)

gpc_1 [0.74358974 0.69230769 0.64102564 0.66666667 0.61538462]


In [37]:
node_gpc = GaussianProcessClassifier(kernel=kernel, random_state=42).fit(X_train_normalized, y_train)
print(gpc.score(X_test_normalized, y_test))

0.4536082474226804


In [38]:
pred = node_gpc.predict_proba(normalized(node_data[features]))
pred_df = pd.DataFrame(pred, columns=['0', '1'], index=node_data.index)
pred_df = pred_df.merge(node_data['y'], 'inner', 'name')
pred_df['y'] = pred_df.apply(lambda row: row['y'] if row.name in X_train_normalized.index.to_list() else None, axis=1)
pred_df.to_csv('node_potential.csv')

### KNN

In [52]:
node_data

Unnamed: 0_level_0,year_of_study,participation,pe_percent,finals_percent,midterms_percent,afast,level_min_max,exp_min_max,t01_exp,t02_exp,...,major_NUS Business School,major_Pharmacy,major_Physics,major_Quantitative Finance,major_School of Computing,major_School of Design & Environment,major_Statistics,major_Yong Loo Lin School (Medicine),node_name,y
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gina_roberts,1,3.500000,0.950000,0.880000,0.820000,0.000000,1.050000,1.057083,225.0,225.0,...,0,0,0,0,0,0,0,0,gina_roberts,0
jordan_green,1,3.179201,0.581597,0.489771,0.500000,0.230317,0.916667,0.847881,275.0,250.0,...,0,0,1,0,0,0,0,0,jordan_green,0
jake_garcia,1,3.000000,0.450000,0.466667,0.280000,0.000000,0.800000,0.714487,250.0,225.0,...,0,0,0,0,0,0,0,0,jake_garcia,0
jose_kennedy,2,3.179201,0.581597,0.489771,0.350000,0.230317,0.777778,0.646764,275.0,225.0,...,0,0,0,0,0,0,0,0,jose_kennedy,1
gina_holloway,1,3.000000,0.550000,0.553333,0.360000,0.000000,0.550000,0.461128,200.0,250.0,...,0,0,0,0,0,0,0,0,gina_holloway,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
april_soto,1,4.000000,0.533333,0.310000,0.499269,1.000000,0.960000,0.941851,240.0,240.0,...,0,0,0,0,0,0,0,0,april_soto,1
nicole_ross,2,3.500000,0.300000,0.230000,0.499269,1.000000,0.840000,0.777223,260.0,220.0,...,0,0,0,0,0,0,0,0,nicole_ross,1
eric_odom,1,2.500000,0.000000,0.640000,0.240000,0.000000,0.700000,0.564097,250.0,0.0,...,0,0,0,0,0,0,0,0,eric_odom,0
shawn_melendez,1,3.000000,0.400000,0.210000,0.499269,0.000000,0.720000,0.604286,240.0,240.0,...,0,0,0,0,0,0,0,0,shawn_melendez,1


In [80]:
from sklearn.neighbors import KNeighborsClassifier

all_X_train, all_X_test, all_y_train, all_y_test = \
    train_test_split(node_data[features], node_data['y'], test_size=0.33, random_state=42)

knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_scores_5 = cross_val_score(knn_5, normalized(all_X_train), all_y_train, cv=5)
print('knn_5', knn_scores_5)

knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_scores_10 = cross_val_score(knn_10, normalized(all_X_train), all_y_train, cv=5)
print('knn_10', knn_scores_10)

knn_15 = KNeighborsClassifier(n_neighbors=15)
knn_scores_15 = cross_val_score(knn_15, normalized(all_X_train), all_y_train, cv=5)
print('knn_15', knn_scores_15)

knn_5 [0.80434783 0.86861314 0.8540146  0.81751825 0.81021898]
knn_10 [0.81884058 0.8540146  0.84671533 0.84671533 0.83211679]
knn_15 [0.81884058 0.8540146  0.8540146  0.8540146  0.83941606]


In [82]:
node_knn = KNeighborsClassifier(n_neighbors=15).fit(normalized(all_X_train), all_y_train)
print(node_knn.score(normalized(all_X_test), all_y_test))

0.8579881656804734


In [93]:
node_knn.score(X_test_normalized, y_test)

0.4020618556701031

In [83]:
pred = node_knn.predict_proba(normalized(node_data[features]))
pred_df = pd.DataFrame(pred, columns=['0', '1'], index=node_data.index)
pred_df['y_pred'] = node_knn.predict(normalized(node_data[features]))
pred_df = pred_df.merge(node_data['y'], 'inner', 'name')
pred_df['y_obs'] = pred_df.apply(lambda row: row['y'] if row.name in X_train_normalized.index.to_list() else None, axis=1)
pred_df.to_csv('node_potential.csv')

## Edge Potential

In [22]:
edges

Unnamed: 0,NodeID1,NodeID2,edge_weights
0,aaron_christian,marc_key,0.476504
1,aaron_christian,joseph_trevino,0.329899
2,aaron_pope,annette_solis,0.347540
3,aaron_pope,carol_harris,0.419707
4,aaron_powell,jennifer_simmons,0.366710
...,...,...,...
3382,zachary_beasley,robert_lopez,0.481366
3383,zachary_beasley,steven_adams,0.676140
3384,zachary_carter,richard_higgins,0.362905
3385,zachary_hernandez,thomas_barnes,1.284771


In [23]:
edge_data = edges.merge(node_data[['node_name', 'y']], how='left', left_on='NodeID1', right_on='node_name') \
                    .rename(columns={'y': 'y_1'})

edge_data = edge_data.merge(node_data[['node_name', 'y']], how='left', left_on='NodeID2', right_on='node_name') \
                    .rename(columns={'y': 'y_2'}).drop(['node_name_x', 'node_name_y'], axis=1)

def get_distance(node_name_1, node_name_2, df, features):
    return np.linalg.norm(df.loc[node_name_1][features] - df.loc[node_name_2][features])

def get_dot(node_name_1, node_name_2, df, features):
    return np.dot(df.loc[node_name_1][features], df.loc[node_name_2][features])

edge_data['distance'] = edge_data.apply(lambda row: get_distance(row['NodeID1'], row['NodeID2'], normalized(node_data[features]), features), axis=1)
edge_data['dot'] = edge_data.apply(lambda row: get_dot(row['NodeID1'], row['NodeID2'], normalized(node_data[features]), features), axis=1)

edge_data['y'] = edge_data.apply(lambda row: row['y_1'] + row['y_2'], axis=1)
edge_data['yy'] = edge_data.apply(lambda row: str(row['y_1']) + str(row['y_2']), axis=1)

edge_data = edge_data.set_index(['NodeID1', 'NodeID2'])
edge_data

Unnamed: 0_level_0,Unnamed: 1_level_0,edge_weights,y_1,y_2,distance,dot,y,yy
NodeID1,NodeID2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aaron_christian,marc_key,0.476504,1,1,1.732988,7.802318,2,11
aaron_christian,joseph_trevino,0.329899,1,0,0.991159,8.034936,1,10
aaron_pope,annette_solis,0.347540,0,0,1.843655,7.970748,0,00
aaron_pope,carol_harris,0.419707,0,0,1.398261,9.339292,0,00
aaron_powell,jennifer_simmons,0.366710,1,1,1.506382,8.654899,2,11
...,...,...,...,...,...,...,...,...
zachary_beasley,robert_lopez,0.481366,0,1,0.998299,9.044916,1,01
zachary_beasley,steven_adams,0.676140,0,0,1.505752,9.580183,0,00
zachary_carter,richard_higgins,0.362905,0,0,1.617848,8.790256,0,00
zachary_hernandez,thomas_barnes,1.284771,0,0,0.606826,10.615099,0,00


In [24]:
min_count = edge_data['y'].value_counts().min()
edge_data_0 = edge_data[edge_data['y'] == 0].sample(n=min_count)
edge_data_1 = edge_data[edge_data['y'] == 1].sample(n=min_count)
edge_data_2 = edge_data[edge_data['y'] == 2].sample(n=min_count)

balanced_edge_data = pd.concat([edge_data_0, edge_data_1, edge_data_2]).sample(frac=1)

edge_features = ['edge_weights', 'distance', 'dot']
edge_X, edge_y = balanced_edge_data[edge_features], balanced_edge_data['y']
print(edge_X.shape, edge_y.shape, edge_y.mean())

(318, 3) (318,) 1.0


In [25]:
edge_X_train, edge_X_test, edge_y_train, edge_y_test = train_test_split(edge_X, edge_y, test_size=0.33, random_state=37)
print(X_test.shape, edge_y_test.shape)
print(np.count_nonzero(edge_y_test == 0), np.count_nonzero(edge_y_test == 1), np.count_nonzero(edge_y_test == 2))

(97, 22) (105,)
41 30 34


### GP

In [26]:
kernel = 1.0 * RBF(1.0)
gpc_1 = GaussianProcessClassifier(kernel=kernel, random_state=42)
gpc_scores = cross_val_score(gpc_1, edge_X_train, edge_y_train, cv=5)
print('gpc_1', gpc_scores)

gpc_1 [0.51162791 0.3255814  0.44186047 0.33333333 0.42857143]


In [27]:
gpc = GaussianProcessClassifier(kernel=kernel, random_state=42).fit(edge_X_train, edge_y_train)
print(gpc.score(edge_X_test, edge_y_test))

0.42857142857142855


In [28]:
pred = gpc.predict_proba(edge_data[edge_features])
pred_df = pd.DataFrame(pred, columns=['0', '1', '2'], index=edge_data.index)
pred_df.to_csv('edge_potential.csv')

### KNN

In [81]:
all_edge_X_train, all_edge_X_test, all_edge_y_train, all_edge_y_test = \
    train_test_split(edge_data[edge_features], edge_data['y'], test_size=0.33, random_state=42)

edge_knn_5 = KNeighborsClassifier(n_neighbors=5)
edge_knn_scores_5 = cross_val_score(edge_knn_5, all_edge_X_train, all_edge_y_train, cv=5)
print('knn_5', edge_knn_scores_5)

edge_knn_10 = KNeighborsClassifier(n_neighbors=10)
edge_knn_scores_10 = cross_val_score(edge_knn_10, all_edge_X_train, all_edge_y_train, cv=5)
print('knn_10', edge_knn_scores_10)

edge_knn_15 = KNeighborsClassifier(n_neighbors=15)
edge_knn_scores_15 = cross_val_score(knn_15, all_edge_X_train, all_edge_y_train, cv=5)
print('knn_15', edge_knn_scores_15)

knn_5 [0.64757709 0.68502203 0.66740088 0.64537445 0.66887417]
knn_10 [0.69823789 0.70264317 0.70484581 0.68942731 0.72406181]
knn_15 [0.71585903 0.70704846 0.71145374 0.70044053 0.71302428]


In [87]:
edge_knn = KNeighborsClassifier(n_neighbors=15).fit(all_edge_X_train, all_edge_y_train)
print(edge_knn.score(all_edge_X_test, all_edge_y_test))

0.7155635062611807


In [89]:
pred = edge_knn.predict_proba(edge_data[edge_features])
pred_df = pd.DataFrame(pred, columns=['0', '1', '2'], index=edge_data.index)
pred_df.to_csv('edge_potential.csv')

## Run main

In [92]:
marginals = pd.read_csv('marginals.csv')
marginals = marginals.merge(node_data[['node_name', 'y']], 'left', left_on='name', right_on='node_name')

obs = pd.read_csv('node_potential.csv')[['name', 'y_obs']]
marginals = marginals.merge(obs, 'left', 'name')

marginals[['name', 'y_pred', 'y_obs', 'y']] \
    .sort_values(['y_pred', 'y_obs', 'y'], ascending=[False, False, False]) \
    .to_csv('results.csv', index=False)