In [43]:
from aws_helper_functions import aws_helper_functions
from sklearn.neighbors import BallTree
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np
import torch
import copy
import tqdm

In [58]:
def _retrieve_nearest_census_tract_numbers(df, local_mode='', default_latitude=40.776676,
                                           default_longitude=-73.971321):
    """Attach the nearest census tract (``boro_int``, ``census_tract_int``) to every row.

    Census tract reference points are read from Redshift and indexed in a
    BallTree; each row of ``df`` is matched to its single nearest tract.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns.
        local_mode: Forwarded unchanged to ``aws_helper_functions.read_from_redshift``.
        default_latitude: Latitude substituted where ``latitude`` is missing.
        default_longitude: Longitude substituted where ``longitude`` is missing.

    Returns:
        A copy of ``df`` with missing coordinates filled and the columns
        ``boro_int`` and ``census_tract_int`` added. The input frame is left
        untouched.
    """
    # Retrieve census tract information from Redshift.
    df_census = aws_helper_functions.read_from_redshift(
        'SELECT * FROM raw_data_science.raw_commute_census_tracts_lat_long',
        local_mode=local_mode,
    )
    # NOTE(review): BallTree's default metric treats latitude/longitude degrees
    # as planar coordinates; confirm that approximation is acceptable here.
    tree = BallTree(df_census[['lat_orig', 'long_orig']].values, leaf_size=40)

    # Work on a copy so the caller's frame (often a filtered slice) is not
    # modified and pandas does not emit chained-assignment warnings.
    df = df.copy()
    # Rows without coordinates are kept and placed at the default location
    # instead of being dropped.
    df['latitude'] = df['latitude'].fillna(default_latitude)
    df['longitude'] = df['longitude'].fillna(default_longitude)

    _, indices = tree.query(df[['latitude', 'longitude']].values, k=1)
    nearest = indices.ravel()
    # tree.query returns row positions in the array the tree was built from, so
    # index positionally; label-based .loc is only right for a 0..n-1 index.
    df['boro_int'] = df_census['boro_int'].to_numpy()[nearest]
    df['census_tract_int'] = df_census['census_tract_int'].to_numpy()[nearest]
    return df

def _get_commute_time(df, local_mode=''):
    """Left-join a per-row ``commute_time`` onto ``df``.

    Commute times per (census tract, school) are read from Redshift. The
    ``school`` values in that table are translated to the school names used as
    keys in ``_get_schools()`` so they can be joined against ``df['school']``.
    ``commute_time`` is the smaller of the walking and transit minutes, and any
    merged value above 120 is replaced by 30.

    Args:
        df: DataFrame with ``boro_int``, ``census_tract_int`` and ``school``.
        local_mode: Forwarded unchanged to ``aws_helper_functions.read_from_redshift``.

    Returns:
        ``df`` left-merged with a ``commute_time`` column (NaN where no
        matching tract/school pair exists).
    """
    query = 'SELECT * FROM raw_data_science.raw_commute_census_tracts_to_schools'
    commutes = aws_helper_functions.read_from_redshift(query, local_mode=local_mode)
    commutes = _replace_with_keys(commutes, 'school', _get_schools())

    # Coerce every duration column to numeric; unparseable entries become NaN.
    for minutes_col in ('time_walking_min', 'time_transit_min', 'time_driving_min'):
        commutes[minutes_col] = pd.to_numeric(commutes[minutes_col], errors='coerce')

    # Driving time is deliberately not part of the minimum.
    commutes['commute_time'] = commutes[['time_walking_min', 'time_transit_min']].min(axis=1)

    join_keys = ['boro_int', 'census_tract_int', 'school']
    lookup = commutes[join_keys + ['commute_time']]
    merged = df.merge(lookup, how='left', on=join_keys)

    too_long = merged['commute_time'] > 120
    merged['commute_time'] = np.where(too_long, 30, merged['commute_time'])
    return merged

def _get_schools():
    schools = {
        'SA Bed-Stuy 2': 'BED-STUY2',
        'SA Bed-Stuy': 'BED-STUY2',
        'SA Bed-Stuy Middle School': 'BED-STUY_MIDDLE_SCHOOL',
        'SA Bensonhurst': 'BENSONHURST',
        'SA Bergen Beach':'BERGEN_BEACH',
        'SA Bronx 1 Middle School': 'BRONX1',
        'SA Bronx 1': 'BRONX1',
        'SA Bronx Middle School': 'BRONX_MIDDLE_SCHOOL',
        'SA Bronx 2': 'BRONX2',
        'SA Bronx 2 Middle School': 'BRONX2_MIDDLE_SCHOOL',
        'SA Bronx 3': 'BRONX3',
        'SA Bronx 4': 'BRONX4',
        'SA Bronx 5': 'BRONX5',
        'SA Bronx 5 Upper': 'BRONX5',
        'SA Bronx 5 Lower': 'BRONX5',
        'SA Bushwick': 'BUSHWICK',
        'SA Cobble Hill': 'COBBLE_HILL',
        'SA Crown Heights': 'CROWN_HEIGHTS',
        'SA Ditmas Park Middle School': 'DITMAS_PARK_MIDDLE_SCHOOL',
        'SA East Flatbush Middle School': 'EAST_FLATBUSH_MIDDLE_SCHOOL',
        'SA Far Rockaway': 'FAR_ROCKAWAY',
        'SA Far Rockaway Middle School': 'FAR_ROCKAWAY_MIDDLE_SCHOOL',
        'SA Flatbush': 'FLATBUSH',
        'SA Hamilton Heights Middle School': 'HARLEM6',
        'SA Harlem 1': 'HARLEM1',
        'SA Harlem 2': 'HARLEM2',
        'SA Harlem 3': 'HARLEM3',
        'SA Harlem 4': 'HARLEM4',
        'SA Harlem 5': 'HARLEM5',
        'SA Harlem 6': 'HARLEM6',
        'SA Harlem East': 'HARLEM_EAST',
        'SA Harlem East Middle School': 'HARLEM_EAST',
        'SA Harlem North Central': 'HARLEM_NORTH_CENTRAL',
        'SA Harlem North Central Middle School': 'HARLEM_NORTH_CENTRAL',
        'SA Harlem West': 'HARLEM_WEST',
        'SA Harlem West Middle School': 'HARLEM_WEST',
        'SA Harlem North West': 'HARLEM_NORTH_WEST',
        'SA Harlem North West Middle School': 'HARLEM_NORTH_WEST',
        'SA Hells Kitchen': 'HELLS_KITCHEN',
        'SA Hell\'s Kitchen': 'HELLS_KITCHEN',
        'SA High School of the Liberal Arts - Manhattan': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_MANHATTAN',
        'SA High School of the Liberal Arts-Manhattan': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_MANHATTAN',
        'SA High School of the Liberal Arts - Harlem': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_HARLEM',
        'SA High School of the Liberal Arts-Harlem': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_HARLEM',
        'SA High School of the Liberal Arts - Brooklyn': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_BROOKLYN',
        'SA High School of the Liberal Arts-Brooklyn': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_BROOKLYN',
        'SA Hudson Yards': 'HUDSON_YARDS',
        'SA Hudson Yards Middle School': 'HUDSON_YARDS_MIDDLE_SCHOOL',
        'SA Kingsbridge Heights': 'KINGSBRIDGE_HEIGHTS',
        'SA Lafayette Middle School': 'LAFAYETTE_MIDDLE_SCHOOL',
        'SA Midtown West Middle School': 'MIDTOWN_WEST',
        'SA Myrtle Middle School': 'MYRTLE_MIDDLE_SCHOOL',
        'SA Norwood': 'NORWOOD',
        'SA Ozone Park Middle School': 'OZONE_PARK_MIDDLE_SCHOOL',
        'SA Prospect Heights': 'PROSPECT_HEIGHTS',
        'SA Queens Village': 'QUEENS_VILLAGE',
        'SA Rosedale': 'ROSEDALE',
        'SA Rockaway Park Middle School': 'ROCKAWAY_PARK_MIDDLE_SCHOOL',
        'SA South Jamaica': 'SOUTH_JAMAICA',
        'SA Sheepshead Bay': 'SHEEPSHEAD_BAY',
        'SA Springfield Gardens Middle School': 'SPRINGFIELD_GARDENS',
        'SA Springfield Gardens MS': 'SPRINGFIELD_GARDENS',
        'SA Springfield Gardens': 'SPRINGFIELD_GARDENS',
        'SA Union Square': 'UNION_SQUARE',
        'SA Upper West': 'UPPER_WEST',
        'SA Washington Heights': 'WASHINGTON_HEIGHTS',
        'SA Williamsburg': 'WILLIAMSBURG',
        }
    return schools

def _replace_with_keys(df, column, dictionary):
    new_df = pd.DataFrame()
    for key, value in dictionary.items():
        temp_df = df[df[column] == value].copy()
        temp_df[column] = key
        new_df = pd.concat([new_df, temp_df])
    return new_df

def _get_table_data(basetable='', local_mode=True):
    """Load the basetable CSV and build the modelling frame.

    Steps: drop rows whose ``grade`` is missing, ``'PK'`` or
    ``'Not In School'``; derive an integer ``scholar_grade`` (``'K'`` -> 0);
    attach the nearest census tract and the commute time to the accepted
    school; add intercept, school-level flags and log-commute features; keep
    only the modelling columns and drop rows with any missing value.

    Args:
        basetable: Path (or anything ``pd.read_csv`` accepts) of the basetable CSV.
        local_mode: Forwarded to the census-tract and commute-time helpers.
            Defaults to ``True``, the value previously hard-coded here.

    Returns:
        DataFrame restricted to the modelling columns, with no missing values.
    """
    raw_basetable = pd.read_csv(basetable)

    in_school = (
        ~raw_basetable['grade'].isin(['PK', 'Not In School'])
        & raw_basetable['grade'].notna()
    )
    # Explicit copy: the columns below are added to our own frame rather than
    # to a slice of raw_basetable (avoids SettingWithCopyWarning).
    df = raw_basetable[in_school].copy()

    df['scholar_grade'] = np.where(df['grade'] == 'K', '0', df['grade']).astype(int)
    df['latitude'] = df['students_home__latitude__s']
    df['longitude'] = df['students_home__longitude__s']
    df['school'] = df['accepted_school']

    df = _retrieve_nearest_census_tract_numbers(df, local_mode=local_mode)
    df = _get_commute_time(df, local_mode=local_mode)

    df['intercept'] = 1
    df['es_school'] = df['scholar_grade'].isin(np.arange(0, 5)).astype(int)
    # NOTE(review): np.arange(5, 6) matches grade 5 only - confirm that higher
    # middle-school grades are intentionally excluded from this flag.
    df['ms_school'] = df['scholar_grade'].isin(np.arange(5, 6)).astype(int)
    # NOTE(review): a commute_time of 0 yields -inf here, which dropna() below
    # does not remove - confirm zero commute times cannot occur.
    df['log_commute'] = np.log(df['commute_time'])
    df['log_commute_square'] = df['log_commute'] ** 2
    df['log_commute_third'] = df['log_commute'] ** 3

    model_columns = [
        'intercept', 'yield', 'uniform_ordered', 'accepted_first_rank', 'had_enrolled_sib',
        'ell_status', 'homeless_status', 'es_school', 'ms_school', 'orientation_rsvp',
        'virtual_event_attended', 'in_person_event_attended',
        'scholar_grade', 'commute_time',
        'log_commute', 'log_commute_square', 'log_commute_third',
        'school', 'utm_source_bucketing',
    ]
    return df[model_columns].dropna()

def set_predictors(current_predictors):
    """Return ``current_predictors`` unchanged (the same object, not a copy)."""
    return current_predictors

def set_target(current_target):
    """Return ``current_target`` unchanged (the same object, not a copy)."""
    return current_target

def setup_training_data(df, predictors, target):
    """Split ``df`` into a feature selection and a target selection.

    Returns:
        ``(df[predictors], df[target])`` - each is whatever pandas returns for
        that selector (a DataFrame for a list of names).
    """
    features = df[predictors]
    labels = df[target]
    return features, labels

def test_train_split(X, Y):
    """Split ``X`` and ``Y`` 75/25 into train and test parts with a fixed seed.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, Y, random_state=0, test_size=0.25)
    # train_test_split hands back a list; callers receive a 4-tuple.
    return tuple(parts)

class Deep(torch.nn.Module):
    """Binary classifier: three equal-width ReLU hidden layers and a sigmoid output.

    Args:
        n_features: Width of the input and of every hidden layer. When omitted
            it falls back to ``len(predictors)``, read from the notebook-level
            ``predictors`` list, which must then already be defined.
    """

    def __init__(self, n_features=None):
        super().__init__()
        if n_features is None:
            # Backward-compatible default: size the network from the global.
            n_features = len(predictors)
        self.layer1 = torch.nn.Linear(n_features, n_features)
        self.act1 = torch.nn.ReLU()
        self.layer2 = torch.nn.Linear(n_features, n_features)
        self.act2 = torch.nn.ReLU()
        self.layer3 = torch.nn.Linear(n_features, n_features)
        self.act3 = torch.nn.ReLU()
        self.output = torch.nn.Linear(n_features, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Return probabilities in [0, 1], one output per input row."""
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

    def model_train(self, X_train, y_train, X_val, y_val, **train_kwargs):
        """Method form of the module-level ``model_train``.

        Kept so ``model.model_train(...)`` continues to work; extra keyword
        arguments are forwarded to the function.
        """
        return model_train(self, X_train, y_train, X_val, y_val, **train_kwargs)


def model_train(model, X_train, y_train, X_val, y_val,
                n_epochs=250, batch_size=10, lr=0.0001, show_progress=False):
    """Train ``model`` with Adam and BCE loss, keeping the best-validation weights.

    After each epoch the accuracy on ``(X_val, y_val)`` is measured; the
    weights of the best epoch are restored before returning. Because the
    returned value is the maximum over epochs on that same validation data,
    it is an optimistic estimate.

    Args:
        model: A ``torch.nn.Module`` producing probabilities (e.g. ``Deep``).
        X_train, y_train: Training tensors; ``y_train`` must match the shape
            of the model output for ``BCELoss``.
        X_val, y_val: Validation tensors used for model selection.
        n_epochs: Number of passes over the training data.
        batch_size: Number of rows per optimisation step.
        lr: Adam learning rate.
        show_progress: When true, show a tqdm bar with per-batch loss and
            accuracy. Off by default, as the bar was previously disabled.

    Returns:
        Best validation accuracy as a float (``-inf`` if ``n_epochs`` is 0).
    """
    loss_fn = torch.nn.BCELoss()  # binary cross entropy
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    batch_starts = range(0, len(X_train), batch_size)

    # Hold the best model
    best_acc = -np.inf
    best_weights = None

    for epoch in range(n_epochs):
        model.train()
        batches = batch_starts
        if show_progress:
            batches = tqdm.tqdm(batch_starts, unit="batch", mininterval=0,
                                desc=f"Epoch {epoch}")
        for start in batches:
            # take a batch
            X_batch = X_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass and weight update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if show_progress:
                batch_acc = (y_pred.round() == y_batch).float().mean()
                batches.set_postfix(loss=float(loss), acc=float(batch_acc))

        # Evaluate at the end of each epoch; no gradients are needed, so do
        # not build an autograd graph for the validation pass.
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val)
        acc = float((val_pred.round() == y_val).float().mean())
        if acc > best_acc:
            best_acc = acc
            best_weights = copy.deepcopy(model.state_dict())

    # Restore the best weights (nothing to restore when no epoch ran).
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return best_acc

In [15]:
# Build the modelling table from the training basetable CSV (path is relative
# to the notebook's working directory) and display it.
df = _get_table_data('train_basetable.csv')
df

Unnamed: 0,intercept,yield,uniform_ordered,accepted_first_rank,had_enrolled_sib,ell_status,homeless_status,es_school,ms_school,orientation_rsvp,virtual_event_attended,in_person_event_attended,scholar_grade,commute_time,log_commute,log_commute_square,log_commute_third,school,utm_source_bucketing
0,1,0,0,1,0,1,0,1,0,0,0,0,1,46.616667,3.841958,14.760642,56.709770,SA Bronx 4,Organic / No Tracking
1,1,0,0,1,0,1,0,1,0,0,0,0,1,46.616667,3.841958,14.760642,56.709770,SA Bronx 4,Organic / No Tracking
2,1,0,0,0,0,1,0,1,0,0,0,0,3,80.066667,4.382860,19.209458,84.192360,SA Washington Heights,Google Branded Search
3,1,1,1,1,0,1,0,1,0,1,0,0,3,5.100000,1.629241,2.654425,4.324696,SA Far Rockaway,Organic / No Tracking
4,1,0,0,0,0,0,0,1,0,0,0,0,2,34.000000,3.526361,12.435219,43.851064,SA Bronx 2,Organic / No Tracking
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19269,1,0,0,1,0,0,0,1,0,1,0,0,2,8.800000,2.174752,4.729545,10.285586,SA Washington Heights,Organic / No Tracking
19270,1,0,0,1,0,1,0,1,0,0,0,0,2,27.166667,3.301991,10.903143,36.002076,SA Bergen Beach,Organic / No Tracking
19271,1,0,0,1,0,1,0,1,0,0,0,0,0,12.283333,2.508243,6.291285,15.780073,SA Bronx 1,Google Branded Search
19272,1,0,1,1,0,0,0,1,0,0,0,0,0,28.200000,3.339322,11.151071,37.237017,SA Harlem 3,Organic / No Tracking


In [34]:
# Choose the label column and the feature columns. `target` is a one-element
# list, so the label selection below is a single-column DataFrame and `y`
# becomes a 2-D (n, 1) tensor, matching the (n, 1) output of the network.
target = set_target(['yield'])
predictors = set_predictors(['uniform_ordered','accepted_first_rank','had_enrolled_sib',
                            'orientation_rsvp','virtual_event_attended','in_person_event_attended'])
X, y =  setup_training_data(df, predictors, target)
# Convert to float32 tensors for torch.
X = torch.tensor(X.values, dtype=torch.float32)
y = torch.tensor(y.values, dtype=torch.float32)

In [48]:
# 5-fold stratified cross-validation: each fold trains a fresh model and is
# scored on that fold's held-out split. No separate final test set is carved
# out in this cell.
# Seed both the fold shuffling and torch's RNG (weight initialisation) so that
# re-running the cell reproduces the scores.
torch.manual_seed(0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = []
for train, test in kfold.split(X, y):
    # create model, train, and get accuracy
    model = Deep()
    # Call the trainer through the model so the cell does not depend on a
    # free function named model_train existing in the kernel.
    acc = model.model_train(X[train], y[train], X[test], y[test])
    print("Accuracy (wide): %.2f" % acc)
    cv_scores.append(acc)

# evaluate the model
acc = np.mean(cv_scores)
std = np.std(cv_scores)
print("Model accuracy: %.2f%% (+/- %.2f%%)" % (acc*100, std*100))

Accuracy (wide): 0.89
Accuracy (wide): 0.90
Accuracy (wide): 0.89
Accuracy (wide): 0.89
Accuracy (wide): 0.89
Model accuracy: 89.18% (+/- 0.46%)
