In [14]:
from aws_helper_functions import aws_helper_functions
from sklearn.neighbors import BallTree
from sklearn.model_selection import train_test_split
import sklearn.metrics as sk_metrics
import pandas as pd
import numpy as np
import tensorflow as tf

In [20]:
# Set hyperparameters
num_classes = 1
num_features = 64
learning_rate = 0.01
training_steps = 1000
batch_size = 200
display_step = 50

def _get_table_data(basetable=''):
    raw_basetable = pd.read_csv(basetable)
    df = raw_basetable
    df = df[~df['grade'].isin(['PK', 'Not In School'])]
    df = df[~df['grade'].isnull()]
    df['scholar_grade'] = np.where(df['grade']=='K','0',df['grade']).astype(int)
    df['latitude'] = df['students_home__latitude__s']
    df['longitude'] = df['students_home__longitude__s']
    df['school'] = df['accepted_school']
    df = _get_commute_time(_retrieve_nearest_census_tract_numbers(df,local_mode=True),local_mode=True)
    df['intercept'] = 1
    df['es_school'] = df['scholar_grade'].isin(np.arange(0,5)).astype(int)
    df['ms_school'] = df['scholar_grade'].isin(np.arange(5,6)).astype(int)
    df['log_commute'] = np.log(df['commute_time'])
    df['log_commute_square'] = df['log_commute'] ** 2
    df['log_commute_third'] = df['log_commute'] ** 3
    df = df[['intercept','yield','uniform_ordered','accepted_first_rank','had_enrolled_sib','ell_status','homeless_status','es_school','ms_school','orientation_rsvp','virtual_event_attended','in_person_event_attended',
            'scholar_grade','commute_time',
            'log_commute','log_commute_square','log_commute_third',
            'school','utm_source_bucketing']]
    df = df.dropna()
    return df

def _retrieve_nearest_census_tract_numbers(df, local_mode=''):
    # retrive census tract information from redshift
    df_census = aws_helper_functions.read_from_redshift('SELECT * FROM raw_data_science.raw_commute_census_tracts_lat_long', local_mode=local_mode)
    # Create a BallTree for the census tract latitudes and longitudes
    tree = BallTree(df_census[['lat_orig', 'long_orig']].values, leaf_size=40)
    
    #drop rows in df where students_home__latitude__s or students_home__longitude__s is null
    #df = df.dropna(subset=['latitude', 'longitude'])
    df['latitude'] = np.where(df['latitude'].isna()==True, 40.776676, df['latitude'])
    df['longitude'] = np.where(df['longitude'].isna()==True, -73.971321, df['longitude'])
    
    distances, indices = tree.query(df[['latitude', 'longitude']].values, k=1)
    df.loc[:, 'boro_int'] = df_census.loc[indices.flatten(), 'boro_int'].values.copy()
    df.loc[:, 'census_tract_int'] = df_census.loc[indices.flatten(), 'census_tract_int'].values.copy()
    return df

def _get_commute_time(df, local_mode=''):
    #retrieve school-census tract commute times from redshift
    df_commutes = aws_helper_functions.read_from_redshift('SELECT * FROM raw_data_science.raw_commute_census_tracts_to_schools', local_mode=local_mode)
    
    schools = _get_schools()
    df_commutes = _replace_with_keys(df_commutes, 'school', schools)
    #set df_commutes.time_walking_min, time_transit_min, and time_driving_min to numeric
    df_commutes['time_walking_min'] = pd.to_numeric(df_commutes['time_walking_min'], errors='coerce')
    df_commutes['time_transit_min'] = pd.to_numeric(df_commutes['time_transit_min'], errors='coerce')
    df_commutes['time_driving_min'] = pd.to_numeric(df_commutes['time_driving_min'], errors='coerce')
    
    df_commutes['commute_time'] = df_commutes[['time_walking_min', 'time_transit_min']].min(axis=1)
    df_commutes = df_commutes[['boro_int', 'census_tract_int', 'school', 'commute_time']]
    
    df = df.merge(df_commutes, how='left', on=['boro_int', 'census_tract_int', 'school'])
    df['commute_time'] = np.where(df['commute_time']>120,30,df['commute_time'])
    return df

def _get_schools():
    schools = {
        'SA Bed-Stuy 2': 'BED-STUY2',
        'SA Bed-Stuy': 'BED-STUY2',
        'SA Bed-Stuy Middle School': 'BED-STUY_MIDDLE_SCHOOL',
        'SA Bensonhurst': 'BENSONHURST',
        'SA Bergen Beach':'BERGEN_BEACH',
        'SA Bronx 1 Middle School': 'BRONX1',
        'SA Bronx 1': 'BRONX1',
        'SA Bronx Middle School': 'BRONX_MIDDLE_SCHOOL',
        'SA Bronx 2': 'BRONX2',
        'SA Bronx 2 Middle School': 'BRONX2_MIDDLE_SCHOOL',
        'SA Bronx 3': 'BRONX3',
        'SA Bronx 4': 'BRONX4',
        'SA Bronx 5': 'BRONX5',
        'SA Bronx 5 Upper': 'BRONX5',
        'SA Bronx 5 Lower': 'BRONX5',
        'SA Bushwick': 'BUSHWICK',
        'SA Cobble Hill': 'COBBLE_HILL',
        'SA Crown Heights': 'CROWN_HEIGHTS',
        'SA Ditmas Park Middle School': 'DITMAS_PARK_MIDDLE_SCHOOL',
        'SA East Flatbush Middle School': 'EAST_FLATBUSH_MIDDLE_SCHOOL',
        'SA Far Rockaway': 'FAR_ROCKAWAY',
        'SA Far Rockaway Middle School': 'FAR_ROCKAWAY_MIDDLE_SCHOOL',
        'SA Flatbush': 'FLATBUSH',
        'SA Hamilton Heights Middle School': 'HARLEM6',
        'SA Harlem 1': 'HARLEM1',
        'SA Harlem 2': 'HARLEM2',
        'SA Harlem 3': 'HARLEM3',
        'SA Harlem 4': 'HARLEM4',
        'SA Harlem 5': 'HARLEM5',
        'SA Harlem 6': 'HARLEM6',
        'SA Harlem East': 'HARLEM_EAST',
        'SA Harlem East Middle School': 'HARLEM_EAST',
        'SA Harlem North Central': 'HARLEM_NORTH_CENTRAL',
        'SA Harlem North Central Middle School': 'HARLEM_NORTH_CENTRAL',
        'SA Harlem West': 'HARLEM_WEST',
        'SA Harlem West Middle School': 'HARLEM_WEST',
        'SA Harlem North West': 'HARLEM_NORTH_WEST',
        'SA Harlem North West Middle School': 'HARLEM_NORTH_WEST',
        'SA Hells Kitchen': 'HELLS_KITCHEN',
        'SA Hell\'s Kitchen': 'HELLS_KITCHEN',
        'SA High School of the Liberal Arts - Manhattan': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_MANHATTAN',
        'SA High School of the Liberal Arts-Manhattan': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_MANHATTAN',
        'SA High School of the Liberal Arts - Harlem': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_HARLEM',
        'SA High School of the Liberal Arts-Harlem': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_HARLEM',
        'SA High School of the Liberal Arts - Brooklyn': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_BROOKLYN',
        'SA High School of the Liberal Arts-Brooklyn': 'HIGH_SCHOOL_OF_THE_LIBERAL_ARTS_-_BROOKLYN',
        'SA Hudson Yards': 'HUDSON_YARDS',
        'SA Hudson Yards Middle School': 'HUDSON_YARDS_MIDDLE_SCHOOL',
        'SA Kingsbridge Heights': 'KINGSBRIDGE_HEIGHTS',
        'SA Lafayette Middle School': 'LAFAYETTE_MIDDLE_SCHOOL',
        'SA Midtown West Middle School': 'MIDTOWN_WEST',
        'SA Myrtle Middle School': 'MYRTLE_MIDDLE_SCHOOL',
        'SA Norwood': 'NORWOOD',
        'SA Ozone Park Middle School': 'OZONE_PARK_MIDDLE_SCHOOL',
        'SA Prospect Heights': 'PROSPECT_HEIGHTS',
        'SA Queens Village': 'QUEENS_VILLAGE',
        'SA Rosedale': 'ROSEDALE',
        'SA Rockaway Park Middle School': 'ROCKAWAY_PARK_MIDDLE_SCHOOL',
        'SA South Jamaica': 'SOUTH_JAMAICA',
        'SA Sheepshead Bay': 'SHEEPSHEAD_BAY',
        'SA Springfield Gardens Middle School': 'SPRINGFIELD_GARDENS',
        'SA Springfield Gardens MS': 'SPRINGFIELD_GARDENS',
        'SA Springfield Gardens': 'SPRINGFIELD_GARDENS',
        'SA Union Square': 'UNION_SQUARE',
        'SA Upper West': 'UPPER_WEST',
        'SA Washington Heights': 'WASHINGTON_HEIGHTS',
        'SA Williamsburg': 'WILLIAMSBURG',
        }
    return schools

def _replace_with_keys(df, column, dictionary):
    new_df = pd.DataFrame()
    for key, value in dictionary.items():
        temp_df = df[df[column] == value].copy()
        temp_df[column] = key
        new_df = pd.concat([new_df, temp_df])
    return new_df

def set_predictors(current_predictors):
    predictors = current_predictors
    return predictors

def set_target(current_target):
    target = current_target
    return target

def setup_training_data(df, predictors, target):
    X = df[predictors]
    Y = df[target]
    return X, Y

def test_train_split(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
    return X_train, X_test, y_train, y_test

def features_to_array(df, num_features=num_features):
    array = np.array(df, np.float32).reshape([-1, num_features])
    return array

def classes_to_array(df, num_features=num_features):
    array = np.array(df, np.float32).reshape([-1, num_classes])
    return array

def array_to_tensor(x_train, y_train, batch_size = batch_size):
    train_data=tf.data.Dataset.from_tensor_slices((x_train,y_train)).repeat().shuffle(5000).batch(batch_size).prefetch(1)
    return train_data

def set_weight(num_features = num_features, num_classes = num_classes):
    W = tf.Variable(tf.ones([num_features, num_classes]), name="weight")
    return W

def set_bias(num_classes = num_classes):
    b = tf.Variable(tf.zeros([num_classes]), name="bias")
    return b

# Logistic regression (Wx + b).
def logistic_regression(x):
    # Apply softmax to normalize the logits to a probability distribution.
    return tf.nn.softmax(tf.matmul(x, W) + b)
# Cross-Entropy loss function.ß

def cross_entropy(y_pred, y_true):
    # Encode label to a one hot vector.
    # Clip prediction values to avoid log(0) error.
    y_pred = tf.clip_by_value(y_pred, 1e-9, 1.)
    # Compute cross-entropy.
    return tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred)))

# Accuracy metric.
def accuracy(y_pred, y_true):
# Predicted class is the index of the highest score in prediction vector (i.e. argmax).
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Stochastic gradient descent optimizer.
optimizer = tf.optimizers.SGD(learning_rate)

# Optimization process. 
def run_optimization(x, y):
# Wrap computation inside a GradientTape for automatic differentiation.
    with tf.GradientTape() as g:
        pred = logistic_regression(x)
        loss = cross_entropy(pred, y)
    # Compute gradients.
    gradients = g.gradient(loss, [W, b])
    # Update W and b following gradients.
    optimizer.apply_gradients(zip(gradients, [W, b]))

In [21]:
# Get data
df = _get_table_data('train_basetable.csv')

# Set target and predictors
target = set_target(['yield'])
predictors = set_predictors(['uniform_ordered','accepted_first_rank','had_enrolled_sib',
                            'orientation_rsvp','virtual_event_attended','in_person_event_attended',
                            'scholar_grade','commute_time',
                            'utm_source_bucketing','school'])

# Prepare data
X, y =  setup_training_data(df, predictors, target)
X = pd.get_dummies(X, columns=['utm_source_bucketing','school'])
x_train, x_test, y_train, y_test =  test_train_split(X, y)
x_train = features_to_array(x_train)
x_test = features_to_array(x_test)
y_train = classes_to_array(y_train)
y_test = classes_to_array(y_test)
train_data = array_to_tensor(x_train, y_train)

# Set weight and bias
W = set_weight()
b = set_bias()

In [22]:
# Run training for the given number of steps.
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    run_optimization(batch_x, batch_y)
    if step % display_step == 0:
        pred = logistic_regression(batch_x)
        loss = cross_entropy(pred, batch_y)
        acc = accuracy(pred, batch_y)
        print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc))

step: 50, loss: -0.000000, accuracy: 0.745000
step: 100, loss: -0.000000, accuracy: 0.690000
step: 150, loss: -0.000000, accuracy: 0.705000
step: 200, loss: -0.000000, accuracy: 0.705000
step: 250, loss: -0.000000, accuracy: 0.745000
step: 300, loss: -0.000000, accuracy: 0.705000
step: 350, loss: -0.000000, accuracy: 0.715000
step: 400, loss: -0.000000, accuracy: 0.715000
step: 450, loss: -0.000000, accuracy: 0.730000
step: 500, loss: -0.000000, accuracy: 0.735000
step: 550, loss: -0.000000, accuracy: 0.770000
step: 600, loss: -0.000000, accuracy: 0.735000
step: 650, loss: -0.000000, accuracy: 0.745000
step: 700, loss: -0.000000, accuracy: 0.755000
step: 750, loss: -0.000000, accuracy: 0.690000
step: 800, loss: -0.000000, accuracy: 0.745000
step: 850, loss: -0.000000, accuracy: 0.770000
step: 900, loss: -0.000000, accuracy: 0.705000
step: 950, loss: -0.000000, accuracy: 0.755000
step: 1000, loss: -0.000000, accuracy: 0.745000


2024-04-02 14:31:15.283345: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [23]:
# Test model on validation set.
pred = logistic_regression(x_test)
print("Test Accuracy: %f" % accuracy(pred, y_test))

Test Accuracy: 0.737912
