In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats

from collections import Counter
import warnings
# Turn off all warnings for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides pandas' SettingWithCopyWarning
# and library deprecation notices -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from statistics import mean
from statistics import stdev
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, cross_val_score, KFold

In [2]:
# Most of these feature transforms were exploratory guesses; `reverse` and `add_log` did not make it into the final model.

def reverse(data_df):
    """Return a copy of ``data_df`` with "reversed" score columns appended.

    Every ``<column>_rev`` value is a fixed constant minus the source value,
    so a high score maps to a small number:

    * ``GPA_rev``              = 4   - ``GPA``
    * ``GRE_Verbal_rev``       = 170 - ``GRE_Verbal``
    * ``GRE_Quantitative_rev`` = 170 - ``GRE_Quantitative``
    * ``GRE_Writing_rev``      = 5   - ``GRE_Writing``
    * ``GRE_Subject_rev``      = 990 - ``GRE_Subject``
    * ``Decision_rev``         = 1   - ``Decision``

    The input frame is left untouched: the work is done on a copy, so passing
    a slice of another DataFrame neither triggers ``SettingWithCopyWarning``
    nor leaks new columns into the caller's object.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain the six source columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the ``*_rev``
        columns, in the order listed above.

    Raises
    ------
    KeyError
        If one of the source columns is missing.
    """
    # Constant each source column is subtracted from (insertion order fixes
    # the order in which the new columns are appended).
    # NOTE(review): GRE analytical writing is scored up to 6.0; with 5 here a
    # score above 5 produces a negative value, which add_log cannot handle once
    # it reaches -1 -- confirm that 5 is intended.
    ceilings = {
        'GPA': 4,
        'GRE_Verbal': 170,
        'GRE_Quantitative': 170,
        'GRE_Writing': 5,
        'GRE_Subject': 990,
        'Decision': 1,
    }

    reversed_df = data_df.copy()
    for column, ceiling in ceilings.items():
        # Vectorized subtraction replaces the per-element lambda.
        reversed_df[f'{column}_rev'] = ceiling - reversed_df[column]
    return reversed_df

def add_log(data_df):
    """Return a copy of ``data_df`` with ``log10(x + 1)`` of each ``*_rev`` column.

    For each of ``GPA``, ``GRE_Verbal``, ``GRE_Quantitative``, ``GRE_Writing``
    and ``GRE_Subject`` the column ``<name>_rev`` is read and
    ``<name>_log = log10(<name>_rev + 1)`` is appended. The ``*_rev`` columns
    are the ones produced by ``reverse``, so that transform has to run first.

    The input frame is not modified; the new columns are added to a copy.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain the five ``*_rev`` columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the five
        ``*_log`` columns.

    Raises
    ------
    KeyError
        If a required ``*_rev`` column is missing.
    ValueError
        If any ``*_rev`` value is <= -1, for which ``log10(x + 1)`` is
        undefined. Missing values (NaN) pass through as NaN.
    """
    stems = ('GPA', 'GRE_Verbal', 'GRE_Quantitative', 'GRE_Writing', 'GRE_Subject')

    logged_df = data_df.copy()
    for stem in stems:
        shifted = logged_df[f'{stem}_rev'] + 1
        # Fail loudly (and with the column name) instead of emitting -inf/NaN;
        # ValueError matches what math.log raised for the same inputs.
        if (shifted <= 0).any():
            raise ValueError(
                f"{stem}_rev contains values <= -1; log10(x + 1) is undefined for them"
            )
        # np.log10 is vectorized and avoids the log(x)/log(10) rounding of the
        # two-argument math.log(x, 10) form.
        logged_df[f'{stem}_log'] = np.log10(shifted)
    return logged_df

def add_sq(data_df):
    """Return a copy of ``data_df`` with squared score columns appended.

    For each of ``GPA``, ``GRE_Verbal``, ``GRE_Quantitative``, ``GRE_Writing``
    and ``GRE_Subject`` a column ``<name>_sq = <name> ** 2`` is added.

    The input frame is not modified; the new columns are added to a copy.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain the five source columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the five
        ``*_sq`` columns.

    Raises
    ------
    KeyError
        If a required source column is missing.
    """
    score_columns = ('GPA', 'GRE_Verbal', 'GRE_Quantitative', 'GRE_Writing', 'GRE_Subject')

    squared_df = data_df.copy()
    for column in score_columns:
        # Vectorized power replaces the per-element lambda.
        squared_df[f'{column}_sq'] = squared_df[column] ** 2
    return squared_df

def add_pc(data_df):
    """Return a copy of ``data_df`` with normal-CDF "percentile" columns appended.

    Each ``<name>_pc`` value is the cumulative probability of the score under
    a normal distribution with a hard-coded mean and standard deviation:

    * ``GRE_Verbal``       -> N(150.05, 8.43)
    * ``GRE_Quantitative`` -> N(152.80, 9.13)
    * ``GRE_Writing``      -> N(3.5, 0.87)
    * ``GRE_Subject``      -> N(712, 158)

    The CDF is evaluated once per column on the whole array rather than by
    constructing a new frozen distribution object for every row.

    The input frame is not modified; the new columns are added to a copy.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain the four GRE columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the four
        ``*_pc`` columns, each in the range [0, 1] (NaN where the score is
        missing).

    Raises
    ------
    KeyError
        If a required source column is missing.
    """
    # (mean, standard deviation) used for each score.
    # NOTE(review): presumably population statistics for each GRE section --
    # the source of these numbers is not recorded here; confirm and cite it.
    score_distributions = {
        'GRE_Verbal': (150.05, 8.43),
        'GRE_Quantitative': (152.80, 9.13),
        'GRE_Writing': (3.5, 0.87),
        'GRE_Subject': (712, 158),
    }

    percentile_df = data_df.copy()
    for column, (mean_score, std_score) in score_distributions.items():
        percentile_df[f'{column}_pc'] = scipy.stats.norm.cdf(
            percentile_df[column].to_numpy(), loc=mean_score, scale=std_score
        )
    return percentile_df

def add_cross(data_df):
    """Return a copy of ``data_df`` with GRE interaction columns appended.

    Despite the ``x`` in their names, the new columns are geometric means
    rather than raw products:

    * ``GREVxGREQ``      = sqrt(GRE_Verbal) * sqrt(GRE_Quantitative)
    * ``GREVxGRES``      = sqrt(GRE_Verbal) * sqrt(GRE_Subject)
    * ``GREQxGRES``      = sqrt(GRE_Quantitative) * sqrt(GRE_Subject)
    * ``GREVxGREQxGRES`` = cube-root(GRE_Verbal) * cube-root(GRE_Quantitative)
      * cube-root(GRE_Subject)

    The input frame is not modified; the new columns are added to a copy.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must contain ``GRE_Verbal``, ``GRE_Quantitative`` and ``GRE_Subject``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the four
        interaction columns.

    Raises
    ------
    KeyError
        If a required source column is missing.
    """
    crossed_df = data_df.copy()

    # Read each source column once; the expressions below keep the original
    # operand order so the floating-point results are unchanged.
    verbal = crossed_df['GRE_Verbal']
    quantitative = crossed_df['GRE_Quantitative']
    subject = crossed_df['GRE_Subject']

    crossed_df['GREVxGREQ'] = verbal ** 0.5 * quantitative ** 0.5
    crossed_df['GREVxGRES'] = verbal ** 0.5 * subject ** 0.5
    crossed_df['GREQxGRES'] = quantitative ** 0.5 * subject ** 0.5
    crossed_df['GREVxGREQxGRES'] = (
        verbal ** (1 / 3) * quantitative ** (1 / 3) * subject ** (1 / 3)
    )
    return crossed_df

In [3]:
# Import the raw table.
physics_df = pd.read_csv('data/physics-df.csv')
# "Unnamed: 0" is the name pandas gives a header-less first column
# (presumably the row index written by an earlier to_csv) -- drop it.
physics_df = physics_df.drop(columns = ["Unnamed: 0"])

# Keep only the columns used for feature engineering. The explicit .copy()
# makes the subset an independent frame, so adding feature columns to it later
# neither raises SettingWithCopyWarning nor depends on view-vs-copy behaviour.
physics_subset_df = physics_df[[
    'GPA',
    'GRE_Verbal',
    'GRE_Quantitative',
    'GRE_Writing',
    'GRE_Subject',
    'American',
    'Papers',
    'Research',
    'Decision',
]].copy()

In [4]:
# Add features: thread the frame through each transform in turn.
# Order matters for the first two steps -- add_log reads the *_rev columns
# that reverse creates.
physics_subset_df = (
    physics_subset_df
    .pipe(reverse)
    .pipe(add_log)
    .pipe(add_sq)
    .pipe(add_pc)
    .pipe(add_cross)
)
physics_subset_df

Unnamed: 0,GPA,GRE_Verbal,GRE_Quantitative,GRE_Writing,GRE_Subject,American,Papers,Research,Decision,GPA_rev,...,GRE_Writing_sq,GRE_Subject_sq,GRE_Verbal_pc,GRE_Quantitative_pc,GRE_Writing_pc,GRE_Subject_pc,GREVxGREQ,GREVxGRES,GREQxGRES,GREVxGREQxGRES
0,3.77,154.0,167.0,3.0,880.0,0,0.0,0.0,1,0.23,...,9.00,774400.0,0.680310,0.940064,0.282743,0.856175,160.368326,368.130412,383.353623,282.861140
1,3.63,156.0,163.0,4.0,570.0,1,0.0,0.0,0,0.37,...,16.00,324900.0,0.759848,0.868045,0.717257,0.184397,159.461594,298.194567,304.811417,243.816085
2,3.20,158.0,159.0,4.0,610.0,1,1.0,0.0,0,0.80,...,16.00,372100.0,0.827175,0.751456,0.717257,0.259279,158.499211,310.451284,311.432176,248.386516
3,3.58,159.0,160.0,4.0,610.0,1,1.0,1.0,0,0.42,...,16.00,372100.0,0.855811,0.784830,0.717257,0.259279,159.499216,311.432176,312.409987,249.430169
4,3.10,162.0,163.0,3.0,880.0,0,0.0,0.0,0,0.90,...,9.00,774400.0,0.921840,0.868045,0.282743,0.856175,162.499231,377.571185,378.734736,285.361315
5,3.94,164.0,163.0,5.0,750.0,1,0.0,0.0,0,0.06,...,25.00,562500.0,0.951018,0.868045,0.957659,0.595031,163.499235,350.713558,349.642675,271.663258
6,3.97,159.0,168.0,4.0,950.0,1,0.0,1.0,0,0.03,...,16.00,902500.0,0.855811,0.952028,0.717257,0.934009,163.438062,388.651515,399.499687,293.861937
7,3.90,159.0,168.0,4.0,970.0,0,0.0,0.0,0,0.10,...,16.00,940900.0,0.855811,0.952028,0.717257,0.948756,163.438062,392.721275,403.683044,295.909822
8,3.90,162.0,162.0,4.0,850.0,1,0.0,0.0,0,0.10,...,16.00,722500.0,0.921840,0.843193,0.717257,0.808782,162.000000,371.079506,371.079506,281.502972
9,3.93,158.0,170.0,4.0,980.0,0,0.0,0.0,0,0.07,...,16.00,960400.0,0.827175,0.970211,0.717257,0.955076,163.890207,393.497141,408.166633,297.470585


In [6]:
# Sanity check: list every column now present on the feature table.
print(physics_subset_df.columns)

Index(['GPA', 'GRE_Verbal', 'GRE_Quantitative', 'GRE_Writing', 'GRE_Subject',
       'American', 'Papers', 'Research', 'Decision', 'GPA_rev',
       'GRE_Verbal_rev', 'GRE_Quantitative_rev', 'GRE_Writing_rev',
       'GRE_Subject_rev', 'Decision_rev', 'GPA_log', 'GRE_Verbal_log',
       'GRE_Quantitative_log', 'GRE_Writing_log', 'GRE_Subject_log', 'GPA_sq',
       'GRE_Verbal_sq', 'GRE_Quantitative_sq', 'GRE_Writing_sq',
       'GRE_Subject_sq', 'GRE_Verbal_pc', 'GRE_Quantitative_pc',
       'GRE_Writing_pc', 'GRE_Subject_pc', 'GREVxGREQ', 'GREVxGRES',
       'GREQxGRES', 'GREVxGREQxGRES'],
      dtype='object')


In [8]:
# Save point: write the feature table to disk.
# to_csv is called with its default index=True, so the row index is written as a
# header-less first column; pandas names that column "Unnamed: 0" when the file
# is read back with a plain read_csv.
physics_subset_df.to_csv('data/physics-ft-df.csv') # Save point