In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, OneHotEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from xgboost import XGBRegressor



In [21]:
x = pd.read_csv('train_values.csv', index_col='row_id')
test = pd.read_csv('test_values.csv', index_col='row_id')
y = pd.read_csv('train_labels.csv', index_col='row_id')
x.drop('report_year', axis=1, inplace=True)

In [3]:
aid_clusters = ['aid__cumulative_debt_75th_percentile', 
                'aid__cumulative_debt_90th_percentile',
                'aid__median_debt_completers_monthly_payments', 
                'aid__median_debt_completers_overall']

cost_clusters = ['cost__attendance_academic_year',
                'cost__tuition_in_state']

kmeans = KMeans(n_clusters=2, random_state=0)

temp = x.loc[:, aid_clusters]
temp.fillna(temp.mean(), inplace=True)
kmeans.fit(temp)
x['custom__aid_clusters'] = kmeans.labels_ 

temp = x.loc[:, cost_clusters]
temp.fillna(temp.mean(), inplace=True)
kmeans.fit(temp)
x['custom__cost_clusters'] = kmeans.labels_

In [4]:
x['custom__academics_num_bach'] = x.filter(like='_bach').replace(0,np.nan).sum(axis=1)
x['custom__school_campus'] = x.school__main_campus.isin(['Main campus']).astype('int')
x['custom__religious_high'] = x.school__religious_affiliation.isin(['myw','cxp','pdf','aiy','thg','zug','sdh','bmv','fxo','qys',
                                             'nnm','onn','ibe','ntl','smi','aai','huu','mix','dpu','fuf','dqz','hmn',
                                             'xds','qzo','mky','hap','fiy','gju','lrj','emi','ddx','jqf']).astype('int')
x['custom__religious_low'] = x.school__religious_affiliation.isin(['wxa', 'prn', 'qyb', 'nhu','uac', 'rgp', 'iqp',]).astype('int')
x['custom__school_state_low'] = x.school__state.isin(['tus', 'nni','noz','ugr','aku','kta','qbv','iju','msx','qid','fen','bbk','sbh','uod',
                                           'gai','idl','gzi','xfa','qua','yyg','xtb','dlg','pgp','krj','bxo','zms','ste',]).astype('int')
x['custom__school_state_high'] = x.school__state.isin(['prq', 'mig', 'tdb', 'iya', 'wzk', 'afu', 'iyc', 'exw', 'npw', 'rmt','jor','cyf','cmn','ncw','usz',
                                           'tlt','kho','xhl','dhx','nja','ony','rbl','xgy','fyo','das','fjm','hgy']).astype('int')
x['custom__school_degrees_awarded_graduate'] = x.school__degrees_awarded_highest.isin(['Graduate degree']).astype('int')
x['custom__school_region_low'] = x.school__region_id.isin(['Southwest (AZ, NM, OK, TX)', 
                                 'Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)',
                                 'Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)',
                                ]).astype('int')
x['custom__school_region_high'] = x.school__region_id.isin(['New England (CT, ME, MA, NH, RI, VT)',
                                'Mid East (DE, DC, MD, NJ, NY, PA)',
                                'Plains (IA, KS, MN, MO, NE, ND, SD)',
                                'U.S. Service Schools']).astype('int')
x['custom__carnegie_undergrad_low'] = x.school__carnegie_undergrad.isin(['Four-year, full-time, inclusive, lower transfer-in', 
                                         'Four-year, higher part-time',
                                         'Four-year, medium full-time, inclusive, lower transfer-in',
                                         'Two-year, medium full-time',
                                         'Two-year, higher full-time',
                                         'Not applicable',
                                         'Two-year, mixed part/full-time',
                                         'Four-year, medium full-time, inclusive, higher transfer-in',
                                         'Two-year, higher part-time',]).astype('int')
x['custom__carnegie_undergrad_high'] = x.school__carnegie_undergrad.isin(['Four-year, full-time, more selective, lower transfer-in',
                                         'Four-year, full-time, more selective, higher transfer-in',
                                         'Four-year, full-time, selective, higher transfer-in',
                                         'Four-year, full-time, selective, lower transfer-in',
                                         'Four-year, medium full-time, selective, lower transfer-in',
                                         'Four-year, medium full-time, selective, higher transfer-in',
                                         'Not classified (Exclusively Graduate)']).astype('int')
x['custom__school_carnegie_size_setting_high'] = x.school__carnegie_size_setting.isin(['Four-year, large, highly residential',
                                             'Four-year, large, primarily residential',
                                             'Four-year, medium, highly residential',
                                             'Four-year, medium, primarily residential',
                                             'Four-year, small, highly residential',
                                             'Exclusively graduate/professional',
                                             'Four-year, small, primarily residential'
                                         ]).astype('int')
x['custom__school_carnegie_size_setting_low'] = x.school__carnegie_size_setting.isin(['Four-year, very small, primarily nonresidential',
                                             'Not applicable',
                                             'Two-year, large',
                                             'Two-year, medium',
                                             'Two-year, small',
                                             'Two-year, very large',
                                             'Two-year, very small'
                                         ]).astype('int')
x['custom__carnegie_basic_high'] = x.school__carnegie_basic.isin(['Baccalaureate Colleges: Arts & Sciences Focus',
                                      'Doctoral Universities: Highest Research Activity',
                                      "Master's Colleges & Universities: Larger Programs",
                                      'Doctoral Universities: Higher Research Activity',
                                      "Master's Colleges & Universities: Medium Programs",
                                      'Special Focus Four-Year: Medical Schools & Centers',
                                      "Master's Colleges & Universities: Small Programs",
                                      "Doctoral Universities: Moderate Research Activity",
                                      "Special Focus Four-Year: Law Schools",
                                      "Special Focus Four-Year: Engineering Schools",                                      
                                         ]).astype('int')
x['custom__carnegie_basic_low'] = x.school__carnegie_basic.isin(['Not applicable',
                                      "Baccalaureate/Associate's Colleges: Mixed Baccalaureate/Associate's",
                                      "Special Focus Two-Year: Health Professions",
                                      "Baccalaureate/Associate's Colleges: Associate's Dominant",
                                      "Associate's Colleges: High Vocational & Technical-High Nontraditional",
                                      "Special Focus Four-Year: Other Technology-Related Schools",
                                      "Associate's Colleges: High Vocational & Technical-Mixed Traditional/Nontraditional",
                                      "Special Focus Two-Year: Other Fields",
                                      "Special Focus Four-Year: Business & Management Schools",
                                      "Associate's Colleges: Mixed Transfer/Vocational & Technical-Mixed Traditional/Nontraditional"
                                     ]).astype('int')
x['custom__school_degrees_predominant_high'] = x.school__degrees_awarded_predominant.isin(["Predominantly bachelor's-degree granting",
                                                  "Entirely graduate-degree granting"
                                                  ]).astype('int')
x['custom__school_institutional_characteristics_level'] = x.school__institutional_characteristics_level.isin(['4-year']).astype('int')
x['custom__school_ownership'] = x.school__ownership.isin(["Private nonprofit", "Public"]).astype('int')
x['custom__school_online'] = x.school__online_only.isin(['Not distance-education only']).astype('int')
x['custom__school_locale'] = x.school__locale.isin(['Suburb: Large (outside principal city, in urbanized area with population of 250,000 or more)',
                                            'City: Large (population of 250,000 or more)',
                                            'City: Midsize (population of at least 100,000 but less than 250,000)',
                                            'Rural: Fringe (rural territory up to 5 miles from an urbanized area or up to 2.5 miles from an urban cluster)']).astype('int')

In [5]:
log_cols = ['cost__title_iv_private_by_income_level_75001_110000',
          'cost__title_iv_public_by_income_level_110001_plus',
          'completion__completion_cohort_4yr_100nt',
          'cost__title_iv_private_by_income_level_75001_110000',       
           ]
for col in log_cols:
    x['custom__log_{}'.format(col)] = x[col].map(lambda x: np.log(x+1))

In [6]:
log_percentage = ['academics__program_percentage_english', 'academics__program_percentage_history', 
                  'academics__program_percentage_social_science', 'student__demographics_veteran']
for col in log_percentage:
    x['custom__log_{}'.format(col)] = x[col].map(lambda x: np.log(x*100+1))

In [37]:
x.filter(like='scores').fillna(0).isnull().any()

admissions__act_scores_25th_percentile_cumulative          False
admissions__act_scores_25th_percentile_english             False
admissions__act_scores_25th_percentile_math                False
admissions__act_scores_25th_percentile_writing             False
admissions__act_scores_75th_percentile_cumulative          False
admissions__act_scores_75th_percentile_english             False
admissions__act_scores_75th_percentile_math                False
admissions__act_scores_75th_percentile_writing             False
admissions__act_scores_midpoint_cumulative                 False
admissions__act_scores_midpoint_english                    False
admissions__act_scores_midpoint_math                       False
admissions__act_scores_midpoint_writing                    False
admissions__sat_scores_25th_percentile_critical_reading    False
admissions__sat_scores_25th_percentile_math                False
admissions__sat_scores_25th_percentile_writing             False
admissions__sat_scores_75

In [34]:
components = 1


pipe = Pipeline([#('imputer', Imputer(strategy='median')),
                 ('scaler', RobustScaler()),
                 ('pca', PCA(n_components=components))])

temp = x.filter(like='_scores_').fillna(0)
pipe.fit(temp)
temp = pipe.transform(temp)

from scipy import stats
for i in range(components):
    print(stats.pearsonr(temp[:,i], y['repayment_rate']))
    
x['custom__scores_PCA'] = temp[:,0] #pearson r: 0.3299

#x.drop(list(x.filter(like='_scores_')), axis=1, inplace=True)

(0.58615401489475116, 0.0)


In [39]:
stats.pearsonr(x['admissions__act_scores_midpoint_cumulative'].fillna(0), y['repayment_rate'])

(0.56670670187854411, 0.0)

In [9]:
bachelors = ['biological', 'communication', 'computer', 'education', 'health', 'history', 'mathematics', 'humanities', 'language', 
'multidiscipline', 'philosophy_religious', 'physical_science', 'psychology', 'social_science', 'visual_performing']
bachelors = ['academics__program_bachelors_{}'.format(i) for i in bachelors]
temp = x.loc[:,bachelors] == 1
x['custom__academics_program_bachelors_good'] = temp.sum(axis=1)

x['custom__academics_program_percentage_health'] = x['academics__program_percentage_health'].replace(1, np.nan)
x['custom__academics_program_bachelors_business_0'] = x.academics__program_bachelors_business_marketing.isin([1]).astype('int')

x['custom__cost'] = x['cost__tuition_out_of_state'].fillna(x['cost__tuition_out_of_state'].mean())
x.drop(list(x.filter(like='cost__')), axis=1, inplace=True)

x['custom__student__share_low_income'] = x['student__share_lowincome_0_30000'].fillna(x['student__share_lowincome_0_30000'].mean())
x['custom__student_pell_grant'] = x['student__students_with_pell_grant'].fillna(x['student__students_with_pell_grant'].mean())

In [10]:
x.replace(['Yes', 'No'], [1,0], inplace=True)
x.replace(['Main campus', 'Not main campus'], [1,0], inplace=True)
#x.drop('school__men_only', axis=1, inplace=True)
#x.drop('school__women_only', axis=1, inplace=True)
#x.drop('school__main_campus', axis=1, inplace=True)
#x.drop('school__religious_affiliation', axis=1, inplace=True)
#x.drop('school__degrees_awarded_highest', axis=1, inplace=True)
#x.drop('school__region_id', axis=1, inplace=True)
#x.drop('school__carnegie_undergrad', axis=1, inplace=True)
#x.drop('school__carnegie_size_setting', axis=1, inplace=True)
#x.drop('school__carnegie_basic', axis=1, inplace=True)
#x.drop('school__state', axis=1, inplace=True)
#x.drop('school__degrees_awarded_predominant', axis=1, inplace=True)
#x.drop('school__institutional_characteristics_level', axis=1, inplace=True)
#x.drop('school__ownership', axis=1, inplace=True)
#x.drop('school__online_only', axis=1, inplace=True)                                          
#x.drop('school__locale', axis=1, inplace=True)
#x.drop('academics__program_bachelors_business_marketing', axis=1, inplace=True)
#x.drop(list(x.filter(like='academcics__program_bach')), axis=1, inplace=True)
x.drop(list(x.filter(like='academics__program_as')), axis=1, inplace=True)
x.drop(list(x.filter(like='academics__program_cert')), axis=1, inplace=True)
#x.drop(list(x.filter(like='academics__program_percentage')), axis=1, inplace=True)

In [11]:
df = pd.get_dummies(x, dummy_na=True)

In [12]:
print(x.shape)
print(df.shape)

(8705, 289)
(8705, 505)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(df, y['repayment_rate'], random_state=80) 

In [14]:
pipe = Pipeline([('imputer', Imputer(strategy='median')),
                 ('regressor', XGBRegressor(n_estimators=1000, max_depth=8, nthread=4))])
                      
pipe.fit(X_train, y_train)

print('Training Set Score: {:.3f}'.format(pipe.score(X_train, y_train)))
print('Test Set Score: {:.3f}'.format(pipe.score(X_test, y_test)))
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, pipe.predict(X_test)))))

KeyboardInterrupt: 

In [None]:
pipe = Pipeline([('imputer', Imputer(strategy='median')),
                 ('regressor', XGBRegressor(n_estimators=1000, nthread=4))])
                      

param_grid = {'regressor__max_depth':[8, 10, 13, 15]}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)

grid.fit(X_train, y_train)

print('Best CV Accuracy: {:.3f}'.format(grid.best_score_))
print('Training Set Score: {:.3f}'.format(grid.score(X_train, y_train)))
print('Test Set Score: {:.3f}'.format(grid.score(X_test, y_test)))
print('Best_Parameters: {}'.format(grid.best_params_))
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, grid.predict(X_test)))))