# Predictive Analytics (Regression) scenario involving Student Income

This is a predictive analytics (regression) example where we are predicting student income based on various characterists from Student Income database. 

## Data Transformation


### Import Libraries

In [1]:
# Import and alias Pandas
import pandas as pd
import numpy as np
import matplotlib

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import display, HTML
from sklearn import preprocessing

from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy import stats
from sklearn import model_selection

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


Code below to load up the training dataset. You can store it into a variable called df:

In [2]:
# .. importing csv

data_train = pd.read_csv("data-release/data/final/public/train_values.csv")
label_train = pd.read_csv("data-release/data/final/public/train_labels.csv")
data_test = pd.read_csv("data-release/data/final/public/test_values.csv")

df = data_train
df['income'] = label_train['income']

#display(df.head(50))


In [3]:
#display(data_test.count().sort_values(ascending=False))

In [4]:
display(data_test.sum().sort_values(ascending=False))

report_year                                                                    year_fyear_wyear_ayear_ayear_fyear_ayear_zyear...
school__state                                                                  dfyryavvibwwporslpqimdfypxvhbtshiidwahhhbtvviv...
school__ownership                                                              PublicPublicPrivate nonprofitPrivate for-profi...
school__main_campus                                                            Main campusMain campusMain campusMain campusNo...
school__degrees_awarded_highest                                                Graduate degreeGraduate degreeBachelor's degre...
school__region_id                                                              Far West (AK, CA, HI, NV, OR, WA)Mid East (DE,...
school__degrees_awarded_predominant                                            Entirely graduate-degree grantingPredominantly...
school__institutional_characteristics_level                                    4-year4-year4-year

Listing all features sorted by how many NULL values they contain

In [5]:
display(data_test.isnull().sum().sort_values())

row_id                                                                            0
report_year                                                                       0
school__degrees_awarded_highest                                                   0
school__degrees_awarded_predominant                                               0
school__degrees_awarded_predominant_recoded                                       0
school__institutional_characteristics_level                                       0
school__main_campus                                                               0
school__ownership                                                                 0
school__state                                                                     0
school__region_id                                                                 0
student__demographics_age_entry                                                  23
student__share_independent_students                                         

### Data Tranformation in Training and Validation Set

In [78]:
#display(data_train.shape)
#display(data_train[['student__demographics_first_generation','student__share_firstgeneration']])

df['cost__tuition_program_year'].fillna(df['cost__tuition_program_year'].mean(), inplace=True)
df['cost__tuition_in_state'].fillna(df['cost__tuition_program_year'], inplace=True)
df['cost__tuition_out_of_state'].fillna(df['cost__tuition_program_year'], inplace=True)
df['school__faculty_salary'].fillna(df['school__faculty_salary'].mean(), inplace=True)
df['student__size'].fillna(df['student__size'].mean(), inplace=True)
df['school__tuition_revenue_per_fte'].fillna(df['school__tuition_revenue_per_fte'].mean(), inplace=True)
df['student__demographics_age_entry'].fillna(df['student__demographics_age_entry'].mean(), inplace=True)
df['school__ft_faculty_rate'].fillna(df['school__ft_faculty_rate'].mean(), inplace=True)



#df['school__ft_faculty_rate'].fillna(df['school__ft_faculty_rate'].mean(), inplace=True)
#df['student__demographics_age_entry'].fillna(df['student__demographics_age_entry'].mean(), inplace=True)
#df['student__share_firstgeneration_parents_highschool'].fillna(df['student__share_firstgeneration_parents_highschool'].mean(), inplace=True)

#df['school__instructional_expenditure_per_fte'].fillna(df['school__instructional_expenditure_per_fte'].mean(), inplace=True)
#df['student__size'].fillna(df['student__size'].mean(), inplace=True)    

#df['admissions__act_scores_25th_percentile_cumulative'].fillna(df['admissions__act_scores_25th_percentile_cumulative'].mean(), inplace=True)
#df['admissions__act_scores_75th_percentile_cumulative'].fillna(df['admissions__act_scores_75th_percentile_cumulative'].mean(), inplace=True)
#df['admissions__act_scores_midpoint_cumulative'].fillna(df['admissions__act_scores_midpoint_cumulative'].mean(), inplace=True)
#df['admissions__admission_rate_by_ope_id'].fillna(df['admissions__admission_rate_by_ope_id'].mean(), inplace=True)
#df['admissions__sat_scores_average_overall'].fillna(df['admissions__sat_scores_average_overall'].mean(), inplace=True)
#df['admissions__sat_scores_midpoint_math'].fillna(df['admissions__sat_scores_midpoint_math'].mean(), inplace=True)



#df.report_year = df.report_year.astype("category",
#  categories=['year_a', 'year_f', 'year_z', 'year_w']
#).cat.codes


#df.school__region_id = df.school__region_id.astype("category",
#  categories=['Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)',
#       'New England (CT, ME, MA, NH, RI, VT)',
#       'Plains (IA, KS, MN, MO, NE, ND, SD)',
#       'Mid East (DE, DC, MD, NJ, NY, PA)',
#       'Rocky Mountains (CO, ID, MT, UT, WY)',
#       'Far West (AK, CA, HI, NV, OR, WA)',
#       'Great Lakes (IL, IN, MI, OH, WI)', 'Southwest (AZ, NM, OK, TX)',
#       'Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)']
#).cat.codes

#df.school__ownership = df.school__ownership.astype("category",
#  categories=['Private nonprofit', 'Private for-profit', 'Public']
#).cat.codes

#ordered_school__institutional_characteristics_level = ['Less-than-2-year','2-year','4-year']
#df.school__institutional_characteristics_level = df.school__institutional_characteristics_level.astype("category",
#  categories=ordered_school__institutional_characteristics_level
#).cat.codes


##ordered_school__degrees_awarded_highest = ['Non-degree-granting', 'Certificate degree','Associate degree',
 #                                          "Bachelor's degree", 'Graduate degree']
#df.school__degrees_awarded_highest = df.school__degrees_awarded_highest.astype("category",
#  categories=ordered_school__degrees_awarded_highest
#).cat.codes



## Feature Selection

In [79]:
#display(df.head(50))


#print df.isnull().sum()

#print data_test.isnull().sum()

#display(df.head(12))
#display(data_test.head(12))


In [80]:
#display(df.sum().sort_values(ascending=False))


In [81]:
#display(data_test.sum().sort_values(ascending=False))

In [82]:
df = df[[
#        'admissions__act_scores_25th_percentile_math',
        'admissions__act_scores_midpoint_math',
#        'admissions__sat_scores_25th_percentile_math',
#        'admissions__sat_scores_midpoint_math',
        'admissions__act_scores_75th_percentile_math',
        'school__degrees_awarded_highest',
        'admissions__sat_scores_75th_percentile_math',
#        'admissions__act_scores_25th_percentile_cumulative',
        'admissions__sat_scores_average_by_ope_id',
        'admissions__sat_scores_average_overall',
        'school__faculty_salary',
#        'admissions__sat_scores_25th_percentile_writing',
        'admissions__act_scores_midpoint_cumulative',
#        'admissions__sat_scores_midpoint_writing',
#        'admissions__act_scores_25th_percentile_english',                                     
        'student__share_firstgeneration_parents_somecollege',
        'admissions__act_scores_75th_percentile_cumulative',
#        'admissions__sat_scores_75th_percentile_writing',
#        'admissions__sat_scores_25th_percentile_critical_reading',
        'admissions__act_scores_midpoint_english',
        'cost__tuition_out_of_state',
#        'admissions__sat_scores_midpoint_critical_reading',  
        'admissions__act_scores_75th_percentile_english',
        'admissions__sat_scores_75th_percentile_critical_reading',
        
        'school__region_id',
        'school__tuition_revenue_per_fte',
        'report_year',
        'school__institutional_characteristics_level',
        'student__demographics_dependent',
        'student__demographics_married',
        'student__share_independent_students',
        'school__ownership', 
        'student__size',
        'school__degrees_awarded_predominant',
        'student__share_first_time_full_time',

    
        'cost__tuition_in_state',
        'student__demographics_age_entry',
        'student__demographics_female_share',
        'student__demographics_veteran',
        'student__part_time_share',

        'student__share_firstgeneration',
        'academics__program_percentage_language',
        'academics__program_percentage_humanities',
        'academics__program_percentage_history',
        'academics__program_percentage_health',
        'academics__program_percentage_family_consumer_science',
        'academics__program_percentage_ethnic_cultural_gender',
        'academics__program_percentage_english',
        'academics__program_percentage_engineering_technology',
        'academics__program_percentage_education',
        'academics__program_percentage_construction',
        'academics__program_percentage_computer',
        'academics__program_percentage_communications_technology',
        'academics__program_percentage_communication',
        'academics__program_percentage_business_marketing',
        'academics__program_percentage_biological',
        'academics__program_percentage_architecture',
        'academics__program_percentage_engineering',
        'academics__program_percentage_legal',
        'academics__program_percentage_multidiscipline',
        'academics__program_percentage_mechanic_repair_technology',
        'academics__program_percentage_visual_performing',
        'academics__program_percentage_transportation',
        'academics__program_percentage_theology_religious_vocation',
        'academics__program_percentage_social_science',
        'academics__program_percentage_security_law_enforcement',
        'academics__program_percentage_mathematics',
        'academics__program_percentage_resources',
        'academics__program_percentage_science_technology',
        'academics__program_percentage_agriculture',
        'academics__program_percentage_precision_production',
        'academics__program_percentage_physical_science',
        'academics__program_percentage_philosophy_religious',
        'academics__program_percentage_personal_culinary',
        'academics__program_percentage_parks_recreation_fitness',
        'academics__program_percentage_public_administration_social_service',
        'academics__program_percentage_psychology',
    
    
        'school__instructional_expenditure_per_fte',
        'income']]



#corr_matrix = df.corr()
#display(corr_matrix['income'].sort_values(ascending=False))


### Remove Outliers using ZScores

In [83]:
df.fillna(0, inplace=True)
print df.shape

df.drop_duplicates(inplace=True)
print df.shape

#df = df[(np.abs(stats.zscore(df)) < 5).all(axis=1)]
df = df[(np.abs(stats.zscore(df.income)) < 4.5)]

print df.shape
df = df[(np.abs(stats.zscore(df.school__faculty_salary)) < 3.5)]
print df.shape

df = df[(np.abs(stats.zscore(df.school__tuition_revenue_per_fte)) < 3.5)]
print df.shape

df = pd.get_dummies(df)
print df.shape



#print df.describe()


(17107, 70)
(17012, 70)
(16913, 70)
(16801, 70)
(16768, 70)
(16768, 93)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.



### Separating Training and Testing datasets


In [84]:
# from sklearn.cross_validation import train_test_split 

X_train, X_test, y_train, y_test = model_selection.train_test_split( df, df['income'], test_size=0.005)


display(X_train.head())

Unnamed: 0,admissions__act_scores_midpoint_math,admissions__act_scores_75th_percentile_math,admissions__sat_scores_75th_percentile_math,admissions__sat_scores_average_by_ope_id,admissions__sat_scores_average_overall,school__faculty_salary,admissions__act_scores_midpoint_cumulative,student__share_firstgeneration_parents_somecollege,admissions__act_scores_75th_percentile_cumulative,admissions__act_scores_midpoint_english,cost__tuition_out_of_state,admissions__act_scores_75th_percentile_english,admissions__sat_scores_75th_percentile_critical_reading,school__tuition_revenue_per_fte,student__demographics_dependent,student__demographics_married,student__share_independent_students,student__size,student__share_first_time_full_time,school__ft_faculty_rate,cost__tuition_in_state,student__demographics_age_entry,student__demographics_female_share,student__demographics_veteran,student__part_time_share,student__share_firstgeneration,academics__program_percentage_language,academics__program_percentage_humanities,academics__program_percentage_history,academics__program_percentage_health,academics__program_percentage_family_consumer_science,academics__program_percentage_ethnic_cultural_gender,academics__program_percentage_english,academics__program_percentage_engineering_technology,academics__program_percentage_education,academics__program_percentage_construction,academics__program_percentage_computer,academics__program_percentage_communications_technology,academics__program_percentage_communication,academics__program_percentage_business_marketing,academics__program_percentage_biological,academics__program_percentage_architecture,academics__program_percentage_engineering,academics__program_percentage_legal,academics__program_percentage_multidiscipline,academics__program_percentage_mechanic_repair_technology,academics__program_percentage_visual_performing,academics__program_percentage_transportation,academics__program_percentage_theology_religious_vocation,academics__program_percentage_social_science,academics__program_percentage_security_law_enforcement,academics__program_percentage_mathematics,academics__program_percentage_resources,academics__program_percentage_science_technology,academics__program_percentage_agriculture,academics__program_percentage_precision_production,academics__program_percentage_physical_science,academics__program_percentage_philosophy_religious,academics__program_percentage_personal_culinary,academics__program_percentage_parks_recreation_fitness,academics__program_percentage_public_administration_social_service,academics__program_percentage_psychology,school__instructional_expenditure_per_fte,income,school__degrees_awarded_highest_Associate degree,school__degrees_awarded_highest_Bachelor's degree,school__degrees_awarded_highest_Certificate degree,school__degrees_awarded_highest_Graduate degree,school__degrees_awarded_highest_Non-degree-granting,"school__region_id_Far West (AK, CA, HI, NV, OR, WA)","school__region_id_Great Lakes (IL, IN, MI, OH, WI)","school__region_id_Mid East (DE, DC, MD, NJ, NY, PA)","school__region_id_New England (CT, ME, MA, NH, RI, VT)","school__region_id_Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)","school__region_id_Plains (IA, KS, MN, MO, NE, ND, SD)","school__region_id_Rocky Mountains (CO, ID, MT, UT, WY)","school__region_id_Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)","school__region_id_Southwest (AZ, NM, OK, TX)",report_year_year_a,report_year_year_f,report_year_year_w,report_year_year_z,school__institutional_characteristics_level_2-year,school__institutional_characteristics_level_4-year,school__institutional_characteristics_level_Less-than-2-year,school__ownership_Private for-profit,school__ownership_Private nonprofit,school__ownership_Public,school__degrees_awarded_predominant_Entirely graduate-degree granting,school__degrees_awarded_predominant_Not classified,school__degrees_awarded_predominant_Predominantly associate's-degree granting,school__degrees_awarded_predominant_Predominantly bachelor's-degree granting,school__degrees_awarded_predominant_Predominantly certificate-degree granting
1917,0.0,0.0,660.0,1209.0,1209.0,6961.0,27.0,0.814184,29.0,0.0,33090.0,0.0,650.0,17627.0,0.843621,0.074074,0.156379,2324.0,0.9787,0.5872,33090.0,21.607682,0.589849,0.020576,0.0357,0.185816,0.0264,0.0,0.0445,0.0,0.0,0.0132,0.0692,0.0,0.0,0.0,0.0049,0.0,0.084,0.2521,0.0939,0.0,0.0,0.0,0.0346,0.0,0.1153,0.0,0.0,0.1005,0.0,0.0231,0.0033,0.0,0.0,0.0,0.0181,0.0049,0.0,0.0,0.0,0.112,12264.0,45.7,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0
9754,0.0,0.0,0.0,0.0,0.0,8496.0,0.0,0.480263,0.0,0.0,5732.0,0.0,0.0,2128.0,0.44,0.171378,0.56,7610.0,0.0688,1.0,3978.0,27.231289,0.500978,0.0272,0.641,0.519737,0.0004,0.0,0.0,0.4558,0.0,0.0,0.0011,0.0362,0.0168,0.0102,0.045,0.0088,0.0,0.114,0.0,0.0,0.0,0.0,0.0128,0.0391,0.0121,0.0647,0.0,0.0,0.1001,0.0,0.011,0.0015,0.0245,0.0351,0.0,0.0,0.011,0.0,0.0,0.0,13017.0,31.1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1
9549,0.0,0.0,0.0,0.0,0.0,4693.0,0.0,0.618061,0.0,0.0,43500.0,0.0,0.0,23318.0,0.567919,0.080925,0.432081,997.0,0.0,0.9206,43500.0,24.552023,0.456647,0.040462,0.0,0.381939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8994,0.0,0.0,0.0,5846.0,37.7,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0
15514,0.0,0.0,540.0,980.0,980.0,5741.0,22.0,0.59943,23.0,0.0,17527.0,0.0,510.0,2877.0,0.831895,0.043979,0.168105,2904.0,0.674,0.5932,8277.0,21.203453,0.480477,0.005343,0.0675,0.40057,0.0,0.1749,0.0,0.1443,0.0073,0.0,0.0,0.0437,0.0087,0.0131,0.0539,0.0,0.0117,0.1327,0.0,0.0058,0.0,0.0,0.0015,0.0729,0.0,0.0,0.0,0.0015,0.0219,0.0,0.0379,0.0,0.2434,0.0,0.0,0.0,0.0015,0.0044,0.019,0.0,10437.0,27.8,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0
15338,0.0,0.0,0.0,0.0,0.0,5797.635609,0.0,0.416357,0.0,0.0,11869.0,0.0,0.0,3950.0,0.344411,0.23565,0.655589,59.0,0.0,0.534723,11869.0,28.444109,0.818731,0.0,0.2542,0.583643,0.0,0.0,0.0,0.7313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2687,0.0,0.0,0.0,1809.0,22.1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1


### Limit DataFrame to required features
### Drop Nans

Drop Nans using fillna

In [85]:

#print df.head()
#display(df.head())
#print df.select('school__degrees_awarded_highest')

print ("Converting Nans to Mean")
#df.dropna(axis=0, how='any', inplace=True)
#X_train = X_train.fillna(0)
#X_test = X_test.fillna(0)
#label_train = df['income']
X_train = X_train.drop(labels=['income'],axis=1)
X_test = X_test.drop(labels=['income'],  axis=1)

print X_train.shape
print X_test.shape


Converting Nans to Mean
(16684, 92)
(84, 92)


### Normalize

In [86]:
#print X_train.isnull().sum()

In [87]:
T = preprocessing.RobustScaler().fit(X_train)
X_train = T.transform(X_train)
X_test = T.transform(X_test)


#display(pd.DataFrame(X_train).head())

### Using GridSearch to tune Hyper parameters

In [88]:
#regr_1 = linear_model.LogisticRegression(C=1e5
#reg2_1 = linear_model.LinearRegression()


#regr_1 = DecisionTreeRegressor(max_depth=8)
#regr_2 = DecisionTreeRegressor(max_depth=11)

regr_1 = GradientBoostingRegressor(loss='ls')
#regr_2 = GradientBoostingRegressor(max_depth=11)
#regr_1 = RandomForestRegressor()


#regr_1 = linear_model.BayesianRidge()

#regr_1 = AdaBoostRegressor(n_estimators=100)


params = {
         'n_estimators': [5000],
         'max_depth' : [5],
         'alpha': [0.9] 
            }


clf = model_selection.GridSearchCV(regr_1, params)

clf.fit(X_train, y_train)
#regr_2.fit(X_train, y_train)
print clf.best_estimator_



GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=5000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)


In [89]:
#Reg1
#scores = cross_validation.cross_val_score(clf, X_train, y_train, scoring='neg_mean_squared_error', cv=3).mean()
#print sqrt(abs(scores))


### Calculating MSE

In [90]:


rms = sqrt(mean_squared_error(y_test, clf.predict(X_test)))

display(HTML("<H1>" + str(rms) + "</H1>"))


In [91]:
display(data_test.head())
display(data_test.shape)

Unnamed: 0,row_id,academics__program_assoc_agriculture,academics__program_assoc_architecture,academics__program_assoc_biological,academics__program_assoc_business_marketing,academics__program_assoc_communication,academics__program_assoc_communications_technology,academics__program_assoc_computer,academics__program_assoc_construction,academics__program_assoc_education,academics__program_assoc_engineering,academics__program_assoc_engineering_technology,academics__program_assoc_english,academics__program_assoc_ethnic_cultural_gender,academics__program_assoc_family_consumer_science,academics__program_assoc_health,academics__program_assoc_history,academics__program_assoc_humanities,academics__program_assoc_language,academics__program_assoc_legal,academics__program_assoc_library,academics__program_assoc_mathematics,academics__program_assoc_mechanic_repair_technology,academics__program_assoc_military,academics__program_assoc_multidiscipline,academics__program_assoc_parks_recreation_fitness,academics__program_assoc_personal_culinary,academics__program_assoc_philosophy_religious,academics__program_assoc_physical_science,academics__program_assoc_precision_production,academics__program_assoc_psychology,academics__program_assoc_public_administration_social_service,academics__program_assoc_resources,academics__program_assoc_science_technology,academics__program_assoc_security_law_enforcement,academics__program_assoc_social_science,academics__program_assoc_theology_religious_vocation,academics__program_assoc_transportation,academics__program_assoc_visual_performing,academics__program_bachelors_agriculture,academics__program_bachelors_architecture,academics__program_bachelors_biological,academics__program_bachelors_business_marketing,academics__program_bachelors_communication,academics__program_bachelors_communications_technology,academics__program_bachelors_computer,academics__program_bachelors_construction,academics__program_bachelors_education,academics__program_bachelors_engineering,academics__program_bachelors_engineering_technology,academics__program_bachelors_english,academics__program_bachelors_ethnic_cultural_gender,academics__program_bachelors_family_consumer_science,academics__program_bachelors_health,academics__program_bachelors_history,academics__program_bachelors_humanities,academics__program_bachelors_language,academics__program_bachelors_legal,academics__program_bachelors_library,academics__program_bachelors_mathematics,academics__program_bachelors_mechanic_repair_technology,academics__program_bachelors_military,academics__program_bachelors_multidiscipline,academics__program_bachelors_parks_recreation_fitness,academics__program_bachelors_personal_culinary,academics__program_bachelors_philosophy_religious,academics__program_bachelors_physical_science,academics__program_bachelors_precision_production,academics__program_bachelors_psychology,academics__program_bachelors_public_administration_social_service,academics__program_bachelors_resources,academics__program_bachelors_science_technology,academics__program_bachelors_security_law_enforcement,academics__program_bachelors_social_science,academics__program_bachelors_theology_religious_vocation,academics__program_bachelors_transportation,academics__program_bachelors_visual_performing,academics__program_certificate_lt_1_yr_agriculture,academics__program_certificate_lt_1_yr_architecture,academics__program_certificate_lt_1_yr_biological,academics__program_certificate_lt_1_yr_business_marketing,academics__program_certificate_lt_1_yr_communication,academics__program_certificate_lt_1_yr_communications_technology,academics__program_certificate_lt_1_yr_computer,academics__program_certificate_lt_1_yr_construction,academics__program_certificate_lt_1_yr_education,academics__program_certificate_lt_1_yr_engineering,academics__program_certificate_lt_1_yr_engineering_technology,academics__program_certificate_lt_1_yr_english,academics__program_certificate_lt_1_yr_ethnic_cultural_gender,academics__program_certificate_lt_1_yr_family_consumer_science,academics__program_certificate_lt_1_yr_health,academics__program_certificate_lt_1_yr_history,academics__program_certificate_lt_1_yr_humanities,academics__program_certificate_lt_1_yr_language,academics__program_certificate_lt_1_yr_legal,academics__program_certificate_lt_1_yr_library,academics__program_certificate_lt_1_yr_mathematics,academics__program_certificate_lt_1_yr_mechanic_repair_technology,academics__program_certificate_lt_1_yr_military,academics__program_certificate_lt_1_yr_multidiscipline,academics__program_certificate_lt_1_yr_parks_recreation_fitness,academics__program_certificate_lt_1_yr_personal_culinary,academics__program_certificate_lt_1_yr_philosophy_religious,academics__program_certificate_lt_1_yr_physical_science,academics__program_certificate_lt_1_yr_precision_production,academics__program_certificate_lt_1_yr_psychology,academics__program_certificate_lt_1_yr_public_administration_social_service,academics__program_certificate_lt_1_yr_resources,academics__program_certificate_lt_1_yr_science_technology,academics__program_certificate_lt_1_yr_security_law_enforcement,academics__program_certificate_lt_1_yr_social_science,academics__program_certificate_lt_1_yr_theology_religious_vocation,academics__program_certificate_lt_1_yr_transportation,academics__program_certificate_lt_1_yr_visual_performing,academics__program_certificate_lt_2_yr_agriculture,academics__program_certificate_lt_2_yr_architecture,academics__program_certificate_lt_2_yr_biological,academics__program_certificate_lt_2_yr_business_marketing,academics__program_certificate_lt_2_yr_communication,academics__program_certificate_lt_2_yr_communications_technology,academics__program_certificate_lt_2_yr_computer,academics__program_certificate_lt_2_yr_construction,academics__program_certificate_lt_2_yr_education,academics__program_certificate_lt_2_yr_engineering,academics__program_certificate_lt_2_yr_engineering_technology,academics__program_certificate_lt_2_yr_english,academics__program_certificate_lt_2_yr_ethnic_cultural_gender,academics__program_certificate_lt_2_yr_family_consumer_science,academics__program_certificate_lt_2_yr_health,academics__program_certificate_lt_2_yr_history,academics__program_certificate_lt_2_yr_humanities,academics__program_certificate_lt_2_yr_language,academics__program_certificate_lt_2_yr_legal,academics__program_certificate_lt_2_yr_library,academics__program_certificate_lt_2_yr_mathematics,academics__program_certificate_lt_2_yr_mechanic_repair_technology,academics__program_certificate_lt_2_yr_military,academics__program_certificate_lt_2_yr_multidiscipline,academics__program_certificate_lt_2_yr_parks_recreation_fitness,academics__program_certificate_lt_2_yr_personal_culinary,academics__program_certificate_lt_2_yr_philosophy_religious,academics__program_certificate_lt_2_yr_physical_science,academics__program_certificate_lt_2_yr_precision_production,academics__program_certificate_lt_2_yr_psychology,academics__program_certificate_lt_2_yr_public_administration_social_service,academics__program_certificate_lt_2_yr_resources,academics__program_certificate_lt_2_yr_science_technology,academics__program_certificate_lt_2_yr_security_law_enforcement,academics__program_certificate_lt_2_yr_social_science,academics__program_certificate_lt_2_yr_theology_religious_vocation,academics__program_certificate_lt_2_yr_transportation,academics__program_certificate_lt_2_yr_visual_performing,academics__program_certificate_lt_4_yr_agriculture,academics__program_certificate_lt_4_yr_architecture,academics__program_certificate_lt_4_yr_biological,academics__program_certificate_lt_4_yr_business_marketing,academics__program_certificate_lt_4_yr_communication,academics__program_certificate_lt_4_yr_communications_technology,academics__program_certificate_lt_4_yr_computer,academics__program_certificate_lt_4_yr_construction,academics__program_certificate_lt_4_yr_education,academics__program_certificate_lt_4_yr_engineering,academics__program_certificate_lt_4_yr_engineering_technology,academics__program_certificate_lt_4_yr_english,academics__program_certificate_lt_4_yr_ethnic_cultural_gender,academics__program_certificate_lt_4_yr_family_consumer_science,academics__program_certificate_lt_4_yr_health,academics__program_certificate_lt_4_yr_history,academics__program_certificate_lt_4_yr_humanities,academics__program_certificate_lt_4_yr_language,academics__program_certificate_lt_4_yr_legal,academics__program_certificate_lt_4_yr_library,academics__program_certificate_lt_4_yr_mathematics,academics__program_certificate_lt_4_yr_mechanic_repair_technology,academics__program_certificate_lt_4_yr_military,academics__program_certificate_lt_4_yr_multidiscipline,academics__program_certificate_lt_4_yr_parks_recreation_fitness,academics__program_certificate_lt_4_yr_personal_culinary,academics__program_certificate_lt_4_yr_philosophy_religious,academics__program_certificate_lt_4_yr_physical_science,academics__program_certificate_lt_4_yr_precision_production,academics__program_certificate_lt_4_yr_psychology,academics__program_certificate_lt_4_yr_public_administration_social_service,academics__program_certificate_lt_4_yr_resources,academics__program_certificate_lt_4_yr_science_technology,academics__program_certificate_lt_4_yr_security_law_enforcement,academics__program_certificate_lt_4_yr_social_science,academics__program_certificate_lt_4_yr_theology_religious_vocation,academics__program_certificate_lt_4_yr_transportation,academics__program_certificate_lt_4_yr_visual_performing,academics__program_percentage_agriculture,academics__program_percentage_architecture,academics__program_percentage_biological,academics__program_percentage_business_marketing,academics__program_percentage_communication,academics__program_percentage_communications_technology,academics__program_percentage_computer,academics__program_percentage_construction,academics__program_percentage_education,academics__program_percentage_engineering,academics__program_percentage_engineering_technology,academics__program_percentage_english,academics__program_percentage_ethnic_cultural_gender,academics__program_percentage_family_consumer_science,academics__program_percentage_health,academics__program_percentage_history,academics__program_percentage_humanities,academics__program_percentage_language,academics__program_percentage_legal,academics__program_percentage_library,academics__program_percentage_mathematics,academics__program_percentage_mechanic_repair_technology,academics__program_percentage_military,academics__program_percentage_multidiscipline,academics__program_percentage_parks_recreation_fitness,academics__program_percentage_personal_culinary,academics__program_percentage_philosophy_religious,academics__program_percentage_physical_science,academics__program_percentage_precision_production,academics__program_percentage_psychology,academics__program_percentage_public_administration_social_service,academics__program_percentage_resources,academics__program_percentage_science_technology,academics__program_percentage_security_law_enforcement,academics__program_percentage_social_science,academics__program_percentage_theology_religious_vocation,academics__program_percentage_transportation,academics__program_percentage_visual_performing,admissions__act_scores_25th_percentile_cumulative,admissions__act_scores_25th_percentile_english,admissions__act_scores_25th_percentile_math,admissions__act_scores_25th_percentile_writing,admissions__act_scores_75th_percentile_cumulative,admissions__act_scores_75th_percentile_english,admissions__act_scores_75th_percentile_math,admissions__act_scores_75th_percentile_writing,admissions__act_scores_midpoint_cumulative,admissions__act_scores_midpoint_english,admissions__act_scores_midpoint_math,admissions__act_scores_midpoint_writing,admissions__admission_rate_by_ope_id,admissions__admission_rate_overall,admissions__sat_scores_25th_percentile_critical_reading,admissions__sat_scores_25th_percentile_math,admissions__sat_scores_25th_percentile_writing,admissions__sat_scores_75th_percentile_critical_reading,admissions__sat_scores_75th_percentile_math,admissions__sat_scores_75th_percentile_writing,admissions__sat_scores_average_by_ope_id,admissions__sat_scores_average_overall,admissions__sat_scores_midpoint_critical_reading,admissions__sat_scores_midpoint_math,admissions__sat_scores_midpoint_writing,completion__completion_cohort_4yr_100nt,completion__completion_cohort_less_than_4yr_100nt,completion__completion_rate_4yr_100nt,completion__completion_rate_less_than_4yr_100nt,completion__transfer_rate_4yr_full_time,completion__transfer_rate_cohort_4yr_full_time,completion__transfer_rate_cohort_less_than_4yr_full_time,completion__transfer_rate_less_than_4yr_full_time,cost__tuition_in_state,cost__tuition_out_of_state,cost__tuition_program_year,report_year,school__degrees_awarded_highest,school__degrees_awarded_predominant,school__degrees_awarded_predominant_recoded,school__faculty_salary,school__ft_faculty_rate,school__institutional_characteristics_level,school__instructional_expenditure_per_fte,school__main_campus,school__online_only,school__ownership,school__region_id,school__state,school__tuition_revenue_per_fte,student__demographics_age_entry,student__demographics_dependent,student__demographics_female_share,student__demographics_first_generation,student__demographics_married,student__demographics_veteran,student__part_time_share,student__retention_rate_four_year_full_time,student__retention_rate_four_year_part_time,student__retention_rate_lt_four_year_full_time,student__retention_rate_lt_four_year_part_time,student__share_25_older,student__share_first_time_full_time,student__share_firstgeneration,student__share_firstgeneration_parents_highschool,student__share_firstgeneration_parents_middleschool,student__share_firstgeneration_parents_somecollege,student__share_independent_students,student__size
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,year_f,Graduate degree,Entirely graduate-degree granting,3,14400.0,0.3086,4-year,14722.0,Main campus,Not distance-education only,Public,"Far West (AK, CA, HI, NV, OR, WA)",dfy,21910.0,,,,,,,,,,,,,,,,,,,
1,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,year_w,Graduate degree,Predominantly bachelor's-degree granting,3,9255.0,0.4162,4-year,39001.0,Main campus,,Public,"Mid East (DE, DC, MD, NJ, NY, PA)",rya,14344.0,28.960983,0.210983,0.731214,0.370492,0.231214,0.020231,0.3605,,,,,0.7533,,0.370492,0.321311,,0.629508,0.789017,846.0
2,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,year_a,Bachelor's degree,Predominantly bachelor's-degree granting,3,7992.0,1.0,4-year,10238.0,Main campus,Not distance-education only,Private nonprofit,"Great Lakes (IL, IN, MI, OH, WI)",vvi,22322.0,28.155063,0.275316,0.876582,0.375887,0.193038,,0.2966,,,,,,,0.375887,0.315603,,0.624113,0.724684,354.0
3,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9853,0.0,0.0,0.0,0.0,0.0,0.0,0.0147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,1.0,1.0,,,,,,,,,,,,,56.0,,0.892857,,,56.0,0.0,,,31775.0,year_a,Certificate degree,Predominantly certificate-degree granting,1,,,Less-than-2-year,8150.0,Main campus,Not distance-education only,Private for-profit,"Southwest (AZ, NM, OK, TX)",bww,10362.0,26.335766,0.386861,,0.504132,0.240876,0.087591,0.0,,,0.9032,,,,0.504132,,,0.495868,0.613139,148.0
4,11,,,,1.0,,,,,,0.0,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,2.0,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,,1.0,,,,,,,,,,,,,,0.0,,,,,0.0,,,,,,,,,,,2.0,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,0.0,,,,,,,,,,,,,,,,,,1.0,,0.600032,0.6166,0.598959,0.866285,0.476323,0.179234,0.431844,0.419305,0.268399,0.056429,0.944354,0.820691,0.222606,0.289575,0.340027,0.126453,0.328525,0.361567,0.775555,,0.156528,0.427337,,0.144051,0.75867,0.339813,0.83949,0.575044,0.740126,0.353023,0.365551,0.134723,0.277803,0.395279,0.108393,0.58622,0.060568,0.509469,23.0,,,,,,,,,,,,0.193662,0.69758,,,,,,,,,,,,,1971.0,0.780145,0.165728,0.657008,,,0.202625,15888.0,29467.0,,year_f,Associate degree,Not classified,3,9080.0,0.991896,Less-than-2-year,,Not main campus,Not distance-education only,Private for-profit,"Great Lakes (IL, IN, MI, OH, WI)",por,10182.0,32.464487,0.47019,0.039706,0.770557,0.194266,0.069072,0.056924,0.937559,0.354656,0.974608,0.55733,0.308937,0.367908,0.69346,0.456512,,0.554882,0.218107,35004.0


(9192, 298)

### Data Transformation for Test Data Set

In [92]:
data_test['cost__tuition_program_year'].fillna(data_test['cost__tuition_program_year'].mean(), inplace=True)
data_test['cost__tuition_in_state'].fillna(data_test['cost__tuition_program_year'], inplace=True)
data_test['cost__tuition_out_of_state'].fillna(data_test['cost__tuition_program_year'], inplace=True)
data_test['school__faculty_salary'].fillna(data_test['school__faculty_salary'].mean(), inplace=True)
data_test['student__size'].fillna(data_test['student__size'].mean(), inplace=True)
data_test['school__tuition_revenue_per_fte'].fillna(data_test['school__tuition_revenue_per_fte'].mean(), inplace=True)
data_test['student__demographics_age_entry'].fillna(data_test['student__demographics_age_entry'].mean(), inplace=True)

#data_test['school__ft_faculty_rate'].fillna(data_test['school__ft_faculty_rate'].mean(), inplace=True)
#data_test['student__demographics_age_entry'].fillna(data_test['student__demographics_age_entry'].mean(), inplace=True)
#data_test['student__share_firstgeneration_parents_highschool'].fillna(data_test['student__share_firstgeneration_parents_highschool'].mean(), inplace=True)

#data_test['school__instructional_expenditure_per_fte'].fillna(data_test['school__instructional_expenditure_per_fte'].mean(), inplace=True)



#data_test.report_year = data_test.report_year.astype("category",
#  categories=['year_a', 'year_f', 'year_z', 'year_w']
#).cat.codes

#data_test.school__region_id = data_test.school__region_id.astype("category",
#  categories=['Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)',
#       'New England (CT, ME, MA, NH, RI, VT)',
#       'Plains (IA, KS, MN, MO, NE, ND, SD)',
#       'Mid East (DE, DC, MD, NJ, NY, PA)',
#       'Rocky Mountains (CO, ID, MT, UT, WY)',
#       'Far West (AK, CA, HI, NV, OR, WA)',
#       'Great Lakes (IL, IN, MI, OH, WI)', 'Southwest (AZ, NM, OK, TX)',
#       'Outlying Areas (AS, FM, GU, MH, MP, PR, PW, VI)']
#).cat.codes


#data_test.school__ownership = data_test.school__ownership.astype("category",
#  categories=['Private nonprofit', 'Private for-profit', 'Public']
#).cat.codes





#data_test.school__institutional_characteristics_level = data_test.school__institutional_characteristics_level.astype("category",
#  categories=ordered_school__institutional_characteristics_level
#).cat.codes

#data_test.school__degrees_awarded_highest = data_test.school__degrees_awarded_highest.astype("category",
#  categories=ordered_school__degrees_awarded_highest
#).cat.codes

In [93]:
data_test_reduced = data_test[[
#        'admissions__act_scores_25th_percentile_math',
        'admissions__act_scores_midpoint_math',
#        'admissions__sat_scores_25th_percentile_math',
#        'admissions__sat_scores_midpoint_math',
        'admissions__act_scores_75th_percentile_math',
        'school__degrees_awarded_highest',
        'admissions__sat_scores_75th_percentile_math',
#        'admissions__act_scores_25th_percentile_cumulative',
        'admissions__sat_scores_average_by_ope_id',
        'admissions__sat_scores_average_overall',
        'school__faculty_salary',
#        'admissions__sat_scores_25th_percentile_writing',
        'admissions__act_scores_midpoint_cumulative',
#        'admissions__sat_scores_midpoint_writing',
#        'admissions__act_scores_25th_percentile_english',                                     
        'student__share_firstgeneration_parents_somecollege',
        'admissions__act_scores_75th_percentile_cumulative',
#        'admissions__sat_scores_75th_percentile_writing',
#        'admissions__sat_scores_25th_percentile_critical_reading',
        'admissions__act_scores_midpoint_english',
        'cost__tuition_out_of_state',
#        'admissions__sat_scores_midpoint_critical_reading',  
        'admissions__act_scores_75th_percentile_english',
        'admissions__sat_scores_75th_percentile_critical_reading',
        
        'school__region_id',
        'school__tuition_revenue_per_fte',
        'report_year',
        'school__institutional_characteristics_level',
        'student__demographics_dependent',
        'student__demographics_married',
        'student__share_independent_students',
        'school__ownership', 
        'student__size',
        'school__degrees_awarded_predominant',
        'student__share_first_time_full_time',

    
        'cost__tuition_in_state',
        'student__demographics_age_entry',
        'student__demographics_female_share',
        'student__demographics_veteran',
        'student__part_time_share',

        'student__share_firstgeneration',
        'academics__program_percentage_language',
        'academics__program_percentage_humanities',
        'academics__program_percentage_history',
        'academics__program_percentage_health',
        'academics__program_percentage_family_consumer_science',
        'academics__program_percentage_ethnic_cultural_gender',
        'academics__program_percentage_english',
        'academics__program_percentage_engineering_technology',
        'academics__program_percentage_education',
        'academics__program_percentage_construction',
        'academics__program_percentage_computer',
        'academics__program_percentage_communications_technology',
        'academics__program_percentage_communication',
        'academics__program_percentage_business_marketing',
        'academics__program_percentage_biological',
        'academics__program_percentage_architecture',
        'academics__program_percentage_engineering',
        'academics__program_percentage_legal',
        'academics__program_percentage_multidiscipline',
        'academics__program_percentage_mechanic_repair_technology',
        'academics__program_percentage_visual_performing',
        'academics__program_percentage_transportation',
        'academics__program_percentage_theology_religious_vocation',
        'academics__program_percentage_social_science',
        'academics__program_percentage_security_law_enforcement',
        'academics__program_percentage_mathematics',
        'academics__program_percentage_resources',
        'academics__program_percentage_science_technology',
        'academics__program_percentage_agriculture',
        'academics__program_percentage_precision_production',
        'academics__program_percentage_physical_science',
        'academics__program_percentage_philosophy_religious',
        'academics__program_percentage_personal_culinary',
        'academics__program_percentage_parks_recreation_fitness',
        'academics__program_percentage_public_administration_social_service',
        'academics__program_percentage_psychology',
    
    
        'school__instructional_expenditure_per_fte'
    
    
        ]]

data_test_reduced = data_test_reduced.fillna(0)

data_test_reduced = pd.get_dummies(data_test_reduced)
print df.shape

data_test_reduced = T.transform(data_test_reduced)



(16768, 93)


### Running Predictions

In [94]:
#predictions = model.predict(data_test_reduced)

predictions = clf.predict(data_test_reduced)

In [95]:
results = pd.DataFrame(data_test['row_id'])
results['income'] = pd.DataFrame(predictions)
results = results.loc[:,['row_id', 'income']]

display(results.head())
print results.shape

Unnamed: 0,row_id,income
0,2,65.68277
1,8,78.182829
2,9,49.882914
3,10,43.598576
4,11,49.728205


(9192, 2)


In [96]:
results.to_csv("data-release/data/final/public/finalresult_17c.csv", index=False)