In [34]:
import os, sqlite3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import LinearSVR, SVC, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(9)

In [2]:
#  functions & pipelines

def score_printer(inst, Xtr, ytr, Xte, yte):
    return print(f" Train score: {inst.score(Xtr, ytr)}, Test Score: {inst.score(Xte, yte)}")

linreg_pipeline = Pipeline([('linreg', LinearRegression())])
lasso_pipeline = Pipeline([('lasso',Lasso(random_state=42))])
ridge_pipeline = Pipeline([('ridge',Ridge(random_state=42))])

In [3]:
processed_top5_feature_df = pd.read_pickle("./data/processed_top5_feature_df.pkl")

In [4]:
processed_top5_feature_df.shape

(34593, 9554)

In [5]:
processed_top25_overall_df = pd.read_pickle("./data/processed_top25_overall_df.pkl")

In [6]:
processed_top25_overall_df.shape

(34593, 27)

# Train_Test_Split

## Top 5 features from each category

In [7]:
X1 = processed_top5_feature_df.drop('COMP_ORIG_YR4_RT', axis=1).copy()
y1 = processed_top5_feature_df['COMP_ORIG_YR4_RT']
X1.shape, y1.shape

((34593, 9553), (34593,))

In [12]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

## Top 25 overall features

In [13]:
X2 = processed_top25_overall_df.drop('COMP_ORIG_YR4_RT', axis=1).copy()
y2 = processed_top25_overall_df['COMP_ORIG_YR4_RT']
X2.shape, y2.shape

((34593, 26), (34593,))

In [14]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Modelling

## Baseline Model - Top 5 features from each category

In [15]:
dr1 = DummyRegressor(strategy='median')
dr1.fit(X_train1, y_train1)
dr1.predict(X_train1)
dr1.score(X_train1, y_train1)

-0.004003885233955451

## Baseline Model - Top 25 overall features

In [16]:
dr2 = DummyRegressor(strategy='median')
dr2.fit(X_train2, y_train2)
dr2.predict(X_train2)
dr2.score(X_train2, y_train2)

-0.004003885233955451

## Simple Basic Model - Top 5 Features from each category

In [19]:
linreg_pipeline.fit(X_train1, y_train1)
score_printer(linreg_pipeline, X_train1, y_train1, X_test1, y_test1)

 Train score: 0.8894434631751102, Test Score: -1664862397.6829908


In [20]:
lasso_pipeline.fit(X_train1, y_train1)
score_printer(lasso_pipeline, X_train1, y_train1, X_test1, y_test1)

 Train score: 0.48826376668175187, Test Score: 0.4922897284143569


In [21]:
ridge_pipeline.fit(X_train1, y_train1)
score_printer(ridge_pipeline, X_train1, y_train1, X_test1, y_test1)

 Train score: 0.8820100610992159, Test Score: 0.810887936333579


## Simple Basic Model - Top 25 overall features

In [22]:
linreg_pipeline.fit(X_train2, y_train2)
score_printer(linreg_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.4274893545979581, Test Score: 0.4295066332831843


In [24]:
lasso_pipeline.fit(X_train2, y_train2)
score_printer(lasso_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.42007227542500736, Test Score: 0.4193902987980843


In [25]:
ridge_pipeline.fit(X_train2, y_train2)
score_printer(ridge_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.42747232956837056, Test Score: 0.42951839897329


The running time for the Simple Basic Models for the Top 5 Features from each category had a longer running time, but provided the better scores. Other models will be used with the top 5 features from each category and see if the score can be improved.


# Trying Different Models

## Linear Support Vector Regression

In [26]:
linsvr = LinearSVR(random_state=42)
linsvr.fit(X_train1, y_train1)



LinearSVR(random_state=42)

In [27]:
score_printer(linsvr, X_train1, y_train1, X_test1, y_test1)

 Train score: -1.4620444748816106, Test Score: -1.559660936625629


## Nu Support Vector Regression

In [28]:
regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1))
regr.fit(X_train1, y_train1)

Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvr', NuSVR(nu=0.1))])

In [29]:
score_printer(regr, X_train1, y_train1, X_test1, y_test1)

 Train score: 0.7944055899547355, Test Score: 0.7237557338565553


## Decision Tree Regressor

In [30]:
dtr = DecisionTreeRegressor(max_depth=10, min_samples_split=20, random_state=10)
dtr.fit(X_train1, y_train1)

DecisionTreeRegressor(max_depth=10, min_samples_split=20, random_state=10)

In [31]:
score_printer(dtr, X_train1, y_train1, X_test1, y_test1)

 Train score: 0.8532561405977047, Test Score: 0.781850015869409


## Decision Tree Regressor w GridSearchCV
## used originally to find best parameters but commented out to avoid running again
The decision tree gave us an amazing train score, but a low test score which indicates that the the model is overfitting. Will use GridsearchCV to see which parameters can be tuned in order to reduce overfitting and create a better model and then reapplied to the model to see how the train and test score perform.

In [65]:
# dtr = DecisionTreeRegressor(random_state=10)

In [70]:
# param_grid = {
#     'max_depth': [1, 2, 5, 10],
#     'min_samples_split': [5, 10, 20]
# }

In [74]:
# gs_tree = GridSearchCV(dtr, param_grid, cv=3)
# gs_tree.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeRegressor(random_state=10),
             param_grid={'max_depth': [1, 2, 5, 10],
                         'min_samples_split': [5, 10, 20]})

In [73]:
gs_tree.best_score_, gs_tree.best_params_

(0.7719798561970207, {'max_depth': 10, 'min_samples_split': 20})

## Random Forest Regressor

In [36]:
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train1, y_train1)

RandomForestRegressor(random_state=42)

In [37]:
score_printer(rfr, X_train1, y_train1, X_test1, y_test1)

 Train score: 0.9769383850260506, Test Score: 0.8271455544689305


The Ridge regression model gave us the best train and test scores, but was overfitting. The random forest regressot gave us better scores, but was more overfitting that the ridge lasso because the ridge model addresses some of the problems of Ordinary Least Squares by imposing a penalty on the size of the coefficients.

# The below was a different approach where I layered in features and/or categories to see if there was one that would significantly improve the model's score. However, the score never significantly improved.

# Modelling Approach  2: 
## Models w top feature from each category
This approach added the top feature from each of the 10 categories

In [12]:
top_features = ['ZIP_60616-3878', 'SATMTMID', 'PCIP45', 'CIP51BACHL_1', 'UGDS_NRA', 'TUITIONFEE_OUT', 'CUML_DEBT_P10', 'COMP_ORIG_YR2_RT','MD_INC_RPY_1YR_RT','pct10_earn_wne_p10']

In [13]:
X2 = unpickled_df[top_features]
Y2 = unpickled_df['COMP_ORIG_YR4_RT']

In [14]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.2, random_state=42)

In [15]:
linreg_pipeline.fit(X_train2, y_train2)
score_printer(linreg_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.6313544569849286, Test Score: 0.6177837185384083


In [16]:
lasso_pipeline.fit(X_train2, y_train2)
score_printer(lasso_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.4348021530246138, Test Score: 0.4281706225146684


In [17]:
ridge_pipeline.fit(X_train2, y_train2)
score_printer(ridge_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.6313461414048559, Test Score: 0.6177964386509815


# Model 2A:
## add 2nd top feature from each category

In [84]:
second_top_feature = ['ZIP_97230-3099', 'SATMT25', 'PCIP54', 'CIP26BACHL_1', 'UGDS_ASIAN', 'COSTT4_A', 'CUML_DEBT_P25', 'LOAN_COMP_ORIG_YR2_RT', 'HI_INC_RPY_3YR_RT', 'mn_earn_wne_male1_p10']

In [85]:
first_second_features = top_features + second_top_feature

In [86]:
X2a = unpickled_df[first_second_features]
Y2a = unpickled_df['COMP_ORIG_YR4_RT']

In [87]:
X_train2a, X_test2a, y_train2a, y_test2a = train_test_split(X2a, Y2a, test_size=0.2, random_state=42)

In [88]:
linreg_pipeline.fit(X_train2a, y_train2a)
score_printer(linreg_pipeline, X_train2a, y_train2a, X_test2a, y_test2a)

 Train score: 0.6468685176721847, Test Score: 0.6390543556877776


In [89]:
lasso_pipeline.fit(X_train2a, y_train2a)
score_printer(lasso_pipeline, X_train2a, y_train2a, X_test2a, y_test2a)

 Train score: 0.45399978653578243, Test Score: 0.4563298807210854


In [90]:
ridge_pipeline.fit(X_train2a, y_train2a)
score_printer(ridge_pipeline, X_train2a, y_train2a, X_test2a, y_test2a)

 Train score: 0.6467788110047553, Test Score: 0.6390686037316503


# Model 2b
Add 3rd top feature from each category

In [91]:
third_top_feature = ['ZIP_38104-2211', 'SATMT75', 'PCIP23', 'CIP29CERT4_2', 'UGDS_UNKN', 'TUITIONFEE_IN', 'DEBT_MDN', 'COMP_ORIG_YR3_RT', 'HI_INC_RPY_1YR_RT', 'mn_earn_wne_indep0_p6']

In [92]:
first_second_third = top_features + second_top_feature + third_top_feature

In [93]:
X2b = unpickled_df[first_second_features]
Y2b = unpickled_df['COMP_ORIG_YR4_RT']

In [94]:
X_train2b, X_test2b, y_train2b, y_test2b = train_test_split(X2b, Y2b, test_size=0.2, random_state=42)

In [95]:
linreg_pipeline.fit(X_train2b, y_train2b)
score_printer(linreg_pipeline, X_train2b, y_train2b, X_test2b, y_test2b)

 Train score: 0.6468685176721847, Test Score: 0.6390543556877776


In [96]:
lasso_pipeline.fit(X_train2b, y_train2b)
score_printer(lasso_pipeline, X_train2b, y_train2b, X_test2b, y_test2b)

 Train score: 0.45399978653578243, Test Score: 0.4563298807210854


In [97]:
ridge_pipeline.fit(X_train2b, y_train2b)
score_printer(ridge_pipeline, X_train2b, y_train2b, X_test2b, y_test2b)

 Train score: 0.6467788110047553, Test Score: 0.6390686037316503


It seems that adding features in this method doesn't significantly improve the score for each model.

# Model Approach 3: 
## Model w features from top 3 categories

In [17]:
top_3_cat_features = ['TUITIONFEE_OUT', 'COSTT4_A', 'TUITIONFEE_IN', 'NPT45_PUB', 'NPT4_75UP_PUB', 'PCIP45', 'PCIP54', 'PCIP23', 'PCIP27', 'PCIP16', 'PCIP45', 'PCIP54', 'PCIP23', 'PCIP27', 'PCIP16', 'ZIP_60616-3878', 'ZIP_97230-3099', 'ZIP_38104-2211', 'ZIP_2115', 'CITY_Seneca Falls']

In [19]:
X3 = unpickled_df[top_3_cat_features]
Y3 = unpickled_df['COMP_ORIG_YR4_RT']

In [21]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, Y3, test_size=0.2, random_state=42)

In [76]:
linreg_pipeline.fit(X_train3, y_train3)
score_printer(linreg_pipeline, X_train3, y_train3, X_test3, y_test3)

 Train score: 0.39927501260539333, Test Score: 0.40106373606238743


In [77]:
lasso_pipeline.fit(X_train3, y_train3)
score_printer(lasso_pipeline, X_train3, y_train3, X_test3, y_test3)

 Train score: 0.3883241639592627, Test Score: 0.390210367617409


In [78]:
ridge_pipeline.fit(X_train3, y_train3)
score_printer(ridge_pipeline, X_train3, y_train3, X_test3, y_test3)

 Train score: 0.39909646024492684, Test Score: 0.4009165638478991


# Model Approach 3a:
## add 4th category

In [99]:
fourth_cat_features = ['CUML_DEBT_P10', 'CUML_DEBT_P25', 'DEBT_MDN', 'CUML_DEBT_P75', 'IND_DEBT_MDN']

In [109]:
top_4_cat_features = top_3_cat_features + fourth_cat_features

In [110]:
X3a = unpickled_df[top_3_cat_features]
Y3a = unpickled_df['COMP_ORIG_YR4_RT']

In [111]:
X_train3a, X_test3a, y_train3a, y_test3a = train_test_split(X3a, Y3a, test_size=0.2, random_state=42)

In [112]:
linreg_pipeline.fit(X_train3a, y_train3a)
score_printer(linreg_pipeline, X_train3a, y_train3a, X_test3a, y_test3a)

 Train score: 0.39927501260539333, Test Score: 0.40106373606238743


In [113]:
lasso_pipeline.fit(X_train3a, y_train3a)
score_printer(lasso_pipeline, X_train3a, y_train3a, X_test3a, y_test3a)

 Train score: 0.3883241639592627, Test Score: 0.390210367617409


In [114]:
ridge_pipeline.fit(X_train3a, y_train3a)
score_printer(ridge_pipeline, X_train3a, y_train3a, X_test3a, y_test3a)

 Train score: 0.39909646024492684, Test Score: 0.4009165638478991


# Model Approach 3b:
## Add 5th category

In [106]:
fifth_cat_features = ['pct10_earn_wne_p10', 'mn_earn_wne_male1_p10', 'mn_earn_wne_indep0_p6', 'mn_earn_wne_inc1_p10', 'pct25_earn_wne_p10']

In [107]:
top_5_cat_features = top_3_cat_features + fourth_cat_features + fifth_cat_features

In [108]:
X3b = unpickled_df[top_3_cat_features]
Y3b = unpickled_df['COMP_ORIG_YR4_RT']

In [115]:
X_train3b, X_test3b, y_train3b, y_test3b = train_test_split(X3b, Y3b, test_size=0.2, random_state=42)

In [121]:
linreg_pipeline.fit(X_train3b, y_train3b)
score_printer(linreg_pipeline, X_train3b, y_train3b, X_test3b, y_test3b)

 Train score: 0.39927501260539333, Test Score: 0.40106373606238743


In [122]:
lasso_pipeline.fit(X_train3a, y_train3a)
score_printer(lasso_pipeline, X_train3a, y_train3b, X_test3b, y_test3b)

 Train score: 0.3883241639592627, Test Score: 0.390210367617409


In [123]:
ridge_pipeline.fit(X_train3b, y_train3b)
score_printer(ridge_pipeline, X_train3b, y_train3b, X_test3b, y_test3b)

 Train score: 0.39909646024492684, Test Score: 0.4009165638478991


no significant improvements with implementing models this way