In [124]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.dummy import DummyRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
from sklearn.impute import KNNImputer

np.random.seed(9)

In [74]:
#  functions & pipelines

def score_printer(inst, Xtr, ytr, Xte, yte):
    return print(f" Train score: {inst.score(Xtr, ytr)}, Test Score: {inst.score(Xte, yte)}")

linreg_pipeline = Pipeline([('linreg', LinearRegression())])
lasso_pipeline = Pipeline([('lasso',Lasso(random_state=42))])
ridge_pipeline = Pipeline([('ridge',Ridge(random_state=42))])

In [2]:
unpickled_df = pd.read_pickle("./data/processed_final_df.pkl")

In [3]:
unpickled_df.shape

(34593, 9564)

In [4]:
unpickled_df.head()

Unnamed: 0,UNITID,SATMTMID,SATMT25,SATMT75,SAT_AVG_ALL,ACTEN25,PCIP45,PCIP54,PCIP23,PCIP27,...,CIP51BACHL_2,CIP26BACHL_1,CIP26BACHL_2,CIP29CERT4_1,CIP29CERT4_2,CIP25CERT4_1,CIP25CERT4_2,CIP10CERT4_1,CIP10CERT4_2,COMP_ORIG_YR4_RT
0,100654.0,417.5,370.0,465.0,850.0,15.0,0.0465,0.0039,0.0058,0.0136,...,0,1,0,0,0,0,0,0,0,0.214286
1,100663.0,570.0,500.0,640.0,1030.0,19.0,0.0435,0.0374,0.0251,0.0049,...,0,1,0,0,0,0,0,0,0,0.385975
2,100690.0,585.0,445.0,725.0,963.5,14.5,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.454545
3,100706.0,575.0,510.0,640.0,1129.0,21.0,0.0246,0.0055,0.0355,0.0041,...,0,1,0,0,0,0,0,0,0,0.236842
4,100724.0,400.0,340.0,460.0,784.0,13.0,0.0172,0.0123,0.0025,0.0098,...,0,1,0,0,0,0,0,0,0,0.117182


# Train_Test_Split

In [5]:
X = unpickled_df.drop('COMP_ORIG_YR4_RT', axis=1).copy()
y = unpickled_df['COMP_ORIG_YR4_RT']
X.shape, y.shape

((34593, 9563), (34593,))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline Model

In [7]:
dr = DummyRegressor(strategy='median')
dr.fit(X_train, y_train)
dr.predict(X_train)
dr.score(X_train, y_train)

-0.004003885233955451

# Model 1: Simple Basic Model

In [82]:
linreg_pipeline.fit(X_train, y_train)
score_printer(linreg_pipeline, X_train, y_train, X_test, y_test)

 Train score: 0.889510058917237, Test Score: 0.8005050761983334


In [83]:
lasso_pipeline.fit(X_train, y_train)
score_printer(lasso_pipeline, X_train, y_train, X_test, y_test)

 Train score: 0.48826376668175187, Test Score: 0.4922897284143569


In [81]:
ridge_pipeline.fit(X_train, y_train)
score_printer(ridge_pipeline, X_train, y_train, X_test, y_test)

 Train score: 0.8820102732543174, Test Score: 0.8108656759129569


The running time is the longest with this, but has provided the best scores. Will Use all the features to continue modelling and adjust parameters to reduce overfitting.

# Model 1a:
## Trying different models

In [None]:
trying the polynomial was too time consuming so paused it.

In [128]:
# poly = PolynomialFeatures(3)
# X_fin = poly.fit_transform(X)

In [None]:
linsvr = LinearSVR(random_state=42,  max_iter=1000000)
linsvr.fit(X_train, y_train)

In [None]:
score_printer(linsvr, X_train, y_train, X_test, y_test)

# Modelling Approach  2: 
## Models w top feature from each category

In [11]:
top_features = ['ZIP_60616-3878', 'SATMTMID', 'PCIP45', 'CIP51BACHL_1', 'UGDS_NRA', 'TUITIONFEE_OUT', 'CUML_DEBT_P10', 'COMP_ORIG_YR2_RT','MD_INC_RPY_1YR_RT','pct10_earn_wne_p10']

In [32]:
X2 = unpickled_df[top_features]
Y2 = unpickled_df['COMP_ORIG_YR4_RT']

In [33]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.2, random_state=42)

In [75]:
linreg_pipeline.fit(X_train2, y_train2)
score_printer(linreg_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.6313544569849286, Test Score: 0.6177837185384083


In [70]:
lasso_pipeline.fit(X_train2, y_train2)
score_printer(lasso_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.4348021530246138, Test Score: 0.4281706225146684


In [69]:
ridge_pipeline.fit(X_train2, y_train2)
score_printer(ridge_pipeline, X_train2, y_train2, X_test2, y_test2)

 Train score: 0.6313461414048559, Test Score: 0.6177964386509815


# Model 2A:
## add 2nd top feature from each category

In [84]:
second_top_feature = ['ZIP_97230-3099', 'SATMT25', 'PCIP54', 'CIP26BACHL_1', 'UGDS_ASIAN', 'COSTT4_A', 'CUML_DEBT_P25', 'LOAN_COMP_ORIG_YR2_RT', 'HI_INC_RPY_3YR_RT', 'mn_earn_wne_male1_p10']

In [85]:
first_second_features = top_features + second_top_feature

In [86]:
X2a = unpickled_df[first_second_features]
Y2a = unpickled_df['COMP_ORIG_YR4_RT']

In [87]:
X_train2a, X_test2a, y_train2a, y_test2a = train_test_split(X2a, Y2a, test_size=0.2, random_state=42)

In [88]:
linreg_pipeline.fit(X_train2a, y_train2a)
score_printer(linreg_pipeline, X_train2a, y_train2a, X_test2a, y_test2a)

 Train score: 0.6468685176721847, Test Score: 0.6390543556877776


In [89]:
lasso_pipeline.fit(X_train2a, y_train2a)
score_printer(lasso_pipeline, X_train2a, y_train2a, X_test2a, y_test2a)

 Train score: 0.45399978653578243, Test Score: 0.4563298807210854


In [90]:
ridge_pipeline.fit(X_train2a, y_train2a)
score_printer(ridge_pipeline, X_train2a, y_train2a, X_test2a, y_test2a)

 Train score: 0.6467788110047553, Test Score: 0.6390686037316503


# Model 2b
Add 3rd top feature from each category

In [91]:
third_top_feature = ['ZIP_38104-2211', 'SATMT75', 'PCIP23', 'CIP29CERT4_2', 'UGDS_UNKN', 'TUITIONFEE_IN', 'DEBT_MDN', 'COMP_ORIG_YR3_RT', 'HI_INC_RPY_1YR_RT', 'mn_earn_wne_indep0_p6']

In [92]:
first_second_third = top_features + second_top_feature + third_top_feature

In [93]:
X2b = unpickled_df[first_second_features]
Y2b = unpickled_df['COMP_ORIG_YR4_RT']

In [94]:
X_train2b, X_test2b, y_train2b, y_test2b = train_test_split(X2b, Y2b, test_size=0.2, random_state=42)

In [95]:
linreg_pipeline.fit(X_train2b, y_train2b)
score_printer(linreg_pipeline, X_train2b, y_train2b, X_test2b, y_test2b)

 Train score: 0.6468685176721847, Test Score: 0.6390543556877776


In [96]:
lasso_pipeline.fit(X_train2b, y_train2b)
score_printer(lasso_pipeline, X_train2b, y_train2b, X_test2b, y_test2b)

 Train score: 0.45399978653578243, Test Score: 0.4563298807210854


In [97]:
ridge_pipeline.fit(X_train2b, y_train2b)
score_printer(ridge_pipeline, X_train2b, y_train2b, X_test2b, y_test2b)

 Train score: 0.6467788110047553, Test Score: 0.6390686037316503


It seems that adding features in this method doesn't significantly improve the score for each model.

# Model Approach 3: 
## Model w features from top 3 categories

In [17]:
top_3_cat_features = ['TUITIONFEE_OUT', 'COSTT4_A', 'TUITIONFEE_IN', 'NPT45_PUB', 'NPT4_75UP_PUB', 'PCIP45', 'PCIP54', 'PCIP23', 'PCIP27', 'PCIP16', 'PCIP45', 'PCIP54', 'PCIP23', 'PCIP27', 'PCIP16', 'ZIP_60616-3878', 'ZIP_97230-3099', 'ZIP_38104-2211', 'ZIP_2115', 'CITY_Seneca Falls']

In [19]:
X3 = unpickled_df[top_3_cat_features]
Y3 = unpickled_df['COMP_ORIG_YR4_RT']

In [21]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, Y3, test_size=0.2, random_state=42)

In [76]:
linreg_pipeline.fit(X_train3, y_train3)
score_printer(linreg_pipeline, X_train3, y_train3, X_test3, y_test3)

 Train score: 0.39927501260539333, Test Score: 0.40106373606238743


In [77]:
lasso_pipeline.fit(X_train3, y_train3)
score_printer(lasso_pipeline, X_train3, y_train3, X_test3, y_test3)

 Train score: 0.3883241639592627, Test Score: 0.390210367617409


In [78]:
ridge_pipeline.fit(X_train3, y_train3)
score_printer(ridge_pipeline, X_train3, y_train3, X_test3, y_test3)

 Train score: 0.39909646024492684, Test Score: 0.4009165638478991


# Model Approach 3a:
## add 4th category

In [99]:
fourth_cat_features = ['CUML_DEBT_P10', 'CUML_DEBT_P25', 'DEBT_MDN', 'CUML_DEBT_P75', 'IND_DEBT_MDN']

In [109]:
top_4_cat_features = top_3_cat_features + fourth_cat_features

In [110]:
X3a = unpickled_df[top_3_cat_features]
Y3a = unpickled_df['COMP_ORIG_YR4_RT']

In [111]:
X_train3a, X_test3a, y_train3a, y_test3a = train_test_split(X3a, Y3a, test_size=0.2, random_state=42)

In [112]:
linreg_pipeline.fit(X_train3a, y_train3a)
score_printer(linreg_pipeline, X_train3a, y_train3a, X_test3a, y_test3a)

 Train score: 0.39927501260539333, Test Score: 0.40106373606238743


In [113]:
lasso_pipeline.fit(X_train3a, y_train3a)
score_printer(lasso_pipeline, X_train3a, y_train3a, X_test3a, y_test3a)

 Train score: 0.3883241639592627, Test Score: 0.390210367617409


In [114]:
ridge_pipeline.fit(X_train3a, y_train3a)
score_printer(ridge_pipeline, X_train3a, y_train3a, X_test3a, y_test3a)

 Train score: 0.39909646024492684, Test Score: 0.4009165638478991


# Model Approach 3b:
## Add 5th category

In [106]:
fifth_cat_features = ['pct10_earn_wne_p10', 'mn_earn_wne_male1_p10', 'mn_earn_wne_indep0_p6', 'mn_earn_wne_inc1_p10', 'pct25_earn_wne_p10']

In [107]:
top_5_cat_features = top_3_cat_features + fourth_cat_features + fifth_cat_features

In [108]:
X3b = unpickled_df[top_3_cat_features]
Y3b = unpickled_df['COMP_ORIG_YR4_RT']

In [115]:
X_train3b, X_test3b, y_train3b, y_test3b = train_test_split(X3b, Y3b, test_size=0.2, random_state=42)

In [121]:
linreg_pipeline.fit(X_train3b, y_train3b)
score_printer(linreg_pipeline, X_train3b, y_train3b, X_test3b, y_test3b)

 Train score: 0.39927501260539333, Test Score: 0.40106373606238743


In [122]:
lasso_pipeline.fit(X_train3a, y_train3a)
score_printer(lasso_pipeline, X_train3a, y_train3b, X_test3b, y_test3b)

 Train score: 0.3883241639592627, Test Score: 0.390210367617409


In [123]:
ridge_pipeline.fit(X_train3b, y_train3b)
score_printer(ridge_pipeline, X_train3b, y_train3b, X_test3b, y_test3b)

 Train score: 0.39909646024492684, Test Score: 0.4009165638478991


no significant improvements with implementing models this way