In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
from sklearn.impute import KNNImputer

# Seed NumPy's legacy global RNG so any later np.random-based step is repeatable.
# The scikit-learn splits in this notebook pass their own random_state explicitly.
GLOBAL_SEED = 9
np.random.seed(GLOBAL_SEED)

In [2]:
# Load the preprocessed modelling frame produced by an earlier pipeline step.
# Pickle files can execute code on load: only read artifacts you generated yourself.
pickle_path = "./data/processed_final_df.pkl"
unpickled_df = pd.read_pickle(pickle_path)

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
unpickled_df.shape

(34593, 9564)

In [4]:
# Preview the first five rows to eyeball column names and value ranges.
unpickled_df.head(n=5)

Unnamed: 0,UNITID,SATMTMID,SATMT25,SATMT75,SAT_AVG_ALL,ACTEN25,PCIP45,PCIP54,PCIP23,PCIP27,...,CIP51BACHL_2,CIP26BACHL_1,CIP26BACHL_2,CIP29CERT4_1,CIP29CERT4_2,CIP25CERT4_1,CIP25CERT4_2,CIP10CERT4_1,CIP10CERT4_2,COMP_ORIG_YR4_RT
0,100654.0,417.5,370.0,465.0,850.0,15.0,0.0465,0.0039,0.0058,0.0136,...,0,1,0,0,0,0,0,0,0,0.214286
1,100663.0,570.0,500.0,640.0,1030.0,19.0,0.0435,0.0374,0.0251,0.0049,...,0,1,0,0,0,0,0,0,0,0.385975
2,100690.0,585.0,445.0,725.0,963.5,14.5,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0.454545
3,100706.0,575.0,510.0,640.0,1129.0,21.0,0.0246,0.0055,0.0355,0.0041,...,0,1,0,0,0,0,0,0,0,0.236842
4,100724.0,400.0,340.0,460.0,784.0,13.0,0.0172,0.0123,0.0025,0.0098,...,0,1,0,0,0,0,0,0,0,0.117182


# Train/Test Split

In [6]:
# Separate the regression target from the predictor columns.
target_col = 'COMP_ORIG_YR4_RT'
y = unpickled_df[target_col]
X = unpickled_df.drop(columns=[target_col]).copy()
# NOTE(review): every remaining column is kept as a feature, which appears to
# include identifier-like columns (e.g. UNITID in the preview) — confirm intended.
X.shape, y.shape

((34593, 9563), (34593,))

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline Model

In [8]:
# Baseline: always predict the median of the training target, ignoring features.
dr = DummyRegressor(strategy='median')
dr.fit(X_train, y_train)
# The previous bare `dr.predict(X_train)` built a prediction array and threw it
# away; `score` already predicts internally, so the call was dropped.
# `score` reports R^2. A constant median prediction cannot beat the mean on the
# data it was fit to, so a value at or just below zero is expected here.
# Note this is the training-split score only.
dr.score(X_train, y_train)

-0.004003885233955451

# Model 1: Simple Basic Model

In [9]:
# Model 1: ordinary least squares on every available feature column.
linreg = LinearRegression()
# `fit` is kept as the cell's last expression so the estimator repr is displayed.
linreg.fit(X=X_train, y=y_train)

LinearRegression()

In [10]:
# R^2 on the training split (in-sample fit).
linreg.score(X=X_train, y=y_train)

0.889510058917237

In [11]:
# R^2 on the held-out split; compare with the training score to gauge overfitting.
linreg.score(X=X_test, y=y_test)

0.8005050761983334

# Model 2: Model with the top feature from each category

In [12]:
# Hand-picked predictors for Model 2, one per feature category.
# NOTE(review): COMP_ORIG_YR2_RT looks closely related to the COMP_ORIG_YR4_RT
# target — confirm it is legitimately available at prediction time.
top_features = [
    'ZIP_60616-3878',
    'SATMTMID',
    'PCIP45',
    'CIP51BACHL_1',
    'UGDS_NRA',
    'TUITIONFEE_OUT',
    'CUML_DEBT_P10',
    'COMP_ORIG_YR2_RT',
    'MD_INC_RPY_1YR_RT',
    'pct10_earn_wne_p10',
]

In [13]:
# Restrict the predictors to the selected columns; the target is unchanged.
target_name = 'COMP_ORIG_YR4_RT'
X2 = unpickled_df.loc[:, top_features]
Y2 = unpickled_df.loc[:, target_name]

In [14]:
# 80/20 split of the reduced feature set, using the same test fraction and seed
# as the full-feature split so the two models are evaluated comparably.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    Y2,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Model 2: least squares on the selected features only.
# NOTE(review): this rebinds `linreg`, so the full-feature model from Model 1 is
# no longer reachable under that name once this cell has run.
linreg = LinearRegression().fit(X_train2, y_train2)
# R^2 on the training split.
linreg.score(X_train2, y_train2)

0.6313544569849286

In [16]:
# R^2 of the reduced-feature model on its held-out split.
linreg.score(X=X_test2, y=y_test2)

0.6177837185384083

# Model 3: