In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from patsy import dmatrices
import statsmodels.api as sm
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
%matplotlib inline

# Import data from math and from portuguese students
math_df = pd.read_csv('student-mat.csv', sep=';')
port_df = pd.read_csv('student-por.csv', sep=';')

In [2]:
math_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [3]:
port_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [4]:
# Get list of column/feature names for each
m_cols = math_df.columns
p_cols = port_df.columns

# Check for shape of df
print(math_df.shape)
# ---> 395 observations, 33 features
print(port_df.shape)
# ---> 649 observations, 33 features

# Check if columns in each are identical
print((m_cols == p_cols).all())

(395, 33)
(649, 33)
True


In [5]:
# Make list of binary features, based on feature descriptions
binary_vars = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
categorical_vars = ['Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health']
appx_cont_vars = ['age', 'absences', 'G1', 'G2', 'G3']

In [6]:
# Function for getting a datatype subset df for each of my main dfs (math and port)
def make_dtype_dfs(df, binary_vars, categorical_vars, appx_cont_vars):
    b_df = df[binary_vars]
    cat_df = df[categorical_vars]
    cont_df = df[appx_cont_vars]
    return b_df, cat_df, cont_df

In [7]:
# Call like this:
m_bin_df, m_cat_df, m_cont_df = make_dtype_dfs(math_df, binary_vars, categorical_vars, appx_cont_vars)
p_bin_df, p_cat_df, p_cont_df = make_dtype_dfs(port_df, binary_vars, categorical_vars, appx_cont_vars)

In [8]:
m_bin_df.head()

Unnamed: 0,school,sex,address,famsize,Pstatus,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,GP,F,U,GT3,A,yes,no,no,no,yes,yes,no,no
1,GP,F,U,GT3,T,no,yes,no,no,no,yes,yes,no
2,GP,F,U,LE3,T,yes,no,yes,no,yes,yes,yes,no
3,GP,F,U,GT3,T,no,yes,yes,yes,yes,yes,yes,yes
4,GP,F,U,GT3,T,no,yes,yes,no,yes,yes,no,no


In [9]:
p_bin_df.head()

Unnamed: 0,school,sex,address,famsize,Pstatus,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,GP,F,U,GT3,A,yes,no,no,no,yes,yes,no,no
1,GP,F,U,GT3,T,no,yes,no,no,no,yes,yes,no
2,GP,F,U,LE3,T,yes,no,no,no,yes,yes,yes,no
3,GP,F,U,GT3,T,no,yes,no,yes,yes,yes,yes,yes
4,GP,F,U,GT3,T,no,yes,no,no,yes,yes,no,no


In [10]:
print(m_bin_df.shape, m_cat_df.shape, m_cont_df.shape)
print(p_bin_df.shape, p_cat_df.shape, p_cont_df.shape)

(395, 13) (395, 15) (395, 5)
(649, 13) (649, 15) (649, 5)


In [11]:
#Making function so that I can do this easily for both data sets
def convert_dummies(df, binary_vars):
    encoding_dict = dict()
    for col_name in binary_vars:
        new_col_name = col_name + '_d'
        # Make dictionary for individual col values
        col_dict = dict()
        # Grab unique values for given column
        unique_vals = df[col_name].unique()
        # Sort 'em
        unique_vals.sort()
        # If the first (of two) values is in this list is one of these...
        if unique_vals[0] in ['no', 'F', 'R']:
            # Then we'll encode the second value to equal 1 (first is baseline)
            df[new_col_name] = np.where(df[col_name] == unique_vals[1], 1, 0)
            # Add this info to individual column's disctionary
            col_dict[unique_vals[0]] = 0
            col_dict[unique_vals[1]] = 1
        # Do opposite (switch 1 and 0) for columns with other values
        else:
            df[new_col_name] = np.where(df[col_name] == unique_vals[0], 1, 0)
            col_dict[unique_vals[0]] = 1
            col_dict[unique_vals[1]] = 0
        # Add the column name and its encoding dictionary to the larger dictionary, so we can keep track of baseline values
        encoding_dict[col_name] = col_dict
    df.drop(columns=binary_vars, axis=1, inplace=True)
    return df, encoding_dict

In [12]:
# Use function to get new encoded dummy dfs and encoding dictionaries
m_bin_df, m_encoding_dict = convert_dummies(m_bin_df, binary_vars)
p_bin_df, p_encoding_dict = convert_dummies(p_bin_df, binary_vars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [13]:
m_bin_df.head()

Unnamed: 0,school_d,sex_d,address_d,famsize_d,Pstatus_d,schoolsup_d,famsup_d,paid_d,activities_d,nursery_d,higher_d,internet_d,romantic_d
0,1,0,1,1,1,1,0,0,0,1,1,0,0
1,1,0,1,1,0,0,1,0,0,0,1,1,0
2,1,0,1,0,0,1,0,1,0,1,1,1,0
3,1,0,1,1,0,0,1,1,1,1,1,1,1
4,1,0,1,1,0,0,1,1,0,1,1,0,0


In [14]:
p_bin_df.head()

Unnamed: 0,school_d,sex_d,address_d,famsize_d,Pstatus_d,schoolsup_d,famsup_d,paid_d,activities_d,nursery_d,higher_d,internet_d,romantic_d
0,1,0,1,1,1,1,0,0,0,1,1,0,0
1,1,0,1,1,0,0,1,0,0,0,1,1,0
2,1,0,1,0,0,1,0,0,0,1,1,1,0
3,1,0,1,1,0,0,1,0,1,1,1,1,1
4,1,0,1,1,0,0,1,0,0,1,1,0,0


In [15]:
print(m_bin_df.shape, p_bin_df.shape)

(395, 13) (649, 13)


In [16]:
# Remove _d in column names... 
cols = m_bin_df.columns
cols = [col[:-2] for col in cols] 
m_bin_df.columns = cols
p_bin_df.columns = cols

In [17]:
print(m_bin_df.columns)
print(p_bin_df.columns)

Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup',
       'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic'],
      dtype='object')
Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup',
       'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic'],
      dtype='object')


In [18]:
m_bin_df.head()

Unnamed: 0,school,sex,address,famsize,Pstatus,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,1,0,1,1,1,1,0,0,0,1,1,0,0
1,1,0,1,1,0,0,1,0,0,0,1,1,0
2,1,0,1,0,0,1,0,1,0,1,1,1,0
3,1,0,1,1,0,0,1,1,1,1,1,1,1
4,1,0,1,1,0,0,1,1,0,1,1,0,0


In [19]:
p_bin_df.head()

Unnamed: 0,school,sex,address,famsize,Pstatus,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,1,0,1,1,1,1,0,0,0,1,1,0,0
1,1,0,1,1,0,0,1,0,0,0,1,1,0
2,1,0,1,0,0,1,0,0,0,1,1,1,0
3,1,0,1,1,0,0,1,0,1,1,1,1,1
4,1,0,1,1,0,0,1,0,0,1,1,0,0


In [20]:
# Print resulting encoding dictionary to make sure it's correctly specified
print(m_encoding_dict)
print(p_encoding_dict)
print(m_encoding_dict == p_encoding_dict)

{'school': {'GP': 1, 'MS': 0}, 'sex': {'F': 0, 'M': 1}, 'address': {'R': 0, 'U': 1}, 'famsize': {'GT3': 1, 'LE3': 0}, 'Pstatus': {'A': 1, 'T': 0}, 'schoolsup': {'no': 0, 'yes': 1}, 'famsup': {'no': 0, 'yes': 1}, 'paid': {'no': 0, 'yes': 1}, 'activities': {'no': 0, 'yes': 1}, 'nursery': {'no': 0, 'yes': 1}, 'higher': {'no': 0, 'yes': 1}, 'internet': {'no': 0, 'yes': 1}, 'romantic': {'no': 0, 'yes': 1}}
{'school': {'GP': 1, 'MS': 0}, 'sex': {'F': 0, 'M': 1}, 'address': {'R': 0, 'U': 1}, 'famsize': {'GT3': 1, 'LE3': 0}, 'Pstatus': {'A': 1, 'T': 0}, 'schoolsup': {'no': 0, 'yes': 1}, 'famsup': {'no': 0, 'yes': 1}, 'paid': {'no': 0, 'yes': 1}, 'activities': {'no': 0, 'yes': 1}, 'nursery': {'no': 0, 'yes': 1}, 'higher': {'no': 0, 'yes': 1}, 'internet': {'no': 0, 'yes': 1}, 'romantic': {'no': 0, 'yes': 1}}
True


NOTE: NO LONGER GOING TO USE BASELINES IN OG MODEL? PICK BASELINES WHEN PICKING X_COLS...
        ...still have a dictionary of vars that need baselines, w/ potential baseline vars to pick from... 
        but will use that to select them later on (not limiting the DataFrame here)

In [21]:
# Make multi-value dummy dataframes
def make_dummy_df(df, categorical_vars):
    # Create empty list that we can insert dummy dataframes into - iterate through to join w/ main df - pick all but last column (One-Hot Encoding)
    dummy_df_lst = []
    potential_baselines = dict()
    # Iterate through list of column names we've picked out
    for col in categorical_vars:
        dummy_df = pd.get_dummies(df[col], prefix=col)
        # Append last column to list of baselines for future reference
        potential_baselines[col] = list(dummy_df.columns)
        dummy_df_lst.append(dummy_df)
    for dummy_df in dummy_df_lst:
        df = df.join(dummy_df)    
    df.drop(columns=categorical_vars, axis=1, inplace=True)
    return potential_baselines, df

In [22]:
# Examine baselines
m_baselines, m_cat_df = make_dummy_df(m_cat_df, categorical_vars)
p_baselines, p_cat_df = make_dummy_df(p_cat_df, categorical_vars)
print(m_baselines)
print(p_baselines)
print(m_baselines == p_baselines)

{'Medu': ['Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4'], 'Fedu': ['Fedu_0', 'Fedu_1', 'Fedu_2', 'Fedu_3', 'Fedu_4'], 'Mjob': ['Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher'], 'Fjob': ['Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher'], 'reason': ['reason_course', 'reason_home', 'reason_other', 'reason_reputation'], 'guardian': ['guardian_father', 'guardian_mother', 'guardian_other'], 'traveltime': ['traveltime_1', 'traveltime_2', 'traveltime_3', 'traveltime_4'], 'studytime': ['studytime_1', 'studytime_2', 'studytime_3', 'studytime_4'], 'failures': ['failures_0', 'failures_1', 'failures_2', 'failures_3'], 'famrel': ['famrel_1', 'famrel_2', 'famrel_3', 'famrel_4', 'famrel_5'], 'freetime': ['freetime_1', 'freetime_2', 'freetime_3', 'freetime_4', 'freetime_5'], 'goout': ['goout_1', 'goout_2', 'goout_3', 'goout_4', 'goout_5'], 'Dalc': ['Dalc_1', 'Dalc_2', 'Dalc_3', 'Dalc_4', 'Dalc_5'], 'Walc': ['Walc_1', 'Walc_2', 'Walc_3', 'Walc_

In [23]:
m_cat_df.head()

Unnamed: 0,Medu_0,Medu_1,Medu_2,Medu_3,Medu_4,Fedu_0,Fedu_1,Fedu_2,Fedu_3,Fedu_4,...,Walc_1,Walc_2,Walc_3,Walc_4,Walc_5,health_1,health_2,health_3,health_4,health_5
0,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [24]:
p_cat_df.head()

Unnamed: 0,Medu_0,Medu_1,Medu_2,Medu_3,Medu_4,Fedu_0,Fedu_1,Fedu_2,Fedu_3,Fedu_4,...,Walc_1,Walc_2,Walc_3,Walc_4,Walc_5,health_1,health_2,health_3,health_4,health_5
0,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [25]:
print(m_cat_df.shape)
print(p_cat_df.shape)

(395, 69)
(649, 69)


In [26]:
# JOIN 'EM TOGETHER!
m_df = m_cont_df.join(m_bin_df.join(m_cat_df))
p_df = p_cont_df.join(p_bin_df.join(p_cat_df))

In [27]:
m_df.head()

Unnamed: 0,age,absences,G1,G2,G3,school,sex,address,famsize,Pstatus,...,Walc_1,Walc_2,Walc_3,Walc_4,Walc_5,health_1,health_2,health_3,health_4,health_5
0,18,6,5,6,6,1,0,1,1,1,...,1,0,0,0,0,0,0,1,0,0
1,17,4,5,5,6,1,0,1,1,0,...,1,0,0,0,0,0,0,1,0,0
2,15,10,7,8,10,1,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
3,15,2,15,14,15,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
4,16,4,6,10,10,1,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1


In [28]:
m_df.columns

Index(['age', 'absences', 'G1', 'G2', 'G3', 'school', 'sex', 'address',
       'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'higher', 'internet', 'romantic', 'Medu_0', 'Medu_1',
       'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 'Fedu_2', 'Fedu_3',
       'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services',
       'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other',
       'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home',
       'reason_other', 'reason_reputation', 'guardian_father',
       'guardian_mother', 'guardian_other', 'traveltime_1', 'traveltime_2',
       'traveltime_3', 'traveltime_4', 'studytime_1', 'studytime_2',
       'studytime_3', 'studytime_4', 'failures_0', 'failures_1', 'failures_2',
       'failures_3', 'famrel_1', 'famrel_2', 'famrel_3', 'famrel_4',
       'famrel_5', 'freetime_1', 'freetime_2', 'freetime_3', 'freetime_4',
       'freetime_5', 'goout_1', 'goout_2', 'goout_3', '

In [29]:
p_df.head()

Unnamed: 0,age,absences,G1,G2,G3,school,sex,address,famsize,Pstatus,...,Walc_1,Walc_2,Walc_3,Walc_4,Walc_5,health_1,health_2,health_3,health_4,health_5
0,18,4,0,11,11,1,0,1,1,1,...,1,0,0,0,0,0,0,1,0,0
1,17,2,9,11,11,1,0,1,1,0,...,1,0,0,0,0,0,0,1,0,0
2,15,6,12,13,12,1,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
3,15,0,14,14,14,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
4,16,0,11,13,13,1,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1


In [31]:
for key in m_baselines.keys():
    print(key, m_baselines[key])

Medu ['Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4']
Fedu ['Fedu_0', 'Fedu_1', 'Fedu_2', 'Fedu_3', 'Fedu_4']
Mjob ['Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher']
Fjob ['Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher']
reason ['reason_course', 'reason_home', 'reason_other', 'reason_reputation']
guardian ['guardian_father', 'guardian_mother', 'guardian_other']
traveltime ['traveltime_1', 'traveltime_2', 'traveltime_3', 'traveltime_4']
studytime ['studytime_1', 'studytime_2', 'studytime_3', 'studytime_4']
failures ['failures_0', 'failures_1', 'failures_2', 'failures_3']
famrel ['famrel_1', 'famrel_2', 'famrel_3', 'famrel_4', 'famrel_5']
freetime ['freetime_1', 'freetime_2', 'freetime_3', 'freetime_4', 'freetime_5']
goout ['goout_1', 'goout_2', 'goout_3', 'goout_4', 'goout_5']
Dalc ['Dalc_1', 'Dalc_2', 'Dalc_3', 'Dalc_4', 'Dalc_5']
Walc ['Walc_1', 'Walc_2', 'Walc_3', 'Walc_4', 'Walc_5']
health ['health_1', 'health_2', 'health_3'

In [32]:
baselines = ['Medu_0', 'Fedu_0', 'Mjob_other', 'Fjob_other', 'reason_other', 'guardian_other', 'traveltime_1', 'studytime_1', 'failures_0', 'famrel_1', 'freetime_1', 'goout_1', 'Dalc_1', 'Walc_1', 'health_1']

In [34]:
math_train, math_holdout = train_test_split(m_df, test_size=0.2, random_state=42)
math_train, math_test = train_test_split(math_train, test_size=0.2, random_state=42)

port_train, port_holdout = train_test_split(p_df, test_size=0.2, random_state=42)
port_train, port_test = train_test_split(port_train, test_size=0.2, random_state=42)

In [36]:
print(math_train.shape)
print(port_train.shape)

print(math_test.shape)
print(port_test.shape)

print(math_holdout.shape)
print(port_holdout.shape)

print(math_train.shape[0] + math_test.shape[0] + math_holdout.shape[0])
print(port_train.shape[0] + port_test.shape[0] + port_holdout.shape[0])

(252, 87)
(415, 87)
(64, 87)
(104, 87)
(79, 87)
(130, 87)
395
649


In [42]:
# Can use same columns for both models, so until we're actually specifying the df, we can use one var
X_train_cols = list(math_train.columns)
print(X_train_cols)

['age', 'absences', 'G1', 'G2', 'G3', 'school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_0', 'Fedu_1', 'Fedu_2', 'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime_1', 'traveltime_2', 'traveltime_3', 'traveltime_4', 'studytime_1', 'studytime_2', 'studytime_3', 'studytime_4', 'failures_0', 'failures_1', 'failures_2', 'failures_3', 'famrel_1', 'famrel_2', 'famrel_3', 'famrel_4', 'famrel_5', 'freetime_1', 'freetime_2', 'freetime_3', 'freetime_4', 'freetime_5', 'goout_1', 'goout_2', 'goout_3', 'goout_4', 'goout_5', 'Dalc_1', 'Dalc_2', 'Dalc_3', 'Dalc_4', 'Dalc_5', 'Walc_1', 'Walc_2', 'Walc_

In [43]:
for base in baselines:
    X_train_cols.remove(base)

print(X_train_cols)

['age', 'absences', 'G1', 'G2', 'G3', 'school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_1', 'Fedu_2', 'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_reputation', 'guardian_father', 'guardian_mother', 'traveltime_2', 'traveltime_3', 'traveltime_4', 'studytime_2', 'studytime_3', 'studytime_4', 'failures_1', 'failures_2', 'failures_3', 'famrel_2', 'famrel_3', 'famrel_4', 'famrel_5', 'freetime_2', 'freetime_3', 'freetime_4', 'freetime_5', 'goout_2', 'goout_3', 'goout_4', 'goout_5', 'Dalc_2', 'Dalc_3', 'Dalc_4', 'Dalc_5', 'Walc_2', 'Walc_3', 'Walc_4', 'Walc_5', 'health_2', 'health_3', 'health_4', 'health_5']


In [44]:
print(len(X_train_cols))

72


In [50]:
full_X_train_cols = X_train_cols.copy()

In [None]:
# Let's run OLS to see which characteristics are significant (regarding test scores)

X_cols_OLS = ['age', 'absences', 'G1', 'G2', 'school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'Medu_1', 'Medu_2', 'Medu_3', 'Medu_4', 'Fedu_1', 'Fedu_2', 'Fedu_3', 'Fedu_4', 'Mjob_at_home', 'Mjob_health', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_reputation', 'guardian_father', 'guardian_mother', 'traveltime_2', 'traveltime_3', 'traveltime_4', 'studytime_2', 'studytime_3', 'studytime_4', 'failures_1', 'failures_2', 'failures_3', 'famrel_2', 'famrel_3', 'famrel_4', 'famrel_5', 'freetime_2', 'freetime_3', 'freetime_4', 'freetime_5', 'goout_2', 'goout_3', 'goout_4', 'goout_5', 'Dalc_2', 'Dalc_3', 'Dalc_4', 'Dalc_5', 'Walc_2', 'Walc_3', 'Walc_4', 'Walc_5', 'health_2', 'health_3', 'health_4', 'health_5']
y_col = 'G3'

X_train_OLS['constant'] = 1


model = sm.OLS(y_train_OLS, X_train_OLS)
results = model.fit()

fitted_vals = results.predict(X_train_OLS)
stu_resid = results.resid_pearson
residuals = results.resid
#y_vals = pd.DataFrame({'residuals':residuals, 'fitted_vals':fitted_vals, 'stu_resid': stu_resid})
print(results.summary())

In [None]:
# MODEL / FEATURE SELECTION 




In [None]:
# Train
#X_train_OLS = sm.add_constant(X_train_OLS)
X_train_OLS['constant'] = 1


model = sm.OLS(y_train_OLS, X_train_OLS)
results = model.fit()

fitted_vals = results.predict(X_train_OLS)
stu_resid = results.resid_pearson
residuals = results.resid
#y_vals = pd.DataFrame({'residuals':residuals, 'fitted_vals':fitted_vals, 'stu_resid': stu_resid})
print(results.summary())

In [None]:
#SPLIT DATA: TRAIN + HOLDOUT
X_train = sm.add_constant(X_train)