In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import env

import warnings
warnings.filterwarnings("ignore")

In [2]:
def train_validate_test(df, target):
    '''
    splits a dataframe into train, validate and test samples and then
    separates each sample into its features (X) and its target (y).

    proportions of the full dataframe:
        test     -> 20%
        validate -> 24% (30% of the remaining 80%)
        train    -> 56% (70% of the remaining 80%)

    both splits use random_state=123, so the result is reproducible.

    df: the dataframe to split
    target: name of the column holding the dependent variable

    returns, in this order:
    X_train (df), y_train (series), X_validate, y_validate, X_test, y_test
    '''
    # hold out the test sample first, then divide what is left
    remainder, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(remainder, test_size=.3, random_state=123)

    def _features_and_target(sample):
        # X keeps every column except the target, y is the target column alone
        return sample.drop(columns=[target]), sample[target]

    X_train, y_train = _features_and_target(train)
    X_validate, y_validate = _features_and_target(validate)
    X_test, y_test = _features_and_target(test)

    return X_train, y_train, X_validate, y_validate, X_test, y_test

def get_numeric_X_cols(X_train, object_cols):
    '''
    returns the names of every column in X_train that is NOT listed in object_cols,
    keeping the dataframe's column order.

    X_train: dataframe whose columns are inspected
    object_cols: list of column names to leave out
    '''
    remaining = []
    for name in X_train.columns.values:
        if name in object_cols:
            # listed as an object column, so it is not numeric
            continue
        remaining.append(name)

    return remaining


def min_max_scale(X_train, X_validate, X_test, numeric_cols):
    '''
    this function takes in 3 dataframes with the same columns and
    a list of numeric column names (because the scaler can only work with numeric columns).
    it fits a min-max scaler to the first dataframe only and transforms all
    3 dataframes using that scaler.

    it returns 3 dataframes (train, validate, test) that contain only the
    numeric_cols columns, with scaled values and the same row index
    as the dataframe each one was built from.
    '''
    # fit on X_train only (i.e. identify each column's min and max there),
    # so nothing from validate or test influences the scaling
    scaler = MinMaxScaler().fit(X_train[numeric_cols])

    def _scale(frame):
        # transform returns a bare array, so rebuild a dataframe around it
        # using the source frame's own index and the numeric column names
        return pd.DataFrame(scaler.transform(frame[numeric_cols]),
                            columns=numeric_cols,
                            index=frame.index)

    return _scale(X_train), _scale(X_validate), _scale(X_test)

In [3]:
def create_dummies(df, object_cols):
    '''
    builds dummy variables for the columns named in object_cols and returns a
    new dataframe made of the original columns followed by the dummy columns.

    the dummy for the first unique value of each variable is dropped
    (drop_first=True) and no separate column is created for missing
    values (dummy_na=False).
    the original object columns are kept in the returned dataframe.
    '''
    # encode only the requested columns
    categorical = df[object_cols]
    encoded = pd.get_dummies(categorical, drop_first=True, dummy_na=False)

    # place the dummy columns to the right of the original columns (axis=1)
    return pd.concat([df, encoded], axis=1)

In [4]:
def get_object_cols(df):
    '''
    returns a list with the names of the columns in df whose dtype is "object",
    in the same order the columns appear in the dataframe.
    '''
    # walk over (column name, dtype) pairs and keep the object-typed ones
    return [name for name, dtype in df.dtypes.items() if dtype == "object"]

In [5]:
def wrangle_student_math(path):
    '''
    reads the student math csv (semicolon separated) found at path and prepares it
    for exploration and modeling:
    drops rows with nulls, appends dummy variables for the object columns,
    splits into train/validate/test with 'G3' as the target,
    and min-max scales the non-object columns.

    returns, in this order:
    df (full dataframe with dummies appended),
    X_train (unscaled train features, still including the object columns),
    X_train_scaled, y_train,
    X_validate_scaled, y_validate,
    X_test_scaled, y_test
    '''
    df = pd.read_csv(path, sep=";")

    # drop any rows that contain nulls.
    # (indexing with a boolean dataframe, df[~df.isnull()], only masks cells
    # and never removes a row, so dropna is needed to really drop them)
    df = df.dropna()

    # get object column names
    object_cols = get_object_cols(df)

    # create dummy vars
    df = create_dummies(df, object_cols)

    # split data
    X_train, y_train, X_validate, y_validate, X_test, y_test = train_validate_test(df, 'G3')

    # get numeric column names (everything that is not an original object column)
    numeric_cols = get_numeric_X_cols(X_train, object_cols)

    # scale data
    X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

    return df, X_train, X_train_scaled, y_train, X_validate_scaled, y_validate, X_test_scaled, y_test

In [6]:
# Dataset source and data dictionary: https://archive.ics.uci.edu/ml/datasets/student+performance
path = "https://gist.githubusercontent.com/ryanorsinger/55ccfd2f7820af169baea5aad3a9c60d/raw/da6c5a33307ed7ee207bd119d3361062a1d1c07e/student-mat.csv"

# unpack everything the wrangle function returns;
# X_train_explore is the unscaled train sample
(
    df,
    X_train_explore,
    X_train_scaled,
    y_train,
    X_validate_scaled,
    y_validate,
    X_test_scaled,
    y_test,
) = wrangle_student_math(path)

In [7]:
X_train_explore.shape


(221, 58)

# The Short Lesson

## SelectKBest

In [8]:
from sklearn.feature_selection import SelectKBest, f_regression

# selector that scores each feature with the f_regression test and keeps 8 of them
f_selector = SelectKBest(score_func=f_regression, k=8)

# score every X column against y on the scaled train sample
f_selector.fit(X_train_scaled, y_train)

# one boolean per column: True when the column was kept
feature_mask = f_selector.get_support()

# names of the kept columns
f_feature = X_train_scaled.columns[feature_mask].tolist()

In [9]:
f_feature

['age', 'Medu', 'Fedu', 'failures', 'G1', 'G2', 'sex_M', 'higher_yes']

In [10]:
## this is the list of the top 8 features for predicting the target, shown in their original column order (not ranked by importance)

## this list was generated by selecting only the best features from the dataset

## Recursive Feature Elimination

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with.
# the number is passed by keyword: scikit-learn 1.0+ raises an error when it is given positionally.
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train_scaled, y_train)

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names.
rfe_feature = X_train_scaled.iloc[:, feature_mask].columns.tolist()

In [12]:
rfe_feature

['G1', 'G2']

In [13]:
## This is the list of the 2 most important features for predicting the target

## this list was generated by finding the least important variable and eliminating it, reevaluating and eliminating 
## again until only the desired amount of variables remain

In [14]:
# show each column next to the rank RFE gave it

# column names, in dataframe order
var_names = X_train_scaled.columns.tolist()
# rank of each column, in the same order
var_ranks = rfe.ranking_
# put names and ranks side by side for clean viewing
rfe_ranks_df = pd.DataFrame(dict(Var=var_names, Rank=var_ranks))
# display the table ordered by rank
rfe_ranks_df.sort_values(by='Rank')

Unnamed: 0,Var,Rank
14,G2,1
13,G1,1
12,absences,2
0,age,3
6,famrel,4
3,traveltime,5
20,Mjob_health,6
22,Mjob_services,7
21,Mjob_other,8
5,failures,9


In [15]:
## this is the rank of each variable: rank 1 marks the selected features, and the higher the rank, the earlier the variable was eliminated

# The Longer Lesson

## Select K Best

In [16]:
from sklearn.feature_selection import SelectKBest, f_regression

In [17]:
## initiate object, SelectKBest will select the best variables to predict our target

In [18]:
f_selector = SelectKBest(f_regression, k=2)

In [19]:
## select k amount of features we want

In [20]:
f_selector.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7f8330e1ed30>)

In [21]:
## fit object to train data

### make sure train data is scaled for best results!!!!!

In [22]:
X_reduced = f_selector.transform(X_train_scaled)

print(X_train_scaled.shape)
print(X_reduced.shape)

(221, 41)
(221, 2)


In [23]:
## transform train data to only include the variables chosen by SelectKBest

In [24]:
X_reduced2 = SelectKBest(f_regression, k=2).fit_transform(X_train_scaled, y_train)
print(X_reduced2.shape)

(221, 2)


In [25]:
## simplified version

In [26]:
f_support = f_selector.get_support()

print(f_support) 

[False False False False False False False False False False False False
 False  True  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False]


In [27]:
## will provide a boolean list of which variables are and are not within the k amount of best variables

## paired with a list of variables using the same index you can create your list of best variables

In [28]:
# keep only the column names flagged True in the support mask
f_feature = X_train_scaled.columns[f_support].tolist()

# you could also get the list this way (among many others)
# f_feature = [X_train_scaled.columns.values[i] for i in range(len(f_support)) if f_support[i]==True]

print(len(f_feature), 'selected features')
print(f_feature)

2 selected features
['G1', 'G2']


## Recursive Feature Elimination

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [30]:
lm = LinearRegression()

In [31]:
## initialize object

In [32]:
# the number of features to keep is passed by keyword:
# scikit-learn 1.0+ raises an error when it is given positionally
rfe = RFE(lm, n_features_to_select=2)

In [33]:
## initiate the RFE and select how many of the most important features you want

In [34]:
# Transforming data using RFE
X_rfe = rfe.fit_transform(X_train_scaled,y_train)  

In [35]:
## transform the data and fit the model at the same time by making a new variable 

In [36]:
#Fitting the data to model
lm.fit(X_rfe,y_train)

LinearRegression()

In [37]:
mask = rfe.support_

In [38]:
mask

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [39]:
## an array of booleans depicting which features were and were not selected by RFE

In [40]:
rfe_features = X_train_scaled.loc[:,mask].columns.tolist()

In [41]:
# report how many features RFE kept, then list them
print(len(rfe_features), 'selected features')
print(rfe_features)

2 selected features
['G1', 'G2']


In [42]:
## when combined with the mask we get a new list that is only the selected features 

In [43]:
# column names and their RFE ranks, in matching order
var_names = X_train_scaled.columns.tolist()
var_ranks = rfe.ranking_

# display name/rank pairs in dataframe column order
pd.DataFrame(dict(Var=var_names, Rank=var_ranks))

Unnamed: 0,Var,Rank
0,age,3
1,Medu,13
2,Fedu,15
3,traveltime,5
4,studytime,34
5,failures,9
6,famrel,4
7,freetime,39
8,goout,18
9,Dalc,19


In [44]:
## a ranking of features, selected features will be ranked 1

## features will be ranked according to how important they were, the lower the number the more important they were

### optional step but could be useful for adjusting models