In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from pydataset import data

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

from wrangle import split_data

In [2]:
df = data('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Get dummy variables for sex, smoker, and time
df = pd.get_dummies(df, columns = ['sex','smoker','time'],drop_first=True)

In [5]:
# Convert day to number - this is ordinal data
df.day = df['day'].astype('category')
df.day = df.day.cat.reorder_categories(['Thur','Fri','Sat','Sun'])
df.day = df.day.cat.codes

In [6]:
df['price_per_person'] = df.total_bill/df.size

In [7]:
train, validate, test = split_data(df)

Data split as follows: Train 56.00%, Validate 24.00%, Test 20.00%


In [8]:
train = train.reset_index()

### Which features do I think will be most important for predicting tip amount?
- total_bill
- size

Use k best to select top 2 features for predicting tip amount

In [9]:
# Scale total bill, size of part, and day of week data
scale = MinMaxScaler()
scale.fit(train[['total_bill','size','day']])
scaled_data = scale.transform(train[['total_bill','size','day']])

scaled_data_df = pd.DataFrame(data = scaled_data, columns = ['total_bill_scaled','size_scaled','day_scaled'])

train = pd.concat([train, scaled_data_df], axis = 1)

In [10]:
train.columns

Index(['index', 'total_bill', 'tip', 'day', 'size', 'sex_Male', 'smoker_Yes',
       'time_Lunch', 'price_per_person', 'total_bill_scaled', 'size_scaled',
       'day_scaled'],
      dtype='object')

In [11]:
X_scaled = train[['total_bill_scaled','size_scaled','price_per_person','day_scaled','sex_Male','smoker_Yes','time_Lunch']]
y_train = train.tip

In [12]:
# f_regression stats test for top 2
f_selector = SelectKBest(f_regression, k=2)
# find the top 2 X's correlated with y
f_selector.fit(X_scaled,y_train)
# Boolean mask of whether the column was selected or now
feature_mask = f_selector.get_support()
# List of top k features
f_feature = X_scaled.iloc[:,feature_mask].columns.tolist()

In [13]:
print(f'Top 2 features based on SelectKBest are {f_feature}')

Top 2 features based on SelectKBest are ['total_bill_scaled', 'price_per_person']


Use recursive feature elimination to select top 2

In [14]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_scaled,y_train)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_scaled.iloc[:,feature_mask].columns.tolist()


In [15]:
print(f'Top 2 features based on RFE {rfe_feature}')

Top 2 features based on RFE ['total_bill_scaled', 'size_scaled']


In [16]:
# view list of columns and their ranking

# get the ranks
var_ranks = rfe.ranking_
# get the variable names
var_names = X_scaled.columns.tolist()
# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})
# sort the df by rank
rfe_ranks_df.sort_values('Rank')


Unnamed: 0,Var,Rank
0,total_bill_scaled,1
1,size_scaled,1
4,sex_Male,2
3,day_scaled,3
2,price_per_person,4
5,smoker_Yes,5
6,time_Lunch,6


### I think k best chooses total_bill_scaled and price per person because they are highly correlated and it is evaluating each feature in isolation. Clearly total bill is most important and price per person is directly proportional to total bill. For recursive feature elimination it takes into account interaction between features.

## 2. Function for select k best

In [20]:
def select_kbest(X, y, k):
    """ Takes in predictors (X), target (y) , and number of features to select (k) and returns the names 
    of the top k selected features based on the SelectKBest class."""
    # f_regression stats test for top 2
    f_selector = SelectKBest(f_regression, k=k)
    # find the top 2 X's correlated with y
    f_selector.fit(X,y)
    # Boolean mask of whether the column was selected or now
    feature_mask = f_selector.get_support()
    # List of top k features
    return X.iloc[:,feature_mask].columns.tolist()

In [21]:
select_kbest(X_scaled, y_train, k=2)

['total_bill_scaled', 'price_per_person']

## 3. Function for rfe

In [24]:
def rfe(X,y,k):
    """ Takes in predictors (X), target (y) , and number of features to select (k) and returns the names 
    of the top k selected features based on the RFE class."""
    # initialize the ML algorithm
    lm = LinearRegression()

    # create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
    rfe = RFE(lm, n_features_to_select=k)

    # fit the data using RFE
    rfe.fit(X,y)  

    # get the mask of the columns selected
    feature_mask = rfe.support_

    # get list of the column names. 
    return X.iloc[:,feature_mask].columns.tolist()

In [25]:
rfe(X_scaled, y_train,2)

['total_bill_scaled', 'size_scaled']

## 4. Swiss dataset, predict fertility. Find top 3 features using both kbest and rfe

In [26]:
swiss = data('swiss')

In [57]:
train, validate, test = split_data(swiss)

Data split as follows: Train 56.00%, Validate 24.00%, Test 20.00%


In [58]:
train = train.reset_index()

In [59]:
train.columns

Index(['index', 'Fertility', 'Agriculture', 'Examination', 'Education',
       'Catholic', 'Infant.Mortality'],
      dtype='object')

In [61]:
# Scale the data
scaler = MinMaxScaler()

scaled_data = scaler.fit_transform(train[['Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality']])
scaled_data_df = pd.DataFrame(data = scaled_data, columns = ['Agriculture_scaled', 'Examination_scaled', 'Education_scaled', 'Catholic_scaled',
       'Infant.Mortality_scaled'])

In [62]:
train = pd.concat([train, scaled_data_df], axis = 1)

In [64]:
X = train[['Agriculture_scaled', 'Examination_scaled', 'Education_scaled', 'Catholic_scaled',
       'Infant.Mortality_scaled']]
y = train.Fertility

In [65]:
select_kbest(X, y, 3)

['Examination_scaled', 'Catholic_scaled', 'Infant.Mortality_scaled']

In [66]:
rfe(X,y,3)

['Agriculture_scaled', 'Examination_scaled', 'Infant.Mortality_scaled']

### Checking if scaled data makes a difference

In [68]:
X = train[['Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality']]
select_kbest(X, y, 3)

['Examination', 'Catholic', 'Infant.Mortality']

In [67]:
rfe(X,y,3)

['Agriculture', 'Examination', 'Infant.Mortality']

### Same results scaled or not