In [74]:
import pandas as pd
import numpy as np
from scipy import stats
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns

import wrangle

import sklearn.preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression 
from sklearn.feature_selection import SelectKBest, f_regression, RFE

import warnings
warnings.filterwarnings('ignore')

## Exercise 1

Load the tips dataset.

In [56]:
# import data from pydataset
from pydataset import data

df = data('tips')
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3


### 1a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [57]:
df['tip_percentage'] = df.tip / df.total_bill

### 1b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [58]:
df['price_per_person'] = df.total_bill / df['size']

In [59]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


In [29]:
df.shape

(244, 9)

### 1c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

**I think the features most important for predicting tip amount would be total_bill and party size.**

In [60]:
# drop non-numeric features
df = df[['total_bill', 'tip', 'size', 'tip_percentage', 'price_per_person']]
df.head()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
1,16.99,1.01,2,0.059447,8.495
2,10.34,1.66,3,0.160542,3.446667
3,21.01,3.5,3,0.166587,7.003333
4,23.68,3.31,2,0.13978,11.84
5,24.59,3.61,4,0.146808,6.1475


In [61]:
# Split the data
train, validate, test = wrangle.split_data(df)
train.shape, validate.shape, test.shape

((136, 5), (59, 5), (49, 5))

In [63]:
# split train into X & y
X_train = train.drop(columns=['tip'])
y_train = train['tip']

# split validate into X & y
X_validate = validate.drop(columns=['tip'])
y_validate = validate['tip']

# split test into X & y
X_test = test.drop(columns=['tip'])
y_test = test['tip']

In [64]:
# Scale (Make the thing)
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler, (fit the thing)
scaler.fit(X_train)

# Use the scaler to transform train, validate, test (use the thing) and convert to dataframe
X_train_scaled = pd.DataFrame(scaler.transform(X_train))
X_validate_scaled = pd.DataFrame(scaler.transform(X_validate))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

In [65]:
# get list of column names
cols = X_train.columns.tolist()
cols

['total_bill', 'size', 'tip_percentage', 'price_per_person']

In [68]:
# Set scaled dataframe column names equal to original dataframe
X_train_scaled.columns = cols
X_validate_scaled.columns = cols
X_test_scaled.columns = cols

### 1d. Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

In [87]:
# use SelectKBest to select top two features

# make the thing
kbest = SelectKBest(f_regression, k=2)

# fit the thing
kbest.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = kbest.get_support()

# use the thing to get list of top K features. 
kbest_features = X_train.iloc[:,feature_mask].columns.tolist()
print(f'kbest_features for predicting tip are: {kbest_features}')

kbest_features for predicting tip are: ['total_bill', 'size']


In [88]:
# check if scaled columns make a difference

# make the thing
kbest = SelectKBest(f_regression, k=2)

# fit the thing
kbest.fit(X_train_scaled, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = kbest.get_support()

# use the thing to get list of top K features. 
kbest_features = X_train_scaled.iloc[:,feature_mask].columns.tolist()
print(f'kbest_features for predicting tip with scaled columns are: {kbest_features}')

kbest_features for predicting tip with scaled columns are: ['total_bill', 'size']


In [89]:
# use RFE to select top two features

# Make the things
lm = LinearRegression() 
rfe = RFE(lm, n_features_to_select=2)

# Fit the thing
rfe.fit(X_train, y_train)

# use the thing
rfe_columns = X_train.columns[rfe.support_].tolist()
print(f'rfe_features for predicting tip are: {rfe_columns}')

rfe_features for predicting tip are: ['size', 'tip_percentage']


In [90]:
# use RFE to select top two features, check if scaled columns make a difference

# Make the things
lm = LinearRegression() 
rfe = RFE(lm, n_features_to_select=2)

# Fit the thing
rfe.fit(X_train_scaled, y_train)

# use the thing
rfe_columns = X_train_scaled.columns[rfe.support_].tolist()
print(f'rfe_features for predicting tip with scaled columns are: {rfe_columns}')

rfe_features for predicting tip with scaled columns are: ['total_bill', 'tip_percentage']


**Using SelectKBest, the top two features for predicting tip amount are the total bill and party size, whether using scaled values or not**

**Having scaled columns does make a difference when using RFE, likely because it uses actual modeling. Since total_bill is also one of the features when using SelectKBest, the scaled version is what I would use for selecting top features**

### 1e. Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they?

In [77]:
# split train into X & y
X_train_2 = train.drop(columns=['tip_percentage'])
y_train_2 = train['tip_percentage']

# split validate into X & y
X_validate_2 = validate.drop(columns=['tip_percentage'])
y_validate_2 = validate['tip_percentage']

# split test into X & y
X_test_2 = test.drop(columns=['tip_percentage'])
y_test_2 = test['tip_percentage']

In [78]:
# Scale (Make the thing)
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler, (fit the thing)
scaler.fit(X_train_2)

# Use the scaler to transform train, validate, test (use the thing) and convert to dataframe
X_train_scaled_2 = pd.DataFrame(scaler.transform(X_train_2))
X_validate_scaled_2 = pd.DataFrame(scaler.transform(X_validate_2))
X_test_scaled_2 = pd.DataFrame(scaler.transform(X_test_2))

In [79]:
# Set scaled dataframe column names equal to original dataframe
X_train_scaled_2.columns = cols
X_validate_scaled_2.columns = cols
X_test_scaled_2.columns = cols

In [86]:
# use SelectKBest to select top two features

# make the thing
kbest = SelectKBest(f_regression, k=2)

# fit the thing
kbest.fit(X_train_2, y_train_2)

# boolean mask of whether the column was selected or not. 
feature_mask = kbest.get_support()

# use the thing to get list of top K features. 
kbest_features = X_train_2.iloc[:,feature_mask].columns.tolist()
print(f'kbest_features for predicting tip percentage are: {kbest_features}')

kbest_features for predicting tip percentage are: ['total_bill', 'tip']


In [91]:
# use RFE to select top two features

# Make the things
lm = LinearRegression() 
rfe = RFE(lm, n_features_to_select=2)

# Fit the thing
rfe.fit(X_train_scaled_2, y_train)

# use the thing
rfe_columns = X_train_scaled_2.columns[rfe.support_].tolist()
print(f'rfe_features for predicting tip percentage with scaled columns are: {rfe_columns}')

rfe_features for predicting tip percentage with scaled columns are: ['total_bill', 'size']


### 1f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [92]:
# use SelectKBest to select top three features

# make the thing
kbest = SelectKBest(f_regression, k=3)

# fit the thing
kbest.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = kbest.get_support()

# use the thing to get list of top K features. 
kbest_features = X_train.iloc[:,feature_mask].columns.tolist()
print(f'kbest_features for predicting tip are: {kbest_features}')

kbest_features for predicting tip are: ['total_bill', 'size', 'price_per_person']


In [93]:
# use RFE to select top three features

# Make the things
lm = LinearRegression() 
rfe = RFE(lm, n_features_to_select=3)

# Fit the thing
rfe.fit(X_train, y_train)

# use the thing
rfe_columns = X_train.columns[rfe.support_].tolist()
print(f'rfe_features for predicting tip are: {rfe_columns}')

rfe_features for predicting tip are: ['size', 'tip_percentage', 'price_per_person']


In [94]:
# use SelectKBest to select top four features

# make the thing
kbest = SelectKBest(f_regression, k=4)

# fit the thing
kbest.fit(X_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = kbest.get_support()

# use the thing to get list of top K features. 
kbest_features = X_train.iloc[:,feature_mask].columns.tolist()
print(f'kbest_features for predicting tip are: {kbest_features}')

kbest_features for predicting tip are: ['total_bill', 'size', 'tip_percentage', 'price_per_person']


In [95]:
# use RFE to select top four features

# Make the things
lm = LinearRegression() 
rfe = RFE(lm, n_features_to_select=4)

# Fit the thing
rfe.fit(X_train, y_train)

# use the thing
rfe_columns = X_train.columns[rfe.support_].tolist()
print(f'rfe_features for predicting tip are: {rfe_columns}')

rfe_features for predicting tip are: ['total_bill', 'size', 'tip_percentage', 'price_per_person']


**SelectKBest is based on the statistical result between X and y while RFE is based on modeling and the importance of each feature using coefficients**

## Exercise 2

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [96]:
def select_kbest(X, y, k):
    '''
    This function takes in the predictors (X), target (y), and number of features to select (k) and returns
    the names of the top k selected features based on the SelectKBest class
    '''
    # make the thing
    kbest = SelectKBest(f_regression, k=k)

    # fit the thing
    kbest.fit(X, y)

    # boolean mask of whether the column was selected or not. 
    feature_mask = kbest.get_support()

    # use the thing to get list of top K features. 
    kbest_features = X.iloc[:,feature_mask].columns.tolist()
    
    return kbest_features

In [97]:
# check the function
select_kbest(X_train, y_train, 4)

['total_bill', 'size', 'tip_percentage', 'price_per_person']

## Exercise 3

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [100]:
def rfe(X, y, k):
    '''
    This function takes in the predictors (X), target (y), and number of features to select (k) and returns
    the names of the top k selected features based on the RFE class
    '''
    # Make the things
    lm = LinearRegression() 
    rfe = RFE(lm, n_features_to_select=k)

    # Fit the thing
    rfe.fit(X, y)

    # use the thing
    rfe_columns = X.columns[rfe.support_].tolist()
    
    return rfe_columns

In [101]:
# check the function
rfe(X_train, y_train, 4)

['total_bill', 'size', 'tip_percentage', 'price_per_person']

## Exercise 4

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [102]:
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [103]:
df.shape

(47, 6)

In [104]:
# split df into X & y, not splitting into test, train, and validate because of the size of df and will not be modeling for this exercise
X_df = df.drop(columns=['Fertility'])
y_df = df['Fertility']

In [106]:
# Scale (Make the thing)
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler, (fit the thing)
scaler.fit(X_df)

# Use the scaler to transform df (use the thing) and convert to dataframe
X_df_scaled = pd.DataFrame(scaler.transform(X_df))

In [108]:
X_df_scaled.columns = X_df.columns.tolist()
X_df_scaled.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
0,0.178531,0.352941,0.211538,0.079816,0.721519
1,0.496045,0.088235,0.153846,0.845069,0.721519
2,0.435028,0.058824,0.076923,0.93255,0.594937
3,0.39887,0.264706,0.115385,0.323148,0.601266
4,0.477966,0.411765,0.269231,0.030761,0.620253


In [109]:
# use function to find top 3 features using SelectKBest
select_kbest(X_df, y_df, 3)

['Examination', 'Education', 'Catholic']

In [110]:
# use function to find top 3 features using rfe
rfe(X_df_scaled, y_df, 3)

['Agriculture', 'Education', 'Infant.Mortality']