In [2]:
import warnings
warnings.filterwarnings("ignore")

from pydataset import data

import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

## 1. Load the tips dataset from either pydataset or seaborn.

In [3]:
# Pull the 'tips' dataset through pydataset's data() loader and preview the first rows
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
# Per-person price: the total bill split evenly across the party
df['price_per_person'] = df['total_bill'].div(df['size'])
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333


### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

*I think the most important features will be time, size and total_bill.*

### c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [5]:
# Target (tip) and the three candidate predictors taken from the tips frame
y = df['tip']
X = df[['size', 'total_bill', 'price_per_person']]

In [6]:
# Score every column of X against y with f_regression, keep the best two,
# then report which columns survived.
kbest = SelectKBest(score_func=f_regression, k=2).fit(X, y)
top_two_mask = kbest.get_support()
print('Top 2 features according to k-best:', X.columns[top_two_mask])

Top 2 features according to k-best: Index(['size', 'total_bill'], dtype='object')


In [6]:
# Tabulate the p-value and F-score the fitted selector computed for each feature
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X.columns,
)
kbest_results

Unnamed: 0,p,f
size,4.300543e-16,76.175426
total_bill,6.692471e-34,203.357723
price_per_person,2.502102e-08,33.213257


### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [7]:
# Estimator handed to RFE
lm = LinearRegression()

# Deliberately named rfe_selector rather than `rfe`: a function called rfe()
# is defined further down in this notebook, and binding an RFE object to the
# same name would overwrite that function whenever this cell is re-run.
rfe_selector = RFE(lm, n_features_to_select=2)
rfe_selector.fit(X, y)
print('Top 2 features according to RFE:', X.columns[rfe_selector.get_support()])

Top 2 features according to RFE:


Index(['total_bill', 'price_per_person'], dtype='object')

### e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

*They select the best features using different methods/criteria. While k-best evaluates each feature individually, RFE evaluates features in combination. The more features you select, the more similar I would expect the answers to be.*

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [8]:
# Create function 

def select_kbest(X, y, k):
    """Return the names of the k columns of X that SelectKBest keeps.

    A SelectKBest selector using f_regression as its scoring function is
    fitted on (X, y); the columns flagged by its support mask are returned.

    Parameters
    ----------
    X : predictors; must expose ``.columns`` (it is indexed with the mask).
    y : target passed to the selector's ``fit``.
    k : number of features to keep.

    Returns
    -------
    The entries of ``X.columns`` selected by the fitted selector.
    """
    # fit() returns the fitted selector, so build and fit in one expression
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    # Boolean mask over the columns -> names of the retained ones
    return X.columns[selector.get_support()]

In [9]:
# Sanity check: the helper should reproduce the manual k-best result above
select_kbest(X, y, k=2)

Index(['size', 'total_bill'], dtype='object')

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [10]:
# Create function 
def rfe(X, y, k, model=None):
    """Return the names of the k columns of X kept by recursive feature elimination.

    Parameters
    ----------
    X : predictors; must expose ``.columns`` (it is indexed with the mask).
    y : target passed to ``RFE.fit``.
    k : number of features to keep (forwarded as ``n_features_to_select``).
    model : estimator handed to RFE. When omitted (None), a new
        LinearRegression is created for this call.

    Returns
    -------
    The entries of ``X.columns`` flagged by the fitted RFE support mask.
    """
    # Build the default estimator per call instead of in the signature, where
    # a single instance would be created at definition time and shared by
    # every call that relies on the default.
    if model is None:
        model = LinearRegression()
    # Local is named `selector` so it does not shadow this function's own name
    selector = RFE(model, n_features_to_select=k)
    selector.fit(X, y)
    # Boolean mask over the columns -> names of the retained ones
    return X.columns[selector.get_support()]

In [11]:
# Sanity check: the helper should reproduce the manual RFE result above
rfe(X, y, k=2)

Index(['total_bill', 'price_per_person'], dtype='object')

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [12]:
# Load the swiss dataset and preview it.
# NOTE: this rebinds `df`, which held the tips data until now, so the tips
# cells above have to be re-run before `df` refers to tips again.
df = data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [13]:
# Fertility is the target; every remaining column is a predictor
y = df['Fertility']
X = df.drop('Fertility', axis='columns')

In [16]:
# Three best swiss predictors of Fertility according to SelectKBest
print(select_kbest(X, y, k=3))

Index(['Examination', 'Education', 'Catholic'], dtype='object')


In [15]:
# Three best swiss predictors of Fertility according to RFE
print(rfe(X, y, k=3))

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')