In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import sklearn.preprocessing

import env

import warnings
warnings.filterwarnings("ignore")

In [2]:
def create_dummies(df, object_cols):
    '''
    One-hot encode the listed columns and append the result to the dataframe.

    df: dataframe that holds the columns to encode.
    object_cols: list of column names in df to turn into dummy variables.

    Returns a new dataframe made of every original column (the un-encoded
    object columns included) followed by the dummy columns. The dataframe
    that was passed in is left unchanged.
    '''
    # drop_first=True leaves out the dummy for the first unique value of each
    # column; dummy_na=False means no extra indicator column for missing values.
    encoded = pd.get_dummies(df[object_cols], dummy_na=False, drop_first=True)

    # axis=1 places the dummy columns to the right of the existing ones.
    return pd.concat([df, encoded], axis=1)

# 1. Load the tips dataset.

In [3]:
from pydataset import data

tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [5]:
object_cols =['sex', 'smoker', 'day', 'time']

In [6]:
tips = create_dummies(tips, object_cols)

In [7]:
tips = tips.drop(columns = object_cols)

tips.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,2,0,0,0,1,0,0
2,10.34,1.66,3,1,0,0,1,0,0
3,21.01,3.5,3,1,0,0,1,0,0
4,23.68,3.31,2,1,0,0,1,0,0
5,24.59,3.61,4,0,0,0,1,0,0


## A. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [8]:
tips['tip_percentage'] = tips['tip'] / tips['total_bill']

tips.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch,tip_percentage
1,16.99,1.01,2,0,0,0,1,0,0,0.059447
2,10.34,1.66,3,1,0,0,1,0,0,0.160542
3,21.01,3.5,3,1,0,0,1,0,0,0.166587
4,23.68,3.31,2,1,0,0,1,0,0,0.13978
5,24.59,3.61,4,0,0,0,1,0,0,0.146808


## B. Create a column named price_per_person. This should be the total bill divided by the party size.

In [9]:
tips['price_per_person'] = tips['total_bill'] / tips['size']

tips.head()

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch,tip_percentage,price_per_person
1,16.99,1.01,2,0,0,0,1,0,0,0.059447,8.495
2,10.34,1.66,3,1,0,0,1,0,0,0.160542,3.446667
3,21.01,3.5,3,1,0,0,1,0,0,0.166587,7.003333
4,23.68,3.31,2,1,0,0,1,0,0,0.13978,11.84
5,24.59,3.61,4,0,0,0,1,0,0,0.146808,6.1475


## C. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [10]:
## I predict that the total bill and the party size may be the most important features

## D. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [11]:
from wrangle import split

In [12]:
train, validate, test = split(tips)

In [13]:
train.shape

(136, 11)

In [14]:
validate.shape

(59, 11)

In [15]:
test.shape

(49, 11)

In [16]:

# Separate the predictors (X) from the target (tip) for each split.
# NOTE(review): tip_percentage was computed as tip / total_bill, so it remains
# in X as a feature derived from the target — confirm this is intended.
X_train, y_train = train.drop(columns=['tip']), train['tip']
X_validate, y_validate = validate.drop(columns=['tip']), validate['tip']
X_test, y_test = test.drop(columns=['tip']), test['tip']

#### SelectKBest

In [17]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, keep the top 2 features;
# fit on the training predictors and the tip target
f_selector = SelectKBest(f_regression, k=2)
f_selector.fit(X_train, y_train)

# boolean mask over the columns of X_train: True where the column was selected
feature_mask = f_selector.get_support()

# names of the selected columns
f_feature = X_train.columns[feature_mask].tolist()

In [18]:
f_feature

['total_bill', 'price_per_person']

In [19]:
## SelectKBest chose total bill and price per person as the two most important features

#### Recursive Feature Elimination

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of
# features I want to end up with. The count is passed by keyword:
# n_features_to_select is keyword-only in current scikit-learn releases, so
# the positional form RFE(lm, 2) raises a TypeError there.
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train, y_train)

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names.
rfe_feature = X_train.iloc[:, feature_mask].columns.tolist()

In [22]:
rfe_feature

['day_Sun', 'tip_percentage']

In [23]:
## RFE chose sunday and tip percentage as the best variables to predict tip

## E. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [24]:
# Separate the predictors (X) from the target (tip_percentage) for each split.
# NOTE(review): tip_percentage was computed as tip / total_bill, and both tip
# and total_bill remain in X — confirm that keeping tip as a predictor is intended.
X_train2, y_train2 = train.drop(columns=['tip_percentage']), train['tip_percentage']
X_validate2, y_validate2 = validate.drop(columns=['tip_percentage']), validate['tip_percentage']
X_test2, y_test2 = test.drop(columns=['tip_percentage']), test['tip_percentage']

#### SelectKBest

In [25]:
from sklearn.feature_selection import SelectKBest, f_regression

# parameters: f_regression stats test, keep the top 2 features;
# fit on the training predictors and the tip_percentage target
f_selector = SelectKBest(f_regression, k=2)
f_selector.fit(X_train2, y_train2)

# boolean mask over the columns of X_train2: True where the column was selected
feature_mask = f_selector.get_support()

# names of the selected columns
f_feature = X_train2.columns[feature_mask].tolist()

In [26]:
f_feature

['total_bill', 'tip']

In [None]:
## SelectKBest chose total bill and tip as the best variables for predicting tip percent

#### RFE

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of
# features I want to end up with. The count is passed by keyword:
# n_features_to_select is keyword-only in current scikit-learn releases, so
# the positional form RFE(lm, 2) raises a TypeError there.
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train2, y_train2)

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names.
rfe_feature = X_train2.iloc[:, feature_mask].columns.tolist()

In [28]:
rfe_feature

['tip', 'smoker_Yes']

In [29]:
## RFE chose tip and smoker as the best variables for predicting tip percent

## F. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

- Select K Best scores each variable on its own against the target variable with a statistical test (here f_regression) and keeps the k variables with the strongest individual relationship to the target; it does not look at how the variables interact with each other

- RFE fits a model on all of the variables, removes the least valuable one, and repeats one variable at a time until only the desired number of variables remains

# 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [30]:
def select_kbest(X, y, amount):
    '''
    Return the names of the top features chosen by SelectKBest.

    X: dataframe of predictors.
    y: target values.
    amount: number of features to keep (passed to SelectKBest as k).

    Returns a list with the names of the selected columns of X.
    '''
    # parameters: f_regression stats test, keep the top `amount` features
    selector = SelectKBest(f_regression, k=amount).fit(X, y)

    # get_support() is a boolean mask over the columns of X: True where selected
    return X.columns[selector.get_support()].tolist()

In [31]:
select_kbest(X_train, y_train, 2)

['total_bill', 'price_per_person']

# 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [32]:
def rfe(X, y, amount):
    '''
    Return the names of the top features chosen by recursive feature
    elimination (RFE) around a LinearRegression model.

    X: dataframe of predictors.
    y: target values.
    amount: number of features to end up with.

    Returns a list with the names of the selected columns of X.
    '''
    # initialize the ML algorithm
    lm = LinearRegression()

    # The feature count is passed by keyword: n_features_to_select is
    # keyword-only in current scikit-learn releases, so RFE(lm, amount)
    # raises a TypeError there. The selector is not called `rfe` so it
    # does not shadow this function's own name.
    selector = RFE(lm, n_features_to_select=amount)

    # fit the data using RFE
    selector.fit(X, y)

    # support_ is a boolean mask over the columns of X: True where selected
    feature_mask = selector.support_

    # get list of the column names.
    return X.columns[feature_mask].tolist()

In [33]:
rfe(X_train, y_train, 2)

['day_Sun', 'tip_percentage']

# 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [34]:
from pydataset import data

swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [38]:
list(swiss.columns)

['Fertility',
 'Agriculture',
 'Examination',
 'Education',
 'Catholic',
 'Infant.Mortality']

In [35]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [36]:
swiss.shape

(47, 6)

In [39]:
# Fertility is the target; every other swiss column is a predictor.
# NOTE: this rebinds X_train / y_train, which the tips cells above also use —
# re-running those cells after this one would feed them the swiss data.
y_train = swiss['Fertility']
X_train = swiss.drop(columns=['Fertility'])

In [40]:
X_train.shape

(47, 5)

In [41]:
y_train.shape

(47,)

In [42]:
select_kbest(X_train, y_train, 3)

['Examination', 'Education', 'Catholic']

In [43]:
rfe(X_train, y_train, 3)

['Examination', 'Education', 'Infant.Mortality']