In [37]:
import pandas as pd
import numpy as np
from scipy import stats
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns
import math

import sklearn.preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression 
from sklearn.feature_selection import SelectKBest, f_regression, RFE

import warnings
warnings.filterwarnings('ignore')

from pydataset import data

#### 1.) Load the tips dataset.


In [38]:
df = data('tips')
df.head(2)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3


##### Create a column named price_per_person. This should be the total bill divided by the party size.

In [39]:
df = df.rename(columns={'size':'party_size'})
df['price_per_person'] = df.total_bill / df.party_size

df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,party_size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667


##### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

 - Tip Percentage

In [40]:
df['tip_percentage'] = df.tip / df.total_bill

###### Use select k best to select the top 2 features for predicting tip amount. What are they?

In [41]:
# Pay attention to numeric features
df = df[['total_bill', 'tip', 'party_size', 
         'price_per_person','tip_percentage']]
df.head(2)

Unnamed: 0,total_bill,tip,party_size,price_per_person,tip_percentage
1,16.99,1.01,2,8.495,0.059447
2,10.34,1.66,3,3.446667,0.160542


In [42]:
train_validate, test = train_test_split(df, 
                                        test_size=0.2, 
                                        random_state=123)

train, validate = train_test_split(train_validate, 
                                   test_size=0.3, 
                                   random_state=123)

train.shape, validate.shape, test.shape

((136, 5), (59, 5), (49, 5))

In [43]:
# Define the X and y's while dropping the target 
target = 'tip'
    
# split train into X (dataframe, drop target) & y (series, keep target only)
x_train = train.drop(columns=[target])
y_train = train[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
x_val = validate.drop(columns=[target])
y_val = validate[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
x_test = test.drop(columns=[target])
y_test = test[target]


In [44]:
# Scale the numerical data to make it easier to compare
   
#Make the scaler
scaler = sklearn.preprocessing.MinMaxScaler()

#Fit the scaler
scaler.fit(x_train)

# Use the scaler
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)
    

In [70]:
# use SelectKBest to select top two features

# make the thing
kbest = SelectKBest(f_regression, k=2)

# fit the thing
kbest.fit(x_train, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = kbest.get_support()

# use the thing to get list of top K features. 
kbest = x_train.iloc[:,feature_mask].columns.tolist()
print(f'kbest for predicting tip are: {kbest}')

kbest for predicting tip are: ['Examination', 'Catholic']


##### Use recursive feature elimination to select the top 2 features for tip amount. What are they?


In [54]:
# use RFE to select top two features

# Make the things
lm = LinearRegression() 
rfe = RFE(lm, n_features_to_select=2)

# Fit the thing
rfe.fit(x_train, y_train)

# use the thing
rfe = x_train.columns[rfe.support_].tolist()
print(f'rfe for predicting tip are: {rfe}')

rfe for predicting tip are: ['party_size', 'tip_percentage']


##### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

 - Because....

 - It does change. total_bill is included when increasing RFE size to 3

#### 2.) Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [73]:
def select_kbest(x, y, k):

    # Mae the model
    kbest = SelectKBest(f_regression, k=2)
    
    # Fit the model
    kbest.fit(x_train, y_train)
    
    # Boolean mask 
    mask = kbest.get_support()
    
    # use the thing to get list of top K features. 
    return x.columns[mask]


In [74]:
select_kbest(x_train_scaled, y_train, 2)

Index(['Examination', 'Catholic'], dtype='object')

#### 3.) Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [95]:
def rfe(x, y, k): 
    # Make the things
    lm = LinearRegression() 
    rfe = RFE(lm, n_features_to_select=2)
    
    # Fit the thing
    rfe.fit(x_train, y_train)
    # Boolean mask
    mask = rfe.get_support()
    return x.columns[mask]


In [96]:
rfe(x_train, y_train, 2)


Index(['Examination', 'Infant.Mortality'], dtype='object')

#### 4.) Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [106]:
df = data('swiss')


In [107]:
df.head()


Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [108]:
train_validate, test = train_test_split(df, 
                                        test_size=.2, 
                                        random_state=123)

train, validate = train_test_split(train_validate, 
                                   test_size=.3, 
                                   random_state=123)

train.shape, validate.shape, test.shape


((25, 6), (12, 6), (10, 6))

In [109]:
target = 'Fertility'
# split train into X (dataframe, drop target) & y (series, keep target only)
x_train = train.drop(columns=[target])
y_train = train[target]

# split validate into X (dataframe, drop target) & y (series, keep target only)
x_val = validate.drop(columns=[target])
y_val = validate[target]

# split test into X (dataframe, drop target) & y (series, keep target only)
x_test = test.drop(columns=[target])
y_test = test[target]


In [110]:
#Make the scaler
scaler = sklearn.preprocessing.MinMaxScaler()

#Fit the scaler
scaler.fit(x_train)

# Use the scaler
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)


In [111]:
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns)
x_val_scaled = pd.DataFrame(x_val_scaled, columns=x_train.columns)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_train.columns)


In [112]:
select_kbest(x_train, y_train, 3)


Index(['Examination', 'Catholic'], dtype='object')

In [113]:
rfe(x_train, y_train, 3)


Index(['Examination', 'Infant.Mortality'], dtype='object')

In [115]:
df[['Education']]

Unnamed: 0,Education
Courtelary,12
Delemont,9
Franches-Mnt,5
Moutier,7
Neuveville,15
Porrentruy,7
Broye,7
Glane,8
Gruyere,7
Sarine,13
