In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

from pydataset import data

import split_scale
import evaluate
import explore
import wrangle


import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

### 1.) Load the tips dataset.

In [2]:
# Load the 'tips' dataset through pydataset's data() loader and preview the
# first rows.
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


- a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
# Tip as a fraction of the bill (0.15 == 15%). Bracket access is used for the
# inputs so column lookups read the same way throughout the notebook.
tips['tip_percentage'] = tips['tip'] / tips['total_bill']

In [4]:
# Confirm the tip_percentage column was added.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


- b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [5]:
# Bill amount per diner. 'size' must be read with brackets: the attribute
# tips.size is DataFrame.size (the total element count), not the column.
tips['price_per_person'] = tips['total_bill'] / tips['size']

- c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

*tip amount: total_bill, size, time, day*

*tip percentage: total_bill, size, day*

- d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [6]:
# Check that both engineered columns are present before the non-numeric
# columns are removed.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


In [7]:
# Keep only the numeric columns for feature selection.
# errors='ignore' makes this cell safe to re-run: because `tips` is rebound to
# the result, a second execution would otherwise raise KeyError (the columns
# are already gone).
tips = tips.drop(columns=['sex', 'smoker', 'day', 'time'], errors='ignore')

In [8]:
# Split the numeric frame into train and test sets with the project's
# split_scale helper.
# NOTE(review): the split ratio and any random seed live inside
# split_my_data, which is not visible here — confirm the split is seeded so
# the selected features are reproducible.
train, test = split_scale.split_my_data(tips)

In [9]:
# Scale both frames with the project's standard_scaler helper, which returns a
# scaler object (presumably the fitted one — confirm) plus the scaled frames.
# NOTE(review): `train`/`test` are rebound to the scaled frames, so the
# unscaled split is no longer available and re-running this cell scales
# already-scaled data. The displayed frame below shows that every column is
# scaled, including the targets `tip` and `tip_percentage`.
scaler, train, test = split_scale.standard_scaler(train, test)

In [10]:
# Display the scaled training frame (pandas truncates the middle rows).
train

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
43,-0.679468,0.016730,-0.613572,0.973362,-0.323915
190,0.343548,0.692199,0.383482,0.213819,-0.045955
204,-0.404727,-0.385677,-0.613572,-0.125726,0.144429
64,-0.193647,0.519739,1.380537,0.745011,-1.236806
12,1.701613,1.410783,1.380537,-0.300000,0.378601
...,...,...,...,...,...
42,-0.286344,-0.356933,-0.613572,-0.239836,0.346236
4,0.408324,0.196376,-0.613572,-0.333156,1.530423
117,1.106343,1.461084,1.380537,0.152125,-0.128772
52,-1.087110,-0.313818,-0.613572,1.516740,-1.018816


In [11]:
# Target: tip amount. Predictors: every remaining numeric column.
X_train, y_train = train.drop(columns='tip'), train['tip']
X_test, y_test = test.drop(columns='tip'), test['tip']

### RFE feature selection

In [12]:
# Linear model whose coefficients RFE uses to rank the features.
lm = LinearRegression()
# Pass n_features_to_select by keyword: the positional form is deprecated and
# rejected by newer scikit-learn releases, where the argument is keyword-only.
rfe = RFE(lm, n_features_to_select=2)

In [13]:
# Fit RFE on the training data; X_rfe holds only the columns it kept.
X_rfe = rfe.fit_transform(X_train, y_train)

In [14]:
# Fit the linear model on the RFE-reduced features; as the cell's last
# expression, the fitted estimator is echoed as output.
lm.fit(X_rfe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
# support_ is a boolean mask over X_train's columns (True == kept by RFE);
# indexing the column Index with it yields the selected feature names.
mask = rfe.support_
rfe_features = X_train.columns[mask].tolist()

In [16]:
# Report how many features RFE kept, then list them.
print(len(rfe_features), 'selected features')
print(rfe_features)

2 selected features
['total_bill', 'tip_percentage']


### KBest feature selection

In [17]:
# Score each predictor against the target with a univariate F-test and keep
# the two best. fit_transform fits the selector and returns the reduced matrix
# in one step; f_selector stays fitted for the cells that follow.
f_selector = SelectKBest(score_func=f_regression, k=2)
X_reduced = f_selector.fit_transform(X_train, y_train)

In [18]:
# Boolean mask over X_train's columns: True marks a feature SelectKBest kept.
f_support = f_selector.get_support(indices=False)
print(f_support)

[ True  True False False]


In [19]:
# Translate the support mask into column names and report them.
f_feature = X_train.columns[f_support].tolist()
print(len(f_feature), 'selected features')
print(f_feature)

2 selected features
['total_bill', 'size']


- e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

### kbest

In [20]:
# Target: tip percentage. This rebinds X_train/y_train/X_test/y_test, so any
# selector fitted before this point describes the previous target and columns.
X_train, y_train = train.drop(columns='tip_percentage'), train['tip_percentage']
X_test, y_test = test.drop(columns='tip_percentage'), test['tip_percentage']

In [21]:
# Fit a fresh selector for the tip_percentage target and bind it to
# f_selector. Previously the new SelectKBest was fitted anonymously and
# get_support() was read from the old f_selector (fitted on the `tip` target
# with a different column set), so a stale mask was applied to this X_train.
f_selector = SelectKBest(f_regression, k=2)
X_reduced = f_selector.fit_transform(X_train, y_train)

# Boolean mask over the current X_train columns -> selected feature names.
f_support = f_selector.get_support()
f_feature = X_train.loc[:, f_support].columns.tolist()

print(str(len(f_feature)), 'selected features')
print(f_feature)


2 selected features
['total_bill', 'tip']


### rfe

In [22]:
# Recursive feature elimination for the tip_percentage target.
lm = LinearRegression()
# n_features_to_select is passed by keyword: the positional form is deprecated
# and rejected by newer scikit-learn releases (keyword-only argument).
rfe = RFE(lm, n_features_to_select=2)
X_rfe = rfe.fit_transform(X_train, y_train)
# Fit the model on the reduced matrix; the fitted estimator is echoed as the
# cell output.
lm.fit(X_rfe, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
# support_ is a boolean mask over X_train's columns (True == kept by RFE);
# turn it into feature names and report them.
mask = rfe.support_
rfe_features = X_train.columns[mask].tolist()
print(len(rfe_features), 'selected features')
print(rfe_features)

2 selected features
['total_bill', 'tip']


- f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

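I'm not too sure; maybe it has to do with how recursive feature elimination tests everything together: it repeatedly refits a model on all remaining features and drops the weakest, whereas select k best scores each feature on its own. The selected features definitely change as the number of features changes, but the two methods still produce different outcomes.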

### 2.) Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

### 3.) Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

### 4.) Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).