# Feature Engineering Exercises 
- https://ds.codeup.com/regression/feature-engineering/

In [1]:
import wrangle
import warnings
warnings.filterwarnings("ignore")


from pydataset import data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.feature_selection
import sklearn.preprocessing

In [2]:
def split(df, test_size=.2, validate_size=.3, random_state=123):
    '''
    Split a DataFrame into train, validate, and test DataFrames.

    The split happens in two passes: `test_size` of the rows are held out as
    test, then `validate_size` of the remaining rows are held out as validate.
    With the defaults that is roughly 56% train / 24% validate / 20% test.

    Parameters
    ----------
    df : DataFrame to split.
    test_size : passed to train_test_split for the test hold-out (default .2).
    validate_size : passed to train_test_split for the validate hold-out,
        applied to the rows left after removing test (default .3).
    random_state : seed used for both passes so the split is reproducible
        (default 123).

    Returns
    -------
    train, validate, test DataFrames, in that order.
    '''
    train_validate, test = train_test_split(df,
                                            test_size=test_size,
                                            random_state=random_state)
    train, validate = train_test_split(train_validate,
                                       test_size=validate_size,
                                       random_state=random_state)
    return train, validate, test

**1. Load the tips dataset**

In [3]:
# Load the tips dataset (`data` is already imported from pydataset in the imports cell)
tips = data('tips')

In [5]:
# Wrap the loaded dataset in a pandas DataFrame and preview the first rows
# NOTE(review): data('tips') presumably already returns a DataFrame, which would
# make this wrap redundant — confirm
df = pd.DataFrame(tips)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
# Rename the size column because .size is a built-in Pandas attribute
df = df.rename(columns={'size': 'number_of_people'})

In [7]:
type(df)

pandas.core.frame.DataFrame

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   number_of_people  244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


**1a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.** 

In [9]:
df.tip/df.total_bill*100

1       5.944673
2      16.054159
3      16.658734
4      13.978041
5      14.680765
         ...    
240    20.392697
241     7.358352
242     8.822232
243     9.820426
244    15.974441
Length: 244, dtype: float64

In [10]:
# Tip expressed as a percentage of the bill: tip / total_bill, scaled by 100
df['tip_percentage'] = df['tip'] / df['total_bill'] * 100

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   number_of_people  244 non-null    int64  
 7   tip_percentage    244 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 17.2+ KB


In [12]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
2,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
3,21.01,3.5,Male,No,Sun,Dinner,3,16.658734
4,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
5,24.59,3.61,Female,No,Sun,Dinner,4,14.680765


**1b. Create a column named price_per_person. This should be the total bill divided by the party size.**

In [14]:
df.total_bill/df.number_of_people.astype(float)

1       8.495000
2       3.446667
3       7.003333
4      11.840000
5       6.147500
         ...    
240     9.676667
241    13.590000
242    11.335000
243     8.910000
244     9.390000
Length: 244, dtype: float64

In [16]:
# Average spend per guest: total bill divided by party size.
# total_bill is float64 and number_of_people is int64, so the division already
# yields floats and no explicit cast is needed.
df['price_per_person'] = df['total_bill'] / df['number_of_people']

In [17]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,number_of_people,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,16.658734,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,13.978041,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,14.680765,6.1475


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   number_of_people  244 non-null    int64  
 7   tip_percentage    244 non-null    float64
 8   price_per_person  244 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 19.1+ KB


In [20]:
# Keep the numeric columns for this exercise, plus a True/False flag for dinner
# service; the remaining categorical columns (sex, smoker, day, time) are dropped.
df = df.assign(dinner_time=df["time"] == "Dinner")[
    ["total_bill", "tip", "number_of_people", "tip_percentage", "price_per_person", "dinner_time"]
]

In [21]:
# Split the data
train, validate, test = split(df)

In [22]:
# Separate the predictors (X) from the target (y) in each split
# NOTE(review): tip_percentage stays in X but is computed from tip (the target),
# so it leaks target information into the predictors — confirm this is intended
# for the exercise.
target = "tip"

X_train, y_train = train.drop(columns=[target]), train[target]
X_validate, y_validate = validate.drop(columns=[target]), validate[target]
X_test, y_test = test.drop(columns=[target]), test[target]

In [23]:
X_train.head()

Unnamed: 0,total_bill,number_of_people,tip_percentage,price_per_person,dinner_time
19,16.97,3,20.624632,5.656667,True
173,7.25,2,71.034483,3.625,True
119,12.43,2,14.481094,6.215,False
29,21.7,2,19.815668,10.85,True
238,32.83,2,3.563814,16.415,True


In [24]:
# Make the min-max scaler and fit it on the training split only
scaler = sklearn.preprocessing.MinMaxScaler().fit(X_train)

# Apply the train-fitted scaler to train, validate, and test
X_train_scaled, X_validate_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_validate, X_test)
)

In [25]:
X_train_scaled

array([[0.30711445, 0.4       , 0.25286274, 0.15034389, 1.        ],
       [0.09235528, 0.2       , 1.        , 0.03225806, 1.        ],
       [0.20680513, 0.2       , 0.1618078 , 0.1827957 , 0.        ],
       [0.41162174, 0.2       , 0.24087288, 0.45219413, 1.        ],
       [0.65753425, 0.2       , 0.        , 0.77564661, 1.        ],
       [0.78789218, 0.6       , 0.06198426, 0.38433595, 1.        ],
       [0.44410075, 0.6       , 0.36296815, 0.15823888, 1.        ],
       [0.3804684 , 0.2       , 0.18166098, 0.41121767, 1.        ],
       [0.31794079, 0.2       , 0.16279257, 0.32897414, 1.        ],
       [0.40720283, 0.6       , 0.18845606, 0.13397268, 1.        ],
       [0.1979673 , 0.2       , 0.13198349, 0.17117117, 1.        ],
       [0.19774635, 0.2       , 0.19009056, 0.17088056, 1.        ],
       [0.34710561, 0.2       , 0.18394107, 0.36733508, 1.        ],
       [0.32744145, 0.2       , 0.11287299, 0.3414705 , 1.        ],
       [0.31816173, 0.2       , 0.

**1c. Before using any of the methods discussed in the lesson, which features do you think would be most important for 
predicting the tip amount? The tip percentage?**

- Before running any selection method, these are the features we hypothesize (guess) are the most important for predicting the tip amount (and the tip percentage):
    - Features:
        - total_bill
        - time
        - size (renamed to number_of_people above)

**1d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?**

SelectKBest
- Uses an F Test to compare how well each feature predicts the target variable.

In [26]:
# Use numeric features to predict tip amount
# Use select-K-best (here) and RFE (next cell) to select the top k features
k = 2

# Make the selector; pass k rather than repeating the literal so the two cannot drift apart
kbest = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_regression, k=k)
# Fit it on the scaled training features and the target
kbest.fit(X_train_scaled, y_train)
# get_support() produces an array of booleans, so we can filter out the column names that matter the most
X_train.columns[kbest.get_support()]

Index(['total_bill', 'number_of_people'], dtype='object')

In [27]:
# Recursive feature elimination (RFE) with a linear regression, keeping 2 features

# Make the estimator, then make and fit the selector on the scaled training data
lm = sklearn.linear_model.LinearRegression()
rfe = sklearn.feature_selection.RFE(estimator=lm, n_features_to_select=2).fit(X_train_scaled, y_train)

# support_ is a boolean mask over the columns; use it to pull the selected names
rfe_columns = X_train.columns[rfe.support_].tolist()
rfe_columns

['total_bill', 'tip_percentage']

**2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.**

def select_kbest(X, y, k):
    """
    Return the names of the k features chosen by SelectKBest with f_regression.

    X is the DataFrame of predictors (its column names are what gets returned),
    y is the target, and k is the number of features to keep.
    """
    # Build the selector and fit it in one step (fit returns the selector itself)
    selector = sklearn.feature_selection.SelectKBest(
        score_func=sklearn.feature_selection.f_regression,
        k=k,
    ).fit(X, y)

    # get_support() is a boolean mask over the columns; keep the flagged names
    chosen = X.columns[selector.get_support()]
    return chosen.tolist()

In [32]:
select_kbest(X_train, y_train, 2)

['total_bill', 'number_of_people']

**3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.**

In [33]:
def select_rfe(X, y, k, return_rankings=False, model=None):
    """
    Select k features with recursive feature elimination (RFE).

    Parameters
    ----------
    X : DataFrame of predictors; its column names are what gets returned.
    y : target values.
    k : number of features to keep.
    return_rankings : when True, also return a Series holding the RFE ranking
        of every column in X.
    model : estimator handed to RFE. When omitted, a new LinearRegression is
        created for the call.

    Returns
    -------
    The list of selected column names, or (names, rankings) when
    return_rankings is True.
    """
    # Create the default estimator per call; a default written in the signature
    # would be a single instance built at definition time and shared by every call.
    if model is None:
        model = LinearRegression()

    rfe = sklearn.feature_selection.RFE(model, n_features_to_select=k)
    rfe.fit(X, y)

    # support_ is a boolean mask over the columns marking the kept features
    features = X.columns[rfe.support_].tolist()
    if not return_rankings:
        return features

    rankings = pd.Series(dict(zip(X.columns, rfe.ranking_)))
    return features, rankings

In [34]:
features_to_use, feature_rankings = select_rfe(X_train, y_train, 3, return_rankings=True)

In [35]:
feature_rankings

total_bill          1
number_of_people    1
tip_percentage      1
price_per_person    3
dinner_time         2
dtype: int64

**4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).**

In [37]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [38]:
# Split the swiss data into train / validate / test
train, validate, test = split(swiss)

# Predictors are every column except Fertility; Fertility is the target
X_train, y_train = train.drop(columns='Fertility'), train['Fertility']
X_validate, y_validate = validate.drop(columns='Fertility'), validate['Fertility']
X_test, y_test = test.drop(columns='Fertility'), test['Fertility']

In [39]:
# Scale the data
scaler = sklearn.preprocessing.MinMaxScaler()

# Fit the scaler on the training split only
scaler.fit(X_train)

# Use the scaler to transform train, validate, test
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

# Turn everything back into a dataframe. Pass the original row index as well as
# the column names: without index= each frame gets a fresh 0..n-1 index and no
# longer lines up with its y_* Series.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_validate_scaled = pd.DataFrame(X_validate_scaled, columns=X_train.columns, index=X_validate.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.columns, index=X_test.index)

In [40]:
select_kbest(X_train_scaled, y_train, 3)

['Examination', 'Catholic', 'Infant.Mortality']

In [41]:
features, rankings = select_rfe(X_train_scaled, y_train, 3, return_rankings=True)

In [42]:
features

['Agriculture', 'Examination', 'Infant.Mortality']

In [43]:
rankings

Agriculture         1
Examination         1
Education           3
Catholic            2
Infant.Mortality    1
dtype: int64