# ML Feature Engineering Automation

## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Load the Tips Dataset

In [2]:
df=sns.load_dataset("tips")

In [3]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


## Split the Data into dependent and Independent features

In [4]:
x=df.drop("time",axis=1)

In [5]:
y=df["time"]

## Split the Data into Training and Testing

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

## Divide features into categorical and numerical

In [8]:
cat_fea=["sex","smoker","day"]
num_fea=["total_bill","tip","size"]

## Import required Libraies for FE

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
cat_pipe=Pipeline(
    
    steps=[
        
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encoder",OneHotEncoder())
    ]
)

num_pipe=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",MinMaxScaler())
    ]
)

In [11]:
fe=ColumnTransformer([
    ("cat_pipe",cat_pipe,cat_fea),
    ("num_pipe",num_pipe,num_fea)
])

In [12]:
x_train=fe.fit_transform(x_train)

In [13]:
x_test=fe.fit_transform(x_test)

## Import Models

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [15]:
models=[LogisticRegression,DecisionTreeClassifier,SVC,RandomForestClassifier]

## Build and Test the Models

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
d={}
for i in models:
    model=i()
    model.fit(x_train,y_train)
    y_preds=model.predict(x_test)
    acc=accuracy_score(y_test,y_preds)
    i=str(i).split(".")[-1]
    i=i[:-2]
    d[i]=round(acc,2)

In [18]:
d

{'LogisticRegression': 0.98,
 'DecisionTreeClassifier': 0.97,
 'SVC': 0.98,
 'RandomForestClassifier': 0.97}

## Hyperparameter Tuning of RandomForestClassifier

In [19]:
grid={
    "n_estimators":[100,200,300,400,500],
    "criterion":("gini","entropy","log_loss"),
    "max_features":("sqrt","auto",None)
}

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
model=RandomForestClassifier()

In [22]:
clf=GridSearchCV(model,param_grid=grid,cv=5)

In [23]:
clf.fit(x_train,y_train)

In [24]:
clf.best_params_

{'criterion': 'gini', 'max_features': None, 'n_estimators': 300}

## Build a model with these Params

In [26]:
model=RandomForestClassifier(criterion="gini",n_estimators=400,max_features=None)

In [27]:
model.fit(x_train,y_train)

In [28]:
y_preds=model.predict(x_test)

In [29]:
accuracy_score(y_test,y_preds)

0.9672131147540983