## Carregando os dados

In [1]:
import pandas as pd

In [2]:
# Os dados já estão separados entre treino e teste
url1 = 'https://raw.githubusercontent.com/alura-cursos/combina-classificadores/main/dados/train.csv'
url2 = 'https://raw.githubusercontent.com/alura-cursos/combina-classificadores/main/dados/test.csv'

In [3]:
train = pd.read_csv(url1)
test = pd.read_csv(url2) 

In [4]:
train.head(5)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [6]:
train.isna().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction    

In [7]:
from sklearn.preprocessing import OneHotEncoder


def pre_processing(df: pd.DataFrame):
    df = df.drop(columns=["Unnamed: 0", "id"])
    df.dropna(inplace=True)
    df.reset_index(inplace=True)

    encoder = OneHotEncoder(drop="if_binary")
    categorical_columns = [
        "Gender", "Customer Type", "Type of Travel", "Class"]

    df_categorical = df[categorical_columns]
    df_encoded = pd.DataFrame(
        encoder.fit_transform(df_categorical).toarray(),
        columns=encoder.get_feature_names_out(categorical_columns)
    )
    df_transformed = pd.concat(
        [df.drop(columns=categorical_columns), df_encoded],
        axis=1
    )

    x = df_transformed.drop(columns=["satisfaction"])
    y = df_transformed["satisfaction"]
    return x, y

In [8]:
x_train, y_train = pre_processing(train)
x_test, y_test = pre_processing(test)

In [9]:
x_train.head(5)

Unnamed: 0,index,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,0,13,460,3,4,3,1,5,3,5,...,5,5,25,18.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1,25,235,3,2,3,3,1,3,1,...,4,1,1,6.0,1.0,1.0,0.0,1.0,0.0,0.0
2,2,26,1142,2,2,2,2,5,5,5,...,4,5,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,3,25,562,2,5,5,5,2,2,2,...,4,2,11,9.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,61,214,3,3,3,3,4,5,5,...,3,3,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Combinando modelos

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

pipelines = {
    "Tree": Pipeline([("Tree", DecisionTreeClassifier())]),
    "Logistic": Pipeline([("Scaler", StandardScaler()), ("Logistic", LogisticRegression())]),
    "GaussianNB": Pipeline([("Scaler", StandardScaler()), ("GaussianNB", GaussianNB())])
}

voting = VotingClassifier(
    estimators=[
        ("Tree", pipelines["Tree"]), 
        ("Logistic", pipelines["Logistic"]),
        ("GaussianNB", pipelines["GaussianNB"])
    ],
    voting="hard"
)
voting

In [11]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

stratified_kfold = StratifiedKFold(shuffle=True)
cv_results = cross_val_score(voting, x_train, y_train, cv=stratified_kfold)
cv_results.mean()

0.8930825933103147

In [12]:
voting = VotingClassifier(
    estimators=[
        ("Tree", pipelines["Tree"]), 
        ("Logistic", pipelines["Logistic"]),
        ("GaussianNB", pipelines["GaussianNB"])
    ],
    voting="soft"
)
stratified_kfold = StratifiedKFold(shuffle=True)
cv_results = cross_val_score(voting, x_train, y_train, cv=stratified_kfold)
cv_results.mean()

0.9048979728057164

## Busca dos melhores parâmetros para a votação

In [13]:
from sklearn.model_selection import GridSearchCV

grid_params = {
    "voting": ["hard", "soft"],
    "weights": [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2)]
}

grid_search = GridSearchCV(voting, grid_params, n_jobs=-1)
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'voting': 'soft', 'weights': (2, 1, 1)}

In [14]:
grid_search.best_score_

0.93827828134413

In [15]:
from sklearn.metrics import classification_report

y_pred = grid_search.predict(x_test)
print(classification_report(y_test, y_pred))

                         precision    recall  f1-score   support

neutral or dissatisfied       0.95      0.95      0.95     14528
              satisfied       0.93      0.94      0.94     11365

               accuracy                           0.94     25893
              macro avg       0.94      0.94      0.94     25893
           weighted avg       0.94      0.94      0.94     25893



## Implementação do Bagging

In [16]:
from sklearn.ensemble import BaggingClassifier

model = DecisionTreeClassifier(random_state=0)
bagging = BaggingClassifier(model, n_estimators=10, random_state=0)
bagging.fit(x_train, y_train)
y_pred = bagging.predict(x_test)
print(classification_report(y_test, y_pred))

                         precision    recall  f1-score   support

neutral or dissatisfied       0.95      0.98      0.96     14528
              satisfied       0.97      0.94      0.95     11365

               accuracy                           0.96     25893
              macro avg       0.96      0.96      0.96     25893
           weighted avg       0.96      0.96      0.96     25893



## Bagging com GridSearchCV

In [17]:
grid_params = {
    'n_estimators': [10, 20, 30],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9],
}

base_model = DecisionTreeClassifier(random_state=0)
bagging_model = BaggingClassifier(base_model, n_estimators=10, random_state=0)
bagging_grid = GridSearchCV(
    bagging_model,
    grid_params,
    cv=stratified_kfold,
    n_jobs=-1
)

bagging_grid.fit(x_train, y_train)

In [18]:
best_params = bagging_grid.best_params_
best_params

{'max_features': 0.9, 'max_samples': 0.9, 'n_estimators': 30}

In [19]:
bagging_model = BaggingClassifier(base_model, **best_params)
bagging_model.fit(x_train, y_train)
y_pred = bagging_model.predict(x_test)
print(classification_report(y_test, y_pred))

                         precision    recall  f1-score   support

neutral or dissatisfied       0.95      0.98      0.97     14528
              satisfied       0.97      0.94      0.96     11365

               accuracy                           0.96     25893
              macro avg       0.96      0.96      0.96     25893
           weighted avg       0.96      0.96      0.96     25893



## Extra Trees Classifier

In [21]:
from sklearn.ensemble import ExtraTreesClassifier

grid_params = {
    'n_estimators': [10, 20, 30],
    'max_features': [0.5, 0.7, 0.9],
}

extratrees_grid = GridSearchCV(
    ExtraTreesClassifier(),
    grid_params,
    cv=StratifiedKFold(shuffle=True),
    n_jobs=-1
)
extratrees_grid.fit(x_train, y_train)

In [22]:
best_params = extratrees_grid.best_params_

extratress_classifier = ExtraTreesClassifier(**best_params)
extratress_classifier.fit(x_train, y_train)
y_pred = extratress_classifier.predict(x_test)
print(classification_report(y_test, y_pred))

                         precision    recall  f1-score   support

neutral or dissatisfied       0.96      0.98      0.97     14528
              satisfied       0.97      0.95      0.96     11365

               accuracy                           0.96     25893
              macro avg       0.96      0.96      0.96     25893
           weighted avg       0.96      0.96      0.96     25893



In [23]:
extratrees_grid.best_score_

0.9633376373825684

## Implementando AdaBoostClassifier

In [25]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=50, learning_rate=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))



                         precision    recall  f1-score   support

neutral or dissatisfied       0.95      0.95      0.95     14528
              satisfied       0.94      0.94      0.94     11365

               accuracy                           0.95     25893
              macro avg       0.94      0.94      0.94     25893
           weighted avg       0.95      0.95      0.95     25893



## GridSearch com AdaBoostClassifier

In [26]:
grid_params = {
    'n_estimators': [50,100,200],
    'learning_rate': [0.1,0.01,0.001]
}

ada_boost_search = GridSearchCV(
    AdaBoostClassifier(),
    grid_params,
    cv=StratifiedKFold(shuffle=True),
    n_jobs=-1
)
ada_boost_search.fit(x_train, y_train)



In [27]:
y_pred = ada_boost_search.predict(x_test)
print(classification_report(y_test, y_pred))

                         precision    recall  f1-score   support

neutral or dissatisfied       0.92      0.94      0.93     14528
              satisfied       0.93      0.90      0.91     11365

               accuracy                           0.92     25893
              macro avg       0.92      0.92      0.92     25893
           weighted avg       0.92      0.92      0.92     25893



In [28]:
ada_boost_search.best_score_

0.9255651835110648