In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Ask the inline matplotlib backend for high-DPI ("retina") figure output.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Seed passed as `random_state` to the estimators below so that runs are repeatable.
RANDOM_SEED = 666

In [None]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## Airline Passenger Satisfaction

In [None]:
# Load the airline training split; the first CSV column is used as the index.
data = pd.read_csv(
    "../data/airline_satisfaction/train.csv",
    index_col=0,
)
# Preview the first five rows.
data.head(n=5)

In [None]:
# Train a simple model using only numerical features
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [None]:
# Numeric columns that must not be used as model inputs
# ("id" is presumably a row identifier -- confirm against the dataset).
ignore_features = ["id"]
# Keep every numeric column except the ignored ones, preserving column order.
# `select_dtypes` is used instead of `np.issubdtype(series.dtype, np.number)`
# because the latter raises TypeError when a column has a pandas extension
# dtype (e.g. nullable integers or the dedicated string dtype).
features = [
    column
    for column in data.select_dtypes(include="number").columns
    if column not in ignore_features
]
# Name of the label column to predict.
target = "satisfaction"

In [None]:
# Feature matrix restricted to the selected numeric columns.
X = data[features]
# Encode the target labels as integers (0s and 1s for this binary target).
label_encoder = LabelEncoder().fit(data[target])
y = label_encoder.transform(data[target])

In [None]:
# Sanity check: (number of rows, number of selected feature columns).
X.shape

In [None]:
# Inspect the integer-encoded target array.
y

In [None]:
# Fill missing feature values using SimpleImputer's default settings
# (column mean, per the scikit-learn documentation).
imputer = SimpleImputer().fit(X)
X_trans = imputer.transform(X)

In [None]:
# Baseline: logistic regression on the imputed (but unscaled) features.
model = LogisticRegression(random_state=RANDOM_SEED).fit(X_trans, y)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
imputer = SimpleImputer()

# Standardize first, then let the imputer fill whatever missing values remain.
X_trans = scaler.fit_transform(X)
X_trans = imputer.fit_transform(X_trans)

In [None]:
# Refit the same logistic regression, this time on the scaled + imputed features.
model = LogisticRegression(random_state=RANDOM_SEED).fit(X_trans, y)

In [None]:
# `predict_proba` returns class probabilities instead of hard labels.
# Note: these are predictions on the same data the model was fit on.
y_pred_proba = model.predict_proba(X_trans)

In [None]:
# The output of predict_proba has shape (N, C):
#   N -> number of samples
#   C -> number of classes
print(y_pred_proba.shape)

In [None]:
# Inspect the per-class probabilities (one row per sample).
y_pred_proba

In [None]:
# `predict` returns hard class labels rather than probabilities.
# For this binary model that corresponds to a default decision threshold of 0.5.

In [None]:
# Hard class predictions for the same samples the model was trained on.
y_pred = model.predict(X_trans)

In [None]:
# Inspect the predicted (integer-encoded) class labels.
y_pred

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Confusion matrix of true vs. predicted labels (training-set predictions).
# Despite its name, `ax` receives the display object returned by scikit-learn.
ax = ConfusionMatrixDisplay.from_predictions(y_true=y, y_pred=y_pred)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 summary in a single text table.
print(classification_report(y_true=y, y_pred=y_pred))

In [None]:
# Individual metrics available too
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Print each metric on its own line: name left-aligned in a 20-character
# column, value rounded to four decimals.
for func in (precision_score, recall_score, f1_score):
    print(
        f"{func.__name__+':':20s} "
        f"{func(y_true=y, y_pred=y_pred):.4f}"
    )

#### The threshold is a choice!

In [None]:
# Custom decision threshold applied to the positive-class probability
# (lower than the 0.5 default that `predict` uses).
threshold = 0.4

In [None]:
# Re-evaluate with the custom threshold: a sample is predicted positive when
# its class-1 probability (column 1) is at least `threshold`.
print(
    classification_report(
        y_true=y,
        y_pred=y_pred_proba[:, 1] >= threshold,
    )
)

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

In [None]:
# Precision-recall curve built from the positive-class (column 1) probabilities.
ax = PrecisionRecallDisplay.from_predictions(y, y_pred_proba[:, 1])

In [None]:
# ROC curve built from the positive-class (column 1) probabilities.
ax = RocCurveDisplay.from_predictions(y, y_pred_proba[:, 1])

## Global Speech Accent Recognition

In [None]:
# Load the accent-recognition training split.
# Note: this rebinds `data`, replacing the airline DataFrame loaded earlier.
data = pd.read_csv(
    "../data/accent_recognition/train.csv",
)
# Preview the first five rows.
data.head(n=5)

In [None]:
# Feature columns are named X1 ... X12.
features = [f"X{index}" for index in range(1, 13)]
# Name of the label column to predict.
target = "language"

In [None]:
# Feature matrix for the accent data.
X = data[features]
# Fresh encoder: maps each language label to an integer class index.
label_encoder = LabelEncoder().fit(data[target])
y = label_encoder.transform(data[target])

In [None]:
# Inspect the integer-encoded language labels.
y

In [None]:
# Standardize the features before fitting the linear model.
scaler = StandardScaler().fit(X)
X_trans = scaler.transform(X)

In [None]:
# Logistic regression on the standardized accent features.
model = LogisticRegression(
    random_state=RANDOM_SEED,
)
# Left as the last expression so the fitted estimator is displayed.
model.fit(X=X_trans, y=y)

In [None]:
# Per-class probabilities, computed on the same data the model was fit on.
y_pred_proba = model.predict_proba(X_trans)

In [None]:
# (N samples, C classes): one probability column per class.
y_pred_proba.shape

In [None]:
# Inspect the per-class probabilities (one row per sample).
y_pred_proba

In [None]:
# Hard class predictions for the same samples the model was trained on.
y_pred = model.predict(X_trans)

In [None]:
# Inspect the predicted (integer-encoded) class labels.
y_pred

In [None]:
# Confusion matrix across the encoded language classes (training-set predictions).
ConfusionMatrixDisplay.from_predictions(y_true=y, y_pred=y_pred)

In [None]:
# Look up which original language label the encoder mapped to class index 5.
label_encoder.inverse_transform([5])

In [None]:
# Number of samples per language, to check how balanced the classes are.
data[target].value_counts()

In [None]:
# Same model, but with class weights set to "balanced" to compensate for
# uneven class frequencies.
model = LogisticRegression(
    random_state=RANDOM_SEED,
    class_weight="balanced",
)
# Left as the last expression so the fitted estimator is displayed.
model.fit(X=X_trans, y=y)

In [None]:
# Predictions of the class-weighted model on its own training data.
y_pred = model.predict(X_trans)

In [None]:
# Confusion matrix for the class-weighted logistic regression.
ax = ConfusionMatrixDisplay.from_predictions(y_true=y, y_pred=y_pred)

In [None]:
# Per-class report, labelled with the original language names instead of
# the encoded integers.
print(
    classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=label_encoder.classes_,
    )
)

In [None]:
# Multiclass metrics need an averaging strategy; print both micro and macro
# averages for each metric, with the label left-aligned in a 30-character column.
for func in (precision_score, recall_score, f1_score):
    for avg in ("micro", "macro"):
        print(
            f"{func.__name__+f' ({avg}):':30s} "
            f"{func(y_true=y, y_pred=y_pred, average=avg):.4f}"
        )

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Swap in a class-weighted random forest on the same standardized features.
model = RandomForestClassifier(
    random_state=RANDOM_SEED,
    class_weight="balanced",
)
# Left as the last expression so the fitted estimator is displayed.
model.fit(X=X_trans, y=y)

In [None]:
# Random-forest predictions on the same data it was trained on, so the
# resulting metrics describe training fit, not generalization.
y_pred = model.predict(X_trans)

In [None]:
# Confusion matrix for the random forest (training-set predictions).
ax = ConfusionMatrixDisplay.from_predictions(y_true=y, y_pred=y_pred)