# Imports

In [1]:
import pandas   as pd
import numpy    as np

from matplotlib         import pyplot   as plt
from sklearn            import metrics  as mt
from sklearn.neighbors  import KNeighborsClassifier

# Load Dataset

In [2]:
# Training split: feature table and target table
x_train, y_train = map(pd.read_csv, ('../../dataset/class/X_training.csv',
                                     '../../dataset/class/y_training.csv'))

# Test split: feature table and target table
x_test, y_test = map(pd.read_csv, ('../../dataset/class/X_test.csv',
                                   '../../dataset/class/y_test.csv'))

# Validation split: feature table and target table
x_val, y_val = map(pd.read_csv, ('../../dataset/class/X_validation.csv',
                                 '../../dataset/class/y_validation.csv'))

In [3]:
# Preview the first five rows of the training feature table
x_train.head(n=5)

Unnamed: 0,id,customer_type,age,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,gender_Female,gender_Male,type_of_travel_business_travel,type_of_travel_personal_travel
0,13508,1,0.5,0.0,0.03958,0.6,0.6,0.6,0.6,1.0,...,0.5,1.0,0.6,0.4,0.0,0.013848,1.0,0.0,1.0,0.0
1,28874,1,0.24359,0.0,0.205775,0.6,0.4,0.4,0.4,0.6,...,0.5,0.5,0.2,0.6,0.0,0.0,0.0,1.0,1.0,0.0
2,21484,0,0.435897,1.0,0.026858,0.6,0.6,0.6,0.2,1.0,...,0.0,1.0,0.6,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,48280,1,0.589744,0.5,0.041397,0.6,1.0,0.6,0.6,0.8,...,0.0,1.0,0.4,0.4,0.029499,0.020772,1.0,0.0,0.0,1.0
4,472,0,0.423077,1.0,0.016559,0.2,0.2,0.2,0.8,0.6,...,1.0,0.75,0.8,0.6,0.021632,0.019782,0.0,1.0,1.0,0.0


In [4]:
# Feature selection: columns used as model inputs (the `id` column is left out)
features = [  'customer_type', 'age', 'class', 'flight_distance',
              'inflight_wifi_service', 'departure_arrival_time_convenient',
              'ease_of_online_booking', 'gate_location', 'food_and_drink',
              'online_boarding', 'seat_comfort', 'inflight_entertainment',
              'on_board_service', 'leg_room_service', 'baggage_handling',
              'checkin_service', 'inflight_service', 'cleanliness',
              'departure_delay_in_minutes', 'arrival_delay_in_minutes',
              'gender_Female', 'gender_Male', 'type_of_travel_business_travel',
              'type_of_travel_personal_travel']

# Data preparation: keep only the selected columns, in the same order for
# every split, and flatten each target table into a 1-D array.
# np.asarray accepts both the DataFrame returned by read_csv and an array that
# was already flattened, so re-running this cell does not fail on `.values`.
x_train = x_train.loc[:, features]
y_train = np.asarray(y_train).ravel()

x_val = x_val.loc[:, features]
y_val = np.asarray(y_val).ravel()

x_test = x_test.loc[:, features]
y_test = np.asarray(y_test).ravel()




# Model Training - KNN Classifier (Training Data)

In [5]:
# Candidate numbers of neighbours: 3, 5, 7, 9
k = np.arange(3, 11, 2)

# Training-set scores, one entry per candidate K in the same order as `k`
accuracy, precision, recall, f1 = [], [], [], []

for n_neighbors in k:
    # Fit on the training split, then predict that same split
    model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(x_train, y_train)
    y_pred = model.predict(x_train)

    # Record each metric in its matching list
    for history, score_fn in ((accuracy, mt.accuracy_score),
                              (precision, mt.precision_score),
                              (recall, mt.recall_score),
                              (f1, mt.f1_score)):
        history.append(score_fn(y_train, y_pred))

In [6]:
# Table of training-set metrics, one row per candidate K
Mt = pd.DataFrame(
    dict(zip(("K", "Accuracy", "Precision", "Recall", "F1-Score"),
             (k, accuracy, precision, recall, f1)))
)
Mt

Unnamed: 0,K,Accuracy,Precision,Recall,F1-Score
0,3,0.957016,0.973191,0.92634,0.949187
1,5,0.947583,0.970087,0.907026,0.937497
2,7,0.943019,0.968585,0.897639,0.931763
3,9,0.939557,0.967858,0.890098,0.927351


# Model Training - KNN Classifier (Validation Data)

In [7]:
# Choose the best K on the validation split instead of the training split.
# Scoring KNN on the rows it was fitted on favours small K, because every
# training point is its own nearest neighbour, so selecting on training
# accuracy would always pick the smallest candidate regardless of how well it
# generalises.
val_accuracy_by_k = [
    mt.accuracy_score(
        y_val,
        KNeighborsClassifier(n_neighbors=n).fit(x_train, y_train).predict(x_val),
    )
    for n in k
]

# Position of the winning candidate inside `k` (and inside the per-K training
# metric lists); on ties the first, i.e. smallest, K wins.
best_k = val_accuracy_by_k.index(max(val_accuracy_by_k))

In [8]:
# Evaluate the selected K on the validation split:
# fit on the training split, predict the validation rows
model = KNeighborsClassifier(n_neighbors=k[best_k]).fit(x_train, y_train)
yhat_val = model.predict(x_val)

# Validation scores
accuracy_val = mt.accuracy_score(y_val, yhat_val)
precision_val = mt.precision_score(y_val, yhat_val)
recall_val = mt.recall_score(y_val, yhat_val)
f1_val = mt.f1_score(y_val, yhat_val)

# Report them in the same order as the training table
print(f'Accuracy: {accuracy_val}')
print(f'Precision: {precision_val}')
print(f'Recall: {recall_val}')
print(f'F1-Score: {f1_val}')


Accuracy: 0.9235174876926542
Precision: 0.94254707947654
Recall: 0.8769767614522236
F1-Score: 0.9085804392138764


# Model Training - KNN Classifier (Test Data)

In [9]:
# Final model: selected K, refitted on training + validation data and scored
# on the held-out test split
model = KNeighborsClassifier(n_neighbors = k[best_k])

# Fit. pd.concat keeps the stacked features as a DataFrame with their column
# names; np.concatenate would return a bare array, so the model would be
# fitted without feature names and then asked to predict on a DataFrame that
# has them (recent scikit-learn versions warn about that mismatch).
model.fit(pd.concat([x_train, x_val], ignore_index=True),
          np.concatenate((y_train, y_val)))
# Predict
ypred_test = model.predict(x_test)

# Performance on the test split
accuracy_test = mt.accuracy_score(y_test,ypred_test)
print(f'Accuracy: {accuracy_test}')

precision_test = mt.precision_score(y_test,ypred_test)
print(f'Precision: {precision_test}')

recall_test = mt.recall_score(y_test,ypred_test)
print(f'Recall: {recall_test}')

f1_test = mt.f1_score(y_test,ypred_test)
print(f'F1-Score: {f1_test}')



Accuracy: 0.9277024678484532
Precision: 0.9447203223086292
Recall: 0.8871975362956446
F1-Score: 0.9150558126871767


# Save Results

In [10]:
# One-row summaries (rounded to 3 decimals) for each evaluation stage.
# The training row reads the per-K lists at the position of the selected K.
train_metrics = dict(
    Algorithm="KNN Classifier",
    Accuracy=np.round(accuracy[best_k], 3),
    Precision=np.round(precision[best_k], 3),
    Recall=np.round(recall[best_k], 3),
    F1=np.round(f1[best_k], 3),
)
validation_metrics = dict(
    Algorithm="KNN Classifier",
    Accuracy=np.round(accuracy_val, 3),
    Precision=np.round(precision_val, 3),
    Recall=np.round(recall_val, 3),
    F1=np.round(f1_val, 3),
)
test_metrics = dict(
    Algorithm="KNN Classifier",
    Accuracy=np.round(accuracy_test, 3),
    Precision=np.round(precision_test, 3),
    Recall=np.round(recall_test, 3),
    F1=np.round(f1_test, 3),
)

# Append each summary as a header-less row to its CSV file.
# NOTE(review): append mode means re-running this cell adds a duplicate row;
# presumably the files are shared with other model notebooks - confirm.
for stage_metrics, csv_path in (
    (train_metrics, "./clas_train_metrics.csv"),
    (validation_metrics, "./clas_validation_metrics.csv"),
    (test_metrics, "./clas_test_metrics.csv"),
):
    pd.DataFrame(stage_metrics, index=[0]).to_csv(
        csv_path, mode="a", header=False
    )