# Airline passenger satisfaction prediction

## Import libraries

In [75]:
import os
import zipfile

import gdown
import mlflow
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm


## Load .env variables

In [76]:
# Read key/value pairs from the local .env file into the process environment.
# NOTE(review): per the python-dotenv docs the call returns False when no
# variable was set (for example when the file is not found) — confirm that
# .env sits next to this notebook if the displayed value is False.
load_dotenv('.env')

False

## Connect to the local MLflow server

In [77]:
# Name of the MLflow experiment that this notebook looks up (and creates if missing).
experiment_name = 'Airline Satisfaction Prediction'

In [78]:
# Point the MLflow client at the tracking server listening on localhost, port 80.
mlflow.set_tracking_uri("http://localhost:80")

In [79]:
# End whatever run is currently active in this kernel.
# NOTE(review): presumably here so that re-executing the notebook does not
# collide with a run left open by a previous execution — confirm.
mlflow.end_run()

In [80]:
# Look the experiment up by name; the next cell handles the case where the
# lookup yields None (no experiment with that name).
experiment = mlflow.get_experiment_by_name(experiment_name)

In [81]:
# Create the experiment on first use; otherwise reuse the id of the existing one.
experiment_id = (
    mlflow.create_experiment(experiment_name)
    if experiment is None
    else experiment.experiment_id
)

In [82]:
# Open a run named "MLP Airline" inside the selected experiment and keep its handle in `run`.
run = mlflow.start_run(experiment_id=experiment_id, run_name="MLP Airline")

## Download dataset

In [83]:
# Google Drive file id of the zipped dataset and the local path the archive is
# saved to.
# NOTE(review): `id` shadows the Python built-in of the same name and
# `dataset_dir` actually holds a file path; both names are kept unchanged so
# that any later cell referring to them keeps working.
id = "1ICvNXbJQROVfH_tiGV4m7hxNuyjpU9Pb"
dataset_dir = "./dataset/dataset-airline.zip"

# Create the target folder from Python rather than through a `!mkdir -p` shell
# escape, so the cell does not depend on a POSIX shell being available.
os.makedirs(os.path.dirname(dataset_dir), exist_ok=True)

gdown.download(id=id, output=dataset_dir, quiet=False)

# The context manager closes the archive even when extraction raises.
with zipfile.ZipFile(dataset_dir, 'r') as zip_ref:
    zip_ref.extractall("./dataset/")

Downloading...
From: https://drive.google.com/uc?id=1ICvNXbJQROVfH_tiGV4m7hxNuyjpU9Pb
To: /Users/joserodrigues/Documents/ML-DL-projects/Mlflow/first-project/src/dataset/dataset-airline.zip
100%|██████████| 2.84M/2.84M [00:00<00:00, 28.7MB/s]


## Dataset exploration

In [84]:
# Paths of the extracted train / test CSV files.
train_dir, test_dir = "./dataset/train.csv", "./dataset/test.csv"

# Read both splits into DataFrames, train first and test second.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_dir, test_dir))

In [85]:
# Tag every row with the split it came from so the two sets can still be told
# apart once they are stacked into a single frame.
df_train['set'], df_test['set'] = 'train', 'test'

# Stack train on top of test and renumber the rows from 0.
df_combined = pd.concat(
    [df_train, df_test],
    ignore_index=True,
)

In [86]:
# Upload the training CSV as an artifact of the active MLflow run.
# NOTE(review): the recorded output of this cell is an S3UploadFailedError
# (NoSuchBucket) — the bucket backing the artifact store has to exist before
# this cell can succeed.
mlflow.log_artifact(train_dir)

S3UploadFailedError: Failed to upload ./dataset/train.csv to myminio/mlflow/1/8733805123f340268c1e835981fc27dd/artifacts/train.csv: An error occurred (NoSuchBucket) when calling the CreateMultipartUpload operation: The specified bucket does not exist

### Get dataset information

In [None]:
# Print the column names, non-null counts and dtypes of the combined frame.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 26 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         129880 non-null  int64  
 1   id                                 129880 non-null  int64  
 2   Gender                             129880 non-null  object 
 3   Customer Type                      129880 non-null  object 
 4   Age                                129880 non-null  int64  
 5   Type of Travel                     129880 non-null  object 
 6   Class                              129880 non-null  object 
 7   Flight Distance                    129880 non-null  int64  
 8   Inflight wifi service              129880 non-null  int64  
 9   Departure/Arrival time convenient  129880 non-null  int64  
 10  Ease of Online booking             129880 non-null  int64  
 11  Gate location                      1298

In [None]:
# Preview the first rows of the combined frame.
df_combined.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,set
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,4,3,4,4,5,5,25,18.0,neutral or dissatisfied,train
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,5,3,1,4,1,1,6.0,neutral or dissatisfied,train
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,4,3,4,4,4,5,0,0.0,satisfied,train
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,5,3,1,4,2,11,9.0,neutral or dissatisfied,train
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,4,4,3,3,3,0,0.0,satisfied,train


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_combined.describe()

Unnamed: 0.1,Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
count,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129487.0
mean,44158.7,64940.5,39.427957,1190.316392,2.728696,3.057599,2.756876,2.976925,3.204774,3.252633,3.441361,3.358077,3.383023,3.350878,3.632114,3.306267,3.642193,3.286326,14.713713,15.091129
std,31207.377062,37493.270818,15.11936,997.452477,1.32934,1.526741,1.40174,1.27852,1.329933,1.350719,1.319289,1.334049,1.287099,1.316252,1.180025,1.266185,1.176669,1.313682,38.071126,38.46565
min,0.0,1.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,16234.75,32470.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,38963.5,64940.5,40.0,844.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,71433.25,97410.25,51.0,1744.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,103903.0,129880.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


### Check if there are null values

In [None]:
# Count the missing values in each column.
df_combined.isnull().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             393
satisfaction    