# Airline Passenger Satisfaction - Data Preparation
----
### Import data

In [7]:
import pandas as pd

dataset_path = 'airline_passenger_satisfaction.csv'
dataset = pd.read_csv(dataset_path)
dataset.describe()

Unnamed: 0.1,Unnamed: 0,age,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes
count,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129880.0,129487.0
mean,64939.5,39.427957,1190.316392,2.728696,3.057599,2.756876,2.976925,3.204774,3.252633,3.441361,3.358077,3.383023,3.350878,3.632114,3.306267,3.642193,3.286326,14.713713,15.091129
std,37493.270818,15.11936,997.452477,1.32934,1.526741,1.40174,1.27852,1.329933,1.350719,1.319289,1.334049,1.287099,1.316252,1.180025,1.266185,1.176669,1.313682,38.071126,38.46565
min,0.0,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,32469.75,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,64939.5,40.0,844.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,97409.25,51.0,1744.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,129879.0,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


### Separate input from output

In [8]:
from sklearn.preprocessing import LabelEncoder

X = dataset.drop(['Unnamed: 0'], axis=1)            # drop ID
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(X['satisfaction'])  # output
X = X.drop(['satisfaction'], axis=1)                # remove target from input

### Handle missing values (arrival_delay)
Since the arrival_delay feature is highly correlated with the departure_delay feature, and the missing values are not that many (393 out of 129879: ~0.03%), we decide to remove the column.

In [9]:
X = X.drop(['arrival_delay_in_minutes'], axis=1)

### Encode categorical data

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Get list of categorical features
s = (X.dtypes == 'object')
categorical_cols = list(s[s].index)

# Encode categorical columns (One Hot)
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(encoder.fit_transform(X[categorical_cols]))
OH_cols.index = X.index

# Replace old categorical features with their One Hot encoding
categorical = X.select_dtypes(include=['object'])
X = X.drop(categorical, axis=1)
X = pd.concat([X, OH_cols], axis=1)


### Split dataset into train and test

In [14]:
from sklearn.model_selection import train_test_split

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=45)

X_train.head()

Unnamed: 0,age,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,...,departure_delay_in_minutes,0,1,2,3,4,5,6,7,8
108029,22,594,4,3,4,1,3,4,3,3,...,27,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
87788,52,236,4,4,4,4,2,4,4,5,...,45,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
114222,55,1121,5,1,1,1,5,5,5,5,...,105,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
26494,33,213,4,1,4,5,4,4,4,4,...,10,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
125928,47,3928,4,5,4,4,5,5,5,4,...,0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
