In [None]:
# William Gunawan, Kenji Nakachi, Andrew Fortner, Kate Zhang

<h1> Overview </h1>

We chose the Airline Passenger Satisfaction Dataset from Kaggle (https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction).\
The features are described in the next section *Column Context*.\
Our goal is to predict "Satisfaction" given all of the other features.

<h2>Column Context:</h2>

*Gender:* Gender of the passengers **(Female, Male)**

*Customer Type:* The customer type **(Loyal customer, disloyal customer)**

*Age:* The actual age of the passengers **(years)**

*Type of Travel:* Purpose of the flight of the passengers **(Personal Travel, Business Travel)**

*Class:* Travel class in the plane of the passengers **(Business, Eco, Eco Plus)**

*Flight distance:* The flight distance of this journey **(miles)**

*Inflight wifi service:* Satisfaction level **(0:Not Applicable;1-5)**

*Departure/Arrival time convenient:* Satisfaction level **(0:Not Applicable;1-5)**

*Ease of Online booking:* Satisfaction level **(0:Not Applicable;1-5)**

*Gate location:* Satisfaction level **(0:Not Applicable;1-5)**

*Food and drink:* Satisfaction level **(0:Not Applicable;1-5)**

*Online boarding:* Satisfaction level **(0:Not Applicable;1-5)**

*Seat comfort:* Satisfaction level **(0:Not Applicable;1-5)**

*Inflight entertainment:* Satisfaction level **(0:Not Applicable;1-5)**

*On-board service:* Satisfaction level **(0:Not Applicable;1-5)**

*Leg room service:* Satisfaction level **(0:Not Applicable;1-5)**

*Baggage handling:* Satisfaction level **(0:Not Applicable;1-5)**

*Check-in service:* Satisfaction level **(0:Not Applicable;1-5)**

*Inflight service:* Satisfaction level **(0:Not Applicable;1-5)**

*Cleanliness:* Satisfaction level **(0:Not Applicable;1-5)**

*Departure Delay in Minutes:* How long the flight was delayed at departure **(Minutes)**

*Arrival Delay in Minutes:* How long the flight was delayed at arrival **(Minutes)**

*Satisfaction:* Airline satisfaction level **(satisfied, neutral or dissatisfied)**

In [None]:
# Import packages
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer


In [None]:
# Read the training CSV straight from the project's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/katezhang1234/ML_Airline_Project/main/train.csv"
data = pd.read_csv(DATA_URL)

# Preview the first rows as a quick sanity check of the load.
data.head(5)

<h1> Step 1: Data Cleaning </h1>
<h3> 1.1: Remove Unnecessary Columns </h3>

We don't want to use features like *ID* in our classification, so we drop these unnecessary features

In [None]:
# Count repeated values in the 'id' column; a non-zero count would mean
# the same id appears on more than one row.
duplicate_id_count = data['id'].duplicated().sum()
print("Duplicated IDs: ", duplicate_id_count)

# 'id' and 'Unnamed: 0' are dropped because we do not want them as features
# ('Unnamed: 0' is presumably a saved row index -- TODO confirm).
columns_to_drop = ['Unnamed: 0', 'id']
data = data.drop(columns=columns_to_drop)
data.head(5)

<h3> 1.2: Encoding Categorical Values </h3>

In order to impute missing values, we need to encode our categorical values into numerical values

In [None]:
# One-hot encode every categorical column, including the target.
categorical_variables = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']

# Indicator columns removed after encoding so that each variable keeps a
# single indicator column.
# NOTE(review): both 'Class_Eco' and 'Class_Eco Plus' are removed, so only
# 'Class_Business' survives and Eco / Eco Plus become indistinguishable --
# confirm this is intentional.
baseline_columns = [
    'Gender_Female',
    'Customer Type_disloyal Customer',
    'Type of Travel_Business travel',
    'Class_Eco',
    'Class_Eco Plus',
    'satisfaction_neutral or dissatisfied',
]

one_hot = pd.get_dummies(data, columns=categorical_variables)
one_hot = one_hot.drop(columns=baseline_columns)

# Separate the target indicator from the feature table.
labels = one_hot['satisfaction_satisfied']
df_encoded = one_hot.drop(columns=['satisfaction_satisfied'])

# Show the first five rows of the encoded feature table.
df_encoded.head()

In [None]:
# Check for nulls / missing values in the full (not yet split) feature table.
nan_per_column = df_encoded.isna().sum()
nan_count = nan_per_column.sum()
print("Full Dataset Missing values = ", nan_count)

# List only the columns that actually contain missing values, so the
# imputation step can target exactly those columns.
print(nan_per_column[nan_per_column > 0])

# Display the rows that contain at least one missing value.
df_encoded[df_encoded.isna().any(axis=1)]

<h3>1.3 Split Into Test & Train Sets & Impute Missing Data</h3>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_encoded, labels, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames and do not trigger pandas chained-assignment / copy warnings.
X_train = X_train.copy()
X_test = X_test.copy()

# Impute 'Arrival Delay in Minutes' with the mean.
# Only this column is handled because it is the only one with NaN values
# (see the missing-value check earlier in the notebook).
# The mean is computed on the TRAINING rows only and reused for the test rows,
# so no information from the test set leaks into preprocessing and both sets
# are transformed the same way.
delay_column = 'Arrival Delay in Minutes'
mean_value_train = X_train[delay_column].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to update the parent DataFrame.
X_train[delay_column] = X_train[delay_column].fillna(mean_value_train)
nan_count_train = X_train.isna().sum().sum()
print("Training Set Missing values = ", nan_count_train)

X_test[delay_column] = X_test[delay_column].fillna(mean_value_train)
nan_count_test = X_test.isna().sum().sum()
print("Test Set Missing values = ", nan_count_test)

<h1> Step 2: Data Exploration </h1>

In [None]:
# Data exploration: put the training features and the training label in one
# table (rows are matched on the shared index), then look at how each column
# correlates with the label.
combined_train_df = pd.concat([X_train, y_train], axis=1)
correlation_matrix = combined_train_df.corr()
correlation_matrix['satisfaction_satisfied']

In [None]:
import seaborn as sns

# Univariate view: distribution of the satisfaction label for each value of
# the Class_Business indicator.
# NOTE(review): the y variable is a binary label, so a box plot carries little
# information here -- a bar plot of the mean may read better.
sns.boxplot(
    data=combined_train_df,
    x="Class_Business",
    y="satisfaction_satisfied",
)
# Still to do:
#   - Bivariate: scatter plot, correlation coefficient
#   - Check for class imbalance
#   - Look for outliers/noise

In [None]:
# Seat-comfort ratings, split by whether the passenger was satisfied.
sns.boxplot(data=combined_train_df, y="Seat comfort", x="satisfaction_satisfied")

In [None]:
# Inflight-entertainment ratings, split by whether the passenger was satisfied.
sns.boxplot(data=combined_train_df, y="Inflight entertainment", x="satisfaction_satisfied")

# Decision: all outliers/noise are kept, because a flyer could have been
# satisfied or dissatisfied with the flight for reasons other than the
# individual columns plotted here.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Feature Engineering
# Feature transformation, selection, creation
#       Selection: sampling to reduce dataset size?
#       Creation: aggregation

# Feature scaling: standardization.
# Keep the fitted scaler object itself (not just its output) so the very same
# training-set means and variances can later be applied to the test set with
# scaler.transform(X_test).
scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,      # keep row labels so rows can still be matched to y_train
    columns=X_train.columns,
)

# Sanity check: means should be ~0 and standard deviations ~1.
# pandas' .std() uses the sample estimator (ddof=1) while StandardScaler
# scales by the population standard deviation, so the printed values are
# extremely close to, but not exactly, 1.
print("Means for All Columns")
print(scaled_data.mean())
print("\nStandard Deviations for All Columns")
print(scaled_data.std())


# Deal with outliers - histogram
#       Delete, impute, binning
# Dimensionality reduction
# One-hot encoding categorical variables? This may be dependent on the ML techniques used

In [None]:
# Dimensionality reduction: many features are only weakly correlated with the
# label satisfaction_satisfied, so we shrink the feature space and keep just
# enough principal components to explain 95% of the variance.
pca = PCA(svd_solver='full', n_components=0.95)
pca_data = pd.DataFrame(pca.fit_transform(scaled_data))
pca_data.head(5)

In [None]:
# Clustering ... this might also count as data exploration
# Look for natural patterns in the data
# Optional, if we have time

# K-means


In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import random
from sklearn.manifold import MDS

def scatter(data, labels, numPoints = 10000, random_state=None):
    """Plot a 2-D MDS embedding of a contiguous window of rows, coloured by label.

    A window of ``numPoints`` consecutive rows is picked at a random offset,
    embedded into two dimensions with MDS and drawn as a scatter plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; rows are selected by position.
    labels : pandas.Series
        One value per row of ``data``, in the same positional order (rows are
        matched by position, not by index label). Used as the point colours.
    numPoints : int, default 10000
        Size of the window. Clamped to the number of rows in ``data``.
    random_state : int or None, default None
        Seed for both the window offset and MDS. ``None`` keeps the previous
        unseeded behaviour.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    numEntries = data.shape[0]
    if numEntries == 0:
        raise ValueError("scatter() requires at least one row of data")

    # Never ask for more rows than exist; otherwise randint() below would be
    # given an empty range and raise.
    numPoints = min(numPoints, numEntries)

    # Use the module-level generator when unseeded (original behaviour),
    # or a private seeded generator for reproducible plots.
    rng = random if random_state is None else random.Random(random_state)
    start = rng.randint(0, numEntries - numPoints)
    end = start + numPoints
    window = data.iloc[start:end, :]
    window_labels = labels.iloc[start:end]

    mds = MDS(n_components=2, random_state=random_state)
    mds_data = mds.fit_transform(window)
    plt.scatter(mds_data[:, 0], mds_data[:, 1], c=window_labels, s=50)
    plt.xlabel("MDS component 1")
    plt.ylabel("MDS component 2")
    plt.show()

# pca_data is built from the (shuffled) training rows only, so it must be
# coloured with y_train, which is in the same row order. `labels` holds every
# row of the full dataset in its original order and would not match.
scatter(pca_data, y_train)

In [None]:
# k-distance plot to help choose DBSCAN's eps: for every point, find its
# nearest neighbours and plot the sorted distance to the farthest one returned.

neigh = NearestNeighbors(n_neighbors=6)
neigh.fit(pca_data)
distances,indices = neigh.kneighbors(pca_data)

# Order the rows by the distance in the last column (index 5). The query
# points are the fitted points themselves, so the first column is normally
# each point's zero distance to itself.
# A stable sort gives the same row order as Python's sorted() would.
sorted_distances = distances[np.argsort(distances[:, 5], kind="stable")]

# Plot only the k-th distance curve; the "elbow" of this curve is the usual
# candidate for eps.
# NOTE(review): the DBSCAN cell uses min_samples=4 while this curve uses
# n_neighbors=6 -- confirm which neighbourhood size is intended.
plt.plot(sorted_distances[:, 5])
plt.xlabel("Points sorted by distance")
plt.ylabel("Distance to farthest of 6 nearest neighbours")
plt.show()

# Hierarchical clustering

# Anomaly detection

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the PCA-reduced training data.
db_scan = DBSCAN(eps=3, min_samples=4)
dbscan_labels = db_scan.fit_predict(pca_data)

# scatter() slices its labels with .iloc, so wrap the label array in a Series
# before plotting the clusters.
labels_series = pd.Series(dbscan_labels)
scatter(pca_data, labels_series)

In [None]:
# Modeling - Regression
# Pick certain continuous features and plot against the label?

In [None]:
# Modeling - Decision Tree

In [None]:
# Modeling - KNN

In [None]:
# Evaluation