# CS131-8L BM2 PARAGUAY 1Q2223
## GROUP MEMBERS
- CORTEZ, MARK MOISES T.
- GENETA, DANIEL M.
- GIL, CLAIRE FRANCHESKA M.
- PEPITO, ALYSSA MAE M.
- SOLEÑO, KEZIAH ANTONETTE C.
- VELASCO, SADIE CATHERINE E.

### Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

### Data Understanding

#### Collection of Initial Data

In [None]:
# Load the survey workbook and, in the same chained expression, give the
# target column its shorter name ("satisfaction_v2" -> "satisfaction").
df = pd.read_excel("satisfaction_2015.xlsx").rename(
    columns={"satisfaction_v2": "satisfaction"}
)
df.head()

#### Data Description

In [None]:
# Getting summary statistics of the dataframe via DataFrame.describe();
# as the last expression of the cell, the resulting table is displayed.
df.describe()

In [None]:
# Summarise and plot the distribution of every numeric variable.

# Labels of the int64/float64 columns.
numeric_variables = list(df.select_dtypes(include=['int64', 'float64']))

# Rounded, transposed summary table. It is printed explicitly because a
# notebook cell only auto-displays its final expression; as a bare mid-cell
# expression the table was computed and then thrown away.
numeric_summary = df[numeric_variables].describe().round(2).T
print(numeric_summary)

# One 10-bin histogram per numeric column, all sharing the same y axis.
histograms = df[numeric_variables].hist(
    bins=10,
    xlabelsize=10,
    ylabelsize=10,
    grid=False,
    sharey=True,
    figsize=(15, 15),
)

#### Data Exploration

##### Exploring categorical data

In [None]:
# Tally passengers per satisfaction label into a two-column frame
# ('satisfaction', 'counts') and draw it as a bar chart.
satisfaction_counts = (
    df['satisfaction']
    .value_counts()
    .rename_axis('satisfaction')
    .reset_index(name='counts')
)
satisfaction_labels = satisfaction_counts['satisfaction'].to_numpy()
satisfaction_heights = satisfaction_counts['counts'].to_numpy()

plt.bar(satisfaction_labels, satisfaction_heights)
plt.title("Number of passengers classified by satisfied column")
plt.xlabel('Satisfaction')
plt.ylabel('Counts')
plt.show()

In [None]:
# Number of passengers for every (Class, satisfaction) combination.
class_satisfaction_grpby = (
    df[["Class", "satisfaction"]]
    .groupby(["Class", "satisfaction"])
    .size()
    .reset_index(name="counts")
)


def _class_counts(flight_class, label):
    """Return the 'counts' entries for one (Class, satisfaction) pair as a list."""
    selected = class_satisfaction_grpby[
        (class_satisfaction_grpby["Class"] == flight_class)
        & (class_satisfaction_grpby["satisfaction"] == label)
    ]
    return selected["counts"].tolist()


business_satisfied = _class_counts('Business', 'satisfied')
eco_satisfied = _class_counts('Eco', 'satisfied')
eco_plus_satisfied = _class_counts('Eco Plus', 'satisfied')

business_unsatisfied = _class_counts('Business', 'neutral or dissatisfied')
eco_unsatisfied = _class_counts('Eco', 'neutral or dissatisfied')
eco_plus_unsatisfied = _class_counts('Eco Plus', 'neutral or dissatisfied')

# Bar labels and the two stacked series (satisfied first, dissatisfied on top).
x = ['Business', 'Eco', 'Eco Plus']
y1 = np.array([business_satisfied, eco_satisfied, eco_plus_satisfied]).ravel()
y2 = np.array([business_unsatisfied, eco_unsatisfied, eco_plus_unsatisfied]).ravel()

# Horizontal stacked bars: the second series starts where the first ends.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(x, y1, color='b')
ax.barh(x, y2, left=y1, color='r')
ax.set_xlabel("Counts")
ax.set_ylabel("Class")
ax.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.set_title("Flight class with satisfaction overlay")
plt.show()

In [None]:
# Turn the per-class satisfaction counts into within-class shares.
# NOTE: the *_percent names hold fractions in [0, 1] rounded to 3 decimals,
# not values scaled to 0-100.
business_total = np.add(business_satisfied, business_unsatisfied)
business_satis_percent = np.round(np.divide(business_satisfied, business_total), 3)
business_unsatis_percent = np.round(np.divide(business_unsatisfied, business_total), 3)

eco_total = np.add(eco_satisfied, eco_unsatisfied)
eco_satis_percent = np.round(np.divide(eco_satisfied, eco_total), 3)
eco_unsatis_percent = np.round(np.divide(eco_unsatisfied, eco_total), 3)

eco_plus_total = np.add(eco_plus_satisfied, eco_plus_unsatisfied)
eco_plus_satis_percent = np.round(np.divide(eco_plus_satisfied, eco_plus_total), 3)
eco_plus_unsatis_percent = np.round(np.divide(eco_plus_unsatisfied, eco_plus_total), 3)

# Bar labels are declared here as well: `x` is a shared global that other
# plotting cells reassign, so relying on its current value could mislabel
# the bars when cells are run out of order.
x = ['Business', 'Eco', 'Eco Plus']
y1 = np.array([business_satis_percent, eco_satis_percent, eco_plus_satis_percent]).flatten()
y2 = np.array([business_unsatis_percent, eco_unsatis_percent, eco_plus_unsatis_percent]).flatten()

# plot bars in stack manner
plt.figure(figsize=(8,5))
plt.barh(x, y1, color='b')
plt.barh(x, y2, left=y1, color='r')
plt.xlabel("Proportion")  # values are shares of each class, not raw counts
plt.ylabel("Class")
plt.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title("Flight class with satisfaction overlay (normalized)")
plt.show() # Objective 1

In [None]:
# Number of passengers for every (Gender, satisfaction) combination.
gender_satisfaction_grpby = (
    df[["Gender", "satisfaction"]]
    .groupby(["Gender", "satisfaction"])
    .size()
    .reset_index(name="counts")
)


def _gender_counts(gender, label):
    """Return the 'counts' entries for one (Gender, satisfaction) pair as a list."""
    selected = gender_satisfaction_grpby[
        (gender_satisfaction_grpby["Gender"] == gender)
        & (gender_satisfaction_grpby["satisfaction"] == label)
    ]
    return selected["counts"].tolist()


male_satisfied = _gender_counts('Male', 'satisfied')
female_satisfied = _gender_counts('Female', 'satisfied')

male_unsatisfied = _gender_counts('Male', 'neutral or dissatisfied')
female_unsatisfied = _gender_counts('Female', 'neutral or dissatisfied')

# Bar labels and the two stacked series (satisfied first, dissatisfied on top).
x = ['Male', 'Female']
y1 = np.array([male_satisfied, female_satisfied]).ravel()
y2 = np.array([male_unsatisfied, female_unsatisfied]).ravel()

# Horizontal stacked bars: the second series starts where the first ends.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(x, y1, color='b')
ax.barh(x, y2, left=y1, color='r')
ax.set_xlabel("Counts")
ax.set_ylabel("Gender")
ax.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.set_title("Gender with satisfaction overlay")
plt.show()

In [None]:
# Turn the per-gender satisfaction counts into within-gender shares.
# NOTE: the *_percent names hold fractions in [0, 1] rounded to 3 decimals,
# not values scaled to 0-100.
male_total = np.add(male_satisfied, male_unsatisfied)
male_satis_percent = np.round(np.divide(male_satisfied, male_total), 3)
male_unsatis_percent = np.round(np.divide(male_unsatisfied, male_total), 3)

female_total = np.add(female_satisfied, female_unsatisfied)
female_satis_percent = np.round(np.divide(female_satisfied, female_total), 3)
female_unsatis_percent = np.round(np.divide(female_unsatisfied, female_total), 3)

# Bar labels are declared here as well: `x` is a shared global that other
# plotting cells reassign, so relying on its current value could mislabel
# the bars when cells are run out of order.
x = ['Male', 'Female']
y1 = np.array([male_satis_percent, female_satis_percent]).flatten()
y2 = np.array([male_unsatis_percent, female_unsatis_percent]).flatten()

# plot bars in stack manner
plt.figure(figsize=(8,5))
plt.barh(x, y1, color='b')
plt.barh(x, y2, left=y1, color='r')
plt.xlabel("Proportion")  # values are shares of each gender, not raw counts
plt.ylabel("Gender")
plt.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title("Gender with satisfaction overlay (normalized)")
plt.show()


In [None]:
# Number of passengers for every (Customer Type, satisfaction) combination.
customer_type_satisfaction_grpby = (
    df[["Customer Type", "satisfaction"]]
    .groupby(["Customer Type", "satisfaction"])
    .size()
    .reset_index(name="counts")
)


def _customer_type_counts(customer_type, label):
    """Return the 'counts' entries for one (Customer Type, satisfaction) pair as a list."""
    selected = customer_type_satisfaction_grpby[
        (customer_type_satisfaction_grpby["Customer Type"] == customer_type)
        & (customer_type_satisfaction_grpby["satisfaction"] == label)
    ]
    return selected["counts"].tolist()


disloyal_satisfied = _customer_type_counts('disloyal Customer', 'satisfied')
loyal_satisfied = _customer_type_counts('Loyal Customer', 'satisfied')

disloyal_unsatisfied = _customer_type_counts('disloyal Customer', 'neutral or dissatisfied')
loyal_unsatisfied = _customer_type_counts('Loyal Customer', 'neutral or dissatisfied')

# Bar labels (display text only) and the two stacked series, loyal first.
x = ['Loyal Customer', 'Disloyal Customer']
y1 = np.array([loyal_satisfied, disloyal_satisfied]).ravel()
y2 = np.array([loyal_unsatisfied, disloyal_unsatisfied]).ravel()

# Horizontal stacked bars: the second series starts where the first ends.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(x, y1, color='b')
ax.barh(x, y2, left=y1, color='r')
ax.set_xlabel("Counts")
ax.set_ylabel("Customer Type")
ax.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.set_title("Customer type with satisfaction overlay")
plt.show()

In [None]:
# Turn the per-customer-type satisfaction counts into within-type shares.
# NOTE: the *_percent names hold fractions in [0, 1] rounded to 3 decimals,
# not values scaled to 0-100.
loyal_total = np.add(loyal_satisfied, loyal_unsatisfied)
loyal_satis_percent = np.round(np.divide(loyal_satisfied, loyal_total), 3)
loyal_unsatis_percent = np.round(np.divide(loyal_unsatisfied, loyal_total), 3)

disloyal_total = np.add(disloyal_satisfied, disloyal_unsatisfied)
disloyal_satis_percent = np.round(np.divide(disloyal_satisfied, disloyal_total), 3)
disloyal_unsatis_percent = np.round(np.divide(disloyal_unsatisfied, disloyal_total), 3)

# Bar labels are declared here as well: `x` is a shared global that other
# plotting cells reassign, so relying on its current value could mislabel
# the bars when cells are run out of order.
x = ['Loyal Customer', 'Disloyal Customer']
y1 = np.array([loyal_satis_percent, disloyal_satis_percent]).flatten()
y2 = np.array([loyal_unsatis_percent, disloyal_unsatis_percent]).flatten()

# plot bars in stack manner
plt.figure(figsize=(8,5))
plt.barh(x, y1, color='b')
plt.barh(x, y2, left=y1, color='r')
plt.xlabel("Proportion")  # values are shares of each customer type, not raw counts
plt.ylabel("Customer Type")
plt.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title("Customer type with satisfaction overlay (normalized)")
plt.show()

In [None]:
# Number of passengers for every (Type of Travel, satisfaction) combination.
travel_type_satisfaction_grpby = (
    df[["Type of Travel", "satisfaction"]]
    .groupby(["Type of Travel", "satisfaction"])
    .size()
    .reset_index(name="counts")
)


def _travel_type_counts(travel_type, label):
    """Return the 'counts' entries for one (Type of Travel, satisfaction) pair as a list."""
    selected = travel_type_satisfaction_grpby[
        (travel_type_satisfaction_grpby["Type of Travel"] == travel_type)
        & (travel_type_satisfaction_grpby["satisfaction"] == label)
    ]
    return selected["counts"].tolist()


personal_travel_satisfied = _travel_type_counts('Personal Travel', 'satisfied')
business_travel_satisfied = _travel_type_counts('Business travel', 'satisfied')

personal_travel_unsatisfied = _travel_type_counts('Personal Travel', 'neutral or dissatisfied')
business_travel_unsatisfied = _travel_type_counts('Business travel', 'neutral or dissatisfied')

# Bar labels (display text only) and the two stacked series, personal first.
x = ['Personal Travel', 'Business Travel']
y1 = np.array([personal_travel_satisfied, business_travel_satisfied]).ravel()
y2 = np.array([personal_travel_unsatisfied, business_travel_unsatisfied]).ravel()

# Horizontal stacked bars: the second series starts where the first ends.
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(x, y1, color='b')
ax.barh(x, y2, left=y1, color='r')
ax.set_xlabel("Counts")
ax.set_ylabel("Type of Travel")
ax.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
ax.set_title("Type of travel with satisfaction overlay")
plt.show()

In [None]:
# Turn the per-travel-type satisfaction counts into within-type shares.
# NOTE: the *_percent names hold fractions in [0, 1] rounded to 3 decimals,
# not values scaled to 0-100.
personal_travel_total = np.add(personal_travel_satisfied, personal_travel_unsatisfied)
personal_travel_satis_percent = np.round(np.divide(personal_travel_satisfied, personal_travel_total), 3)
personal_travel_unsatis_percent = np.round(np.divide(personal_travel_unsatisfied, personal_travel_total), 3)

business_travel_total = np.add(business_travel_satisfied, business_travel_unsatisfied)
business_travel_satis_percent = np.round(np.divide(business_travel_satisfied, business_travel_total), 3)
business_travel_unsatis_percent = np.round(np.divide(business_travel_unsatisfied, business_travel_total), 3)

# Bar labels are declared here as well: `x` is a shared global that other
# plotting cells reassign, so relying on its current value could mislabel
# the bars when cells are run out of order.
x = ['Personal Travel', 'Business Travel']
y1 = np.array([personal_travel_satis_percent, business_travel_satis_percent]).flatten()
y2 = np.array([personal_travel_unsatis_percent, business_travel_unsatis_percent]).flatten()

# plot bars in stack manner
plt.figure(figsize=(8,5))
plt.barh(x, y1, color='b')
plt.barh(x, y2, left=y1, color='r')
plt.xlabel("Proportion")  # values are shares of each travel type, not raw counts
plt.ylabel("Type of Travel")  # this chart groups by travel type, not customer type
plt.legend(["satisfied", "neutral or dissatisfied"], bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.title("Type of travel with satisfaction overlay (normalized)")
plt.show()

##### Exploring numerical data

In [None]:
# Build the frames used for the correlation analysis: one for all passengers
# and one per flight class, each with a numeric copy of the target.

# Columns left out of the correlation analysis.
excluded_columns = ['id', 'Age', 'Gate location', 'Flight Distance']

# drop() without inplace returns a new frame, so `df` itself is untouched.
df_int = df.drop(columns=excluded_columns)

# Create column named 'satisfaction_int' where 0 = neutral/dissatisfied and 1 = satisfied.
# Assigning the whole column at once avoids chained assignment
# (frame[col].loc[mask] = value), which raises SettingWithCopyWarning and
# silently does nothing when pandas Copy-on-Write is active.
df_int['satisfaction_int'] = (df_int['satisfaction'] == 'satisfied').astype('int64')

# Per-class subsets. The explicit .copy() makes each one an independent
# frame rather than a slice of df_int.
df_business_int = df_int.loc[df_int['Class'] == 'Business'].copy()
df_eco_int = df_int.loc[df_int['Class'] == 'Eco'].copy()
df_eco_plus_int = df_int.loc[df_int['Class'] == 'Eco Plus'].copy()

In [None]:
# Keep only the numeric columns of each frame (overall and per flight class).
(
    df_numeric_features,
    df_business_numeric_features,
    df_eco_numeric_features,
    df_eco_plus_numeric_features,
) = (
    frame.select_dtypes(include=[np.number])
    for frame in (df_int, df_business_int, df_eco_int, df_eco_plus_int)
)

In [None]:
# Correlation matrix for all passengers, then each feature's correlation
# with the satisfaction flag, largest value first.
df_correlation = df_numeric_features.corr()
satisfaction_corr = df_correlation['satisfaction_int'].sort_values(ascending=False)
print(satisfaction_corr, '\n')

In [None]:
# Correlation matrix for Business class passengers, then each feature's
# correlation with the satisfaction flag, largest value first.
df_business_correlation = df_business_numeric_features.corr()
business_satisfaction_corr = df_business_correlation['satisfaction_int'].sort_values(ascending=False)
print(business_satisfaction_corr, '\n')

In [None]:
# Correlation matrix for Eco class passengers, then each feature's
# correlation with the satisfaction flag, largest value first.
df_eco_correlation = df_eco_numeric_features.corr()
eco_satisfaction_corr = df_eco_correlation['satisfaction_int'].sort_values(ascending=False)
print(eco_satisfaction_corr, '\n')

In [None]:
# Correlation matrix for Eco Plus class passengers, then each feature's
# correlation with the satisfaction flag, largest value first.
df_eco_plus_correlation = df_eco_plus_numeric_features.corr()
eco_plus_satisfaction_corr = df_eco_plus_correlation['satisfaction_int'].sort_values(ascending=False)
print(eco_plus_satisfaction_corr, '\n')

In [None]:
# Annotated correlation heatmap for all flight classes, drawn on an
# explicitly created axes.
fig, ax = plt.subplots(figsize=(21, 21))
ax.set_title('Correlation of numeric features (All flight class)')
sns.heatmap(
    df_correlation,
    square=True,
    linewidths=0.8,
    annot=True,
    annot_kws={"size": 10},
    ax=ax,
)
plt.show()

In [None]:
# Annotated correlation heatmap for Business class, drawn on an explicitly
# created axes.
fig, ax = plt.subplots(figsize=(21, 21))
ax.set_title('Correlation of numeric features (Business class)')
sns.heatmap(
    df_business_correlation,
    square=True,
    linewidths=0.8,
    annot=True,
    annot_kws={"size": 10},
    ax=ax,
)
plt.show()

In [None]:
# Annotated correlation heatmap for Eco class, drawn on an explicitly
# created axes.
fig, ax = plt.subplots(figsize=(21, 21))
ax.set_title('Correlation of numeric features (Eco class)')
sns.heatmap(
    df_eco_correlation,
    square=True,
    linewidths=0.8,
    annot=True,
    annot_kws={"size": 10},
    ax=ax,
)
plt.show()

In [None]:
# Annotated correlation heatmap for Eco Plus class, drawn on an explicitly
# created axes.
fig, ax = plt.subplots(figsize=(21, 21))
ax.set_title('Correlation of numeric features (Eco Plus class)')
sns.heatmap(
    df_eco_plus_correlation,
    square=True,
    linewidths=0.8,
    annot=True,
    annot_kws={"size": 10},
    ax=ax,
)
plt.show()

#### Data Quality Verification

In [None]:
# Print the dataframe's structural summary via DataFrame.info(), used here
# as a data-quality check before preparation.
df.info()

In [None]:
# Count the missing (null/NaN) values in each column of the dataframe.
df.isna().sum()

### Data Preparation

#### Data Selection

In [None]:
# Open question from the team: should Customer Type, Type of Travel, and
# Flight Distance be removed as well? They are kept for now.
# Remove unneeded fields: id, Gender, Age, and Gate location.
columns_to_drop = ['id', 'Gender', 'Age', 'Gate location']
df.drop(columns=columns_to_drop, inplace=True)
df

#### Data Cleaning

In [None]:
# Remove, in place, every row that has at least one missing value, then
# re-count missing values per column to confirm the result.
df.dropna(inplace=True)
df.isna().sum()

In [None]:
# Move the target to the end of the frame: pop() removes the 'satisfaction'
# column and returns it, and the assignment re-inserts it as the last column.
df['satisfaction'] = df.pop('satisfaction')
df

### Modeling

In [None]:
# Predictors are every column except the target. Selecting by name, rather
# than slicing off the last column, keeps 'satisfaction' out of X wherever
# it sits in the frame, so the target cannot leak into the features.
features = [column for column in df.columns if column != 'satisfaction']

X = df[features]
y = df['satisfaction']

# One-hot encode the categorical predictors; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
x_encoded = pd.get_dummies(X, drop_first=True)

#### Decision Tree

In [None]:
# 70/30 train/test split, stratified on the target so both parts keep the
# same class balance. The fixed seed makes the reported metrics reproducible
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x_encoded, y, test_size=0.3, stratify=y, random_state=42
)

# Depth-limited tree (max_depth=2). Only this model is fitted: an
# unconstrained tree used to be trained first and then immediately
# overwritten, which was wasted work.
dtree = DecisionTreeClassifier(max_depth=2, random_state=42)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)

# Evaluate on the held-out test split.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Draw the fitted decision tree.
fig = plt.figure(figsize=(25, 20))
plot_tree(
    dtree,
    # Plain list of column names (some scikit-learn versions reject an Index).
    feature_names=list(x_encoded.columns),
    # One name per class, in the classifier's own class order. Passing the
    # whole target Series `y` here made the node labels come from arbitrary
    # rows of the data instead of from the class labels.
    class_names=[str(label) for label in dtree.classes_],
    impurity=False,
    proportion=True,
    filled=True,
)
plt.show()

In [None]:
# Confusion matrix of the decision tree on the test split, with rows and
# columns ordered by the classifier's own class labels.
tree_labels = dtree.classes_
cm = confusion_matrix(y_test, y_pred, labels=tree_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree_labels)
disp.plot(cmap="Blues", values_format='')
disp.ax_.set_title("Confusion matrix (Decision Tree model)")
disp.figure_.set_size_inches(7, 7)  # width, height
plt.show()

#### Logistic Regression

In [None]:
# 70/30 train/test split, stratified on the target; the fixed seed makes the
# reported metrics reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x_encoded, y, test_size=0.3, stratify=y, random_state=42
)

# Standardise the features before fitting, the same preprocessing the KNN
# model in this notebook uses. On raw, differently-scaled columns the solver
# can stop at its iteration limit before converging. The scaler is fitted on
# the training split only so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = lr.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the logistic regression on the test split, with rows
# and columns ordered by the classifier's own class labels.
lr_labels = lr.classes_
cm = confusion_matrix(y_test, y_pred, labels=lr_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr_labels)
disp.plot(cmap="Blues", values_format='')
disp.ax_.set_title("Confusion matrix (Logistic Regression model)")
disp.figure_.set_size_inches(7, 7)  # width, height
plt.show()

#### KNN

In [None]:
# 70/30 train/test split, stratified on the target; the fixed seed makes the
# reported metrics reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x_encoded, y, test_size=0.3, stratify=y, random_state=42
)

# KNN is distance-based, so features are standardised first. The scaler is
# fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Evaluate on the held-out test split.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the KNN model on the test split, with rows and
# columns ordered by the classifier's own class labels.
knn_labels = knn.classes_
cm = confusion_matrix(y_test, y_pred, labels=knn_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn_labels)
disp.plot(cmap="Blues", values_format='')
disp.ax_.set_title("Confusion matrix (K-Nearest Neighbor model)")
disp.figure_.set_size_inches(7, 7)  # width, height
plt.show()