In [73]:
# Libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing tools
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Classifier

# Machine learning models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Metrics and model evaluation
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, classification_report, confusion_matrix
)

# Utilities for model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

# Clustering and dimensionality reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA



In [75]:
df= pd.read_csv('airline_passenger_satisfaction.csv')

In [76]:
df

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,129876,Male,28,Returning,Personal,Economy Plus,447,2,3.0,4,...,5,1,4,4,4,5,4,4,4,Neutral or Dissatisfied
129876,129877,Male,41,Returning,Personal,Economy Plus,308,0,0.0,5,...,5,2,5,2,2,4,3,2,5,Neutral or Dissatisfied
129877,129878,Male,42,Returning,Personal,Economy Plus,337,6,14.0,5,...,3,3,4,3,3,4,2,3,5,Neutral or Dissatisfied
129878,129879,Male,50,Returning,Personal,Economy Plus,337,31,22.0,4,...,4,4,5,3,3,4,5,3,5,Satisfied


In [77]:
# Droping the columns that are not needed for the analysis
df.drop(columns = ['ID','Departure and Arrival Time Convenience', 'On-board Service', 'Seat Comfort', 'Baggage Handling', 'Cleanliness', 'Leg Room Service','Check-in Service', 'Gate Location', 'Food and Drink', 'Ease of Online Booking', 'In-flight Entertainment'], inplace=True)

In [78]:
df.head()

Unnamed: 0,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Online Boarding,In-flight Service,In-flight Wifi Service,Satisfaction
0,Male,48,First-time,Business,Business,821,2,5.0,3,5,3,Neutral or Dissatisfied
1,Female,35,Returning,Business,Business,821,26,39.0,5,5,2,Satisfied
2,Male,41,Returning,Business,Business,853,0,0.0,5,3,4,Satisfied
3,Male,50,Returning,Business,Business,1905,0,0.0,4,5,2,Satisfied
4,Female,49,Returning,Business,Business,3470,0,1.0,5,3,3,Satisfied


In [79]:
data = [df]
for dataset in data:
    #Filter categorical variables
    categorical_columns = [x for x in dataset.dtypes.index if dataset.dtypes[x]=='object']
    
for col in categorical_columns:
    print ('\nFrequency of Categories for variable %s'%col)
    print (df[col].value_counts())


Frequency of Categories for variable Gender
Female    65899
Male      63981
Name: Gender, dtype: int64

Frequency of Categories for variable Customer Type
Returning     106100
First-time     23780
Name: Customer Type, dtype: int64

Frequency of Categories for variable Type of Travel
Business    89693
Personal    40187
Name: Type of Travel, dtype: int64

Frequency of Categories for variable Class
Business        62160
Economy         58309
Economy Plus     9411
Name: Class, dtype: int64

Frequency of Categories for variable Satisfaction
Neutral or Dissatisfied    73452
Satisfied                  56428
Name: Satisfaction, dtype: int64


In [80]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [81]:
df['Gender']= le.fit_transform(df['Gender'])
df['Age']= le.fit_transform(df['Age'])
df['Customer Type']= le.fit_transform(df['Customer Type'])
df['Type of Travel']= le.fit_transform(df['Type of Travel'])
df['Class']= le.fit_transform(df['Class'])
df['Flight Distance']= le.fit_transform(df['Flight Distance'])
df['Departure Delay']= le.fit_transform(df['Departure Delay'])
df['Arrival Delay']= le.fit_transform(df['Arrival Delay'])
df['Online Boarding']= le.fit_transform(df['Online Boarding'])
df['In-flight Service']= le.fit_transform(df['In-flight Service'])
df['In-flight Wifi Service']= le.fit_transform(df['In-flight Wifi Service'])
df['Satisfaction']= le.fit_transform(df['Satisfaction'])



In [82]:
X = df.drop('Satisfaction', axis=1)
Y = df['Satisfaction']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [83]:
# Initialize Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)  # using 100 trees

# Train the model
rfc.fit(X_train, y_train)

# Predict on the test data
predictions = rfc.predict(X_test)

# Check the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Random Forest Classifier Accuracy: {accuracy * 100:.2f}%")


Random Forest Classifier Accuracy: 93.81%


In [98]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Assuming encoders are already fitted to the training data
# For demonstration, let's create and fit encoders (in practice, use the pre-fitted encoders)
gender_encoder = LabelEncoder().fit(['Male', 'Female'])
purchase_encoder = LabelEncoder().fit(['First-time', 'Repeat'])
trip_encoder = LabelEncoder().fit(['Business', 'Leisure'])
class_encoder = LabelEncoder().fit(['Economy', 'Business', 'First'])

# Correcting the input data encoding
input_data = ['Male', 48, 'First-time', 'Business', 'Economy', 821, 2, 5.0, 3, 5, 3]
encoded_input_data = [
    gender_encoder.transform([input_data[0]])[0], 
    input_data[1], 
    purchase_encoder.transform([input_data[2]])[0], 
    trip_encoder.transform([input_data[3]])[0], 
    class_encoder.transform([input_data[4]])[0]
] + input_data[5:]

# Convert the list to a numpy array and reshape for a single sample prediction
input_data_as_numpy_array = np.array(encoded_input_data).reshape(1, -1)

# Load your pre-trained Random Forest Classifier
# Here, for demonstration, we initialize and train a mock model
model = RandomForestClassifier()
X_train = np.random.rand(100, 11)  # Mock training data features
Y_train = np.random.randint(2, size=100)  # Mock training data labels
model.fit(X_train, Y_train)

# Making a prediction with the preprocessed input data
prediction = model.predict(input_data_as_numpy_array)
print("Prediction:", prediction)

if prediction[0] == 0:
    print('The person is Neutral or Dissatified')
else:
    print('The person is Satisfied')


Prediction: [1]
The person is Satisfied


In [88]:
import pickle

In [89]:
filename = 'Airline Passenger_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [90]:
# loading the saved model
loaded_model = pickle.load(open('Airline Passenger_model.sav', 'rb'))

In [97]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Assuming encoders are already fitted to the training data
# For demonstration, let's create and fit encoders (in practice, use the pre-fitted encoders)
gender_encoder = LabelEncoder().fit(['Male', 'Female'])
purchase_encoder = LabelEncoder().fit(['First-time', 'Repeat'])
trip_encoder = LabelEncoder().fit(['Business', 'Leisure'])
class_encoder = LabelEncoder().fit(['Economy', 'Business', 'First'])

# Correcting the input data encoding
input_data = ['Male', 48, 'First-time', 'Business', 'Economy', 821, 2, 5.0, 3, 5, 3]
encoded_input_data = [
    gender_encoder.transform([input_data[0]])[0], 
    input_data[1], 
    purchase_encoder.transform([input_data[2]])[0], 
    trip_encoder.transform([input_data[3]])[0], 
    class_encoder.transform([input_data[4]])[0]
] + input_data[5:]

# Convert the list to a numpy array and reshape for a single sample prediction
input_data_as_numpy_array = np.array(encoded_input_data).reshape(1, -1)

# Load your pre-trained Random Forest Classifier
# Here, for demonstration, we initialize and train a mock model
model = RandomForestClassifier()
X_train = np.random.rand(100, 11)  # Mock training data features
Y_train = np.random.randint(2, size=100)  # Mock training data labels
model.fit(X_train, Y_train)

# Making a prediction with the preprocessed input data
prediction = model.predict(input_data_as_numpy_array)
print("Prediction:", prediction)

if prediction[0] == 0:
    print('The person is Neutral or Dissatisfied')
else:
    print('The person is Satisfied')


Prediction: [1]
The person is Satisfied


In [99]:
for column in X.columns:
  print(column)

Gender
Age
Customer Type
Type of Travel
Class
Flight Distance
Departure Delay
Arrival Delay
Online Boarding
In-flight Service
In-flight Wifi Service
