In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
df=pd.read_csv('Airplane Services.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 50000 non-null  int64  
 1   Gender                             50000 non-null  object 
 2   Customer Type                      50000 non-null  object 
 3   Age                                50000 non-null  int64  
 4   Type of Travel                     50000 non-null  object 
 5   Class                              50000 non-null  object 
 6   Flight Distance                    50000 non-null  int64  
 7   Inflight wifi service              50000 non-null  int64  
 8   Departure/Arrival time convenient  50000 non-null  int64  
 9   Ease of Online booking             50000 non-null  int64  
 10  Gate location                      50000 non-null  int64  
 11  Food and drink                     50000 non-null  int

In [4]:
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,Male,disloyal Customer,48,Business travel,Business,821,3,3,3,...,5,3,2,5,4,5,5,2,5.0,neutral or dissatisfied
1,2,Female,Loyal Customer,35,Business travel,Business,821,2,2,2,...,5,5,5,5,3,5,5,26,39.0,satisfied
2,3,Male,Loyal Customer,41,Business travel,Business,853,4,4,4,...,3,3,3,3,4,3,5,0,0.0,satisfied
3,4,Male,Loyal Customer,50,Business travel,Business,1905,2,2,2,...,5,5,5,5,3,5,4,0,0.0,satisfied
4,5,Female,Loyal Customer,49,Business travel,Business,3470,3,3,3,...,3,3,4,3,3,3,5,0,1.0,satisfied


In [5]:
drop_columns=['id']
for col in drop_columns:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             50000 non-null  object 
 1   Customer Type                      50000 non-null  object 
 2   Age                                50000 non-null  int64  
 3   Type of Travel                     50000 non-null  object 
 4   Class                              50000 non-null  object 
 5   Flight Distance                    50000 non-null  int64  
 6   Inflight wifi service              50000 non-null  int64  
 7   Departure/Arrival time convenient  50000 non-null  int64  
 8   Ease of Online booking             50000 non-null  int64  
 9   Gate location                      50000 non-null  int64  
 10  Food and drink                     50000 non-null  int64  
 11  Online boarding                    50000 non-null  int

In [7]:
num_col=df.select_dtypes(include=['float64', 'int64']).columns
for col in num_col:
    df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [8]:
cat_col=df.select_dtypes(include=['object']).columns
for col in cat_col:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             50000 non-null  object 
 1   Customer Type                      50000 non-null  object 
 2   Age                                50000 non-null  int64  
 3   Type of Travel                     50000 non-null  object 
 4   Class                              50000 non-null  object 
 5   Flight Distance                    50000 non-null  int64  
 6   Inflight wifi service              50000 non-null  int64  
 7   Departure/Arrival time convenient  50000 non-null  int64  
 8   Ease of Online booking             50000 non-null  int64  
 9   Gate location                      50000 non-null  int64  
 10  Food and drink                     50000 non-null  int64  
 11  Online boarding                    50000 non-null  int

In [10]:
encoder=LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col]=encoder.fit_transform(df[col])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             50000 non-null  int64  
 1   Customer Type                      50000 non-null  int64  
 2   Age                                50000 non-null  int64  
 3   Type of Travel                     50000 non-null  int64  
 4   Class                              50000 non-null  int64  
 5   Flight Distance                    50000 non-null  int64  
 6   Inflight wifi service              50000 non-null  int64  
 7   Departure/Arrival time convenient  50000 non-null  int64  
 8   Ease of Online booking             50000 non-null  int64  
 9   Gate location                      50000 non-null  int64  
 10  Food and drink                     50000 non-null  int64  
 11  Online boarding                    50000 non-null  int

In [12]:
scaler=StandardScaler()
df_scaled=scaler.fit_transform(df)
df=pd.DataFrame(df_scaled, columns=df.columns)

In [13]:
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1.009001,2.117607,0.555757,-0.667211,-0.936102,-0.353414,0.208014,-0.039917,0.171882,0.015807,...,1.2287,-0.309599,-1.034724,1.151969,0.539061,1.144312,1.30482,-0.320342,-0.263931,-0.874878
1,-0.99108,-0.472231,-0.302937,-0.667211,-0.936102,-0.353414,-0.544461,-0.69515,-0.537493,-0.76364,...,1.2287,1.249467,1.244102,1.151969,-0.255194,1.144312,1.30482,0.25334,0.539164,1.143016
2,1.009001,-0.472231,0.093384,-0.667211,-0.936102,-0.321954,0.96049,0.615316,0.881257,0.795254,...,-0.268236,-0.309599,-0.275115,-0.544523,0.539061,-0.562954,1.30482,-0.368149,-0.382034,1.143016
3,1.009001,-0.472231,0.687864,-0.667211,-0.936102,0.712321,-0.544461,-0.69515,-0.537493,-0.76364,...,1.2287,1.249467,1.244102,1.151969,-0.255194,1.144312,0.543992,-0.368149,-0.382034,1.143016
4,-0.99108,-0.472231,0.621811,-0.667211,-0.936102,2.250953,0.208014,-0.039917,0.171882,0.015807,...,-0.268236,-0.309599,0.484494,-0.544523,-0.255194,-0.562954,1.30482,-0.368149,-0.358413,1.143016


In [14]:
if 'satisfaction' in df.columns:
    x=df.drop(columns=['satisfaction'])
    y=df['satisfaction']

In [15]:
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [16]:
#1
model=RandomForestRegressor(n_estimators=100, random_state=42)
rf_model=model.fit(x_train, y_train)

In [17]:
y_pred=rf_model.predict(x_test)
mse=mean_squared_error(y_test, y_pred)
r2=r2_score(y_test, y_pred)

In [18]:
print(f'MSE: {mse}')
print(f'R2 Score: {r2}')

MSE: 0.11253886944139598
R2 Score: 0.8874635654460359


In [19]:
results=[]
results.append({'Run': 'Natija 1', 'MSE': mse, 'R2 Score': r2})

In [20]:
#2
model=RandomForestRegressor(n_estimators=200, random_state=42)
rf_model=model.fit(x_train, y_train)

In [21]:
y_pred = rf_model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [22]:
results.append({'Run': 'Natija 2', 'MSE': mse, 'R2 Score': r2})

In [23]:
#3
model=RandomForestRegressor(n_estimators=300, random_state=42)
rf_model=model.fit(x_train, y_train)

In [24]:
y_pred=rf_model.predict(x_test)
mse=mean_squared_error(y_test, y_pred)
r2=r2_score(y_test, y_pred)

In [25]:
results.append({'Run': 'Natija 3', 'MSE': mse, 'R2 Score': r2})

In [26]:
for res in results:
        print(f"{res['Run']}: MSE = {res['MSE']}, R2 Score = {res['R2 Score']}")

Natija 1: MSE = 0.11253886944139598, R2 Score = 0.8874635654460359
Natija 2: MSE = 0.11319350846442351, R2 Score = 0.8868089405867575
Natija 3: MSE = 0.1131838399691113, R2 Score = 0.8868186088728824
