In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from tabulate import tabulate

In [2]:
df=pd.read_csv('Airplane Services.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 50000 non-null  int64  
 1   Gender                             50000 non-null  object 
 2   Customer Type                      50000 non-null  object 
 3   Age                                50000 non-null  int64  
 4   Type of Travel                     50000 non-null  object 
 5   Class                              50000 non-null  object 
 6   Flight Distance                    50000 non-null  int64  
 7   Inflight wifi service              50000 non-null  int64  
 8   Departure/Arrival time convenient  50000 non-null  int64  
 9   Ease of Online booking             50000 non-null  int64  
 10  Gate location                      50000 non-null  int64  
 11  Food and drink                     50000 non-null  int

In [4]:
# Tushib qolgan qiymatlar ustida ishlash
df["Arrival Delay in Minutes"].fillna(df["Arrival Delay in Minutes"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Arrival Delay in Minutes"].fillna(df["Arrival Delay in Minutes"].median(), inplace=True)


In [5]:
df.isnull().sum()

id                                   0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64

In [6]:
# Yangi feature yaratish (Feature Engineering)
df["Total_Service_Score"] = df[["Inflight wifi service", "Food and drink", 
                                "Online boarding", "Seat comfort", 
                                "Inflight entertainment", "On-board service", 
                                "Leg room service", "Baggage handling", 
                                "Checkin service", "Inflight service", 
                                "Cleanliness"]].sum(axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 50000 non-null  int64  
 1   Gender                             50000 non-null  object 
 2   Customer Type                      50000 non-null  object 
 3   Age                                50000 non-null  int64  
 4   Type of Travel                     50000 non-null  object 
 5   Class                              50000 non-null  object 
 6   Flight Distance                    50000 non-null  int64  
 7   Inflight wifi service              50000 non-null  int64  
 8   Departure/Arrival time convenient  50000 non-null  int64  
 9   Ease of Online booking             50000 non-null  int64  
 10  Gate location                      50000 non-null  int64  
 11  Food and drink                     50000 non-null  int

In [8]:
# Katagorial qiymatlarni kodlash
df_encoded = pd.get_dummies(df.drop(columns=["id"]), drop_first=True)

In [10]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                50000 non-null  int64  
 1   Flight Distance                    50000 non-null  int64  
 2   Inflight wifi service              50000 non-null  int64  
 3   Departure/Arrival time convenient  50000 non-null  int64  
 4   Ease of Online booking             50000 non-null  int64  
 5   Gate location                      50000 non-null  int64  
 6   Food and drink                     50000 non-null  int64  
 7   Online boarding                    50000 non-null  int64  
 8   Seat comfort                       50000 non-null  int64  
 9   Inflight entertainment             50000 non-null  int64  
 10  On-board service                   50000 non-null  int64  
 11  Leg room service                   50000 non-null  int

In [12]:
# Raqamli ustunlarni ajratib olish
num_cols = df_encoded.select_dtypes(include=["int64", "float64"]).columns
num_cols

Index(['Age', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'Total_Service_Score'],
      dtype='object')

In [13]:
# Logarifmik o'zgaruvchilarga o'zgaruvchilarni qo'llash
df_encoded[num_cols] = df_encoded[num_cols].apply(lambda x: np.log1p(x))

In [14]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                50000 non-null  float64
 1   Flight Distance                    50000 non-null  float64
 2   Inflight wifi service              50000 non-null  float64
 3   Departure/Arrival time convenient  50000 non-null  float64
 4   Ease of Online booking             50000 non-null  float64
 5   Gate location                      50000 non-null  float64
 6   Food and drink                     50000 non-null  float64
 7   Online boarding                    50000 non-null  float64
 8   Seat comfort                       50000 non-null  float64
 9   Inflight entertainment             50000 non-null  float64
 10  On-board service                   50000 non-null  float64
 11  Leg room service                   50000 non-null  flo

In [15]:
# Datani Scale qilish
scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

In [16]:
df_encoded.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Total_Service_Score,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,0.614819,0.042818,0.357779,0.167439,0.334689,0.179884,1.168274,0.012895,1.016176,1.061096,...,1.115781,-0.058058,0.320992,0.971617,True,True,False,False,False,False
1,-0.075064,0.042818,-0.343752,-0.438134,-0.312226,-0.626617,0.025247,1.033867,0.495489,1.061096,...,1.115781,1.281446,1.464282,1.154509,False,False,False,False,False,True
2,0.269878,0.082915,0.901928,0.637158,0.836476,0.805455,1.168274,1.033867,1.016176,-0.081973,...,1.115781,-0.72781,-0.758802,0.780594,True,False,False,False,False,True
3,0.704339,0.925813,-0.343752,-0.438134,-0.312226,-0.626617,0.6543,0.574776,1.016176,1.061096,...,0.597836,-0.72781,-0.758802,1.154509,True,False,False,False,False,True
4,0.660027,1.555166,0.357779,0.167439,0.334689,0.179884,0.6543,1.033867,0.495489,-0.081973,...,1.115781,-0.72781,-0.34108,0.477128,False,False,False,False,False,True


In [18]:
# Input va outputlarni ajratib olish
x = df_encoded.drop(columns=["satisfaction_satisfied"])
y = df_encoded["satisfaction_satisfied"]

In [19]:
# Train va test datalarini ajratib olish
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [22]:
# RandomForestClassifier modelini o'qitish
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

In [24]:
# Modelni baholash
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

In [25]:
# Metrics DataFrame yaratish
report_df = pd.DataFrame(report).transpose()

In [26]:
# Model bo'yicha natijalar
print(f"Accuracy: {accuracy:.4f}\n")
print(tabulate(report_df, headers="keys", tablefmt="pretty"))

Accuracy: 0.9607

+--------------+--------------------+--------------------+--------------------+---------+
|              |     precision      |       recall       |      f1-score      | support |
+--------------+--------------------+--------------------+--------------------+---------+
|    False     | 0.9540526587506454 | 0.9777777777777777 | 0.9657695322707082 | 5670.0  |
|     True     | 0.9699212222487468 | 0.938337182448037  | 0.953867824862073  | 4330.0  |
|   accuracy   |       0.9607       |       0.9607       |       0.9607       | 0.9607  |
|  macro avg   | 0.9619869404996961 | 0.9580574801129074 | 0.9598186785663906 | 10000.0 |
| weighted avg | 0.9609237467453233 |       0.9607       | 0.9606160929627691 | 10000.0 |
+--------------+--------------------+--------------------+--------------------+---------+
