# Linear vs Decision Tree

In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df=pd.read_csv('Airplane Services.csv')

In [4]:
df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,Male,disloyal Customer,48,Business travel,Business,821,3,3,3,...,5,3,2,5,4,5,5,2,5.0,neutral or dissatisfied
1,2,Female,Loyal Customer,35,Business travel,Business,821,2,2,2,...,5,5,5,5,3,5,5,26,39.0,satisfied
2,3,Male,Loyal Customer,41,Business travel,Business,853,4,4,4,...,3,3,3,3,4,3,5,0,0.0,satisfied
3,4,Male,Loyal Customer,50,Business travel,Business,1905,2,2,2,...,5,5,5,5,3,5,4,0,0.0,satisfied
4,5,Female,Loyal Customer,49,Business travel,Business,3470,3,3,3,...,3,3,4,3,3,3,5,0,1.0,satisfied


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 50000 non-null  int64  
 1   Gender                             50000 non-null  object 
 2   Customer Type                      50000 non-null  object 
 3   Age                                50000 non-null  int64  
 4   Type of Travel                     50000 non-null  object 
 5   Class                              50000 non-null  object 
 6   Flight Distance                    50000 non-null  int64  
 7   Inflight wifi service              50000 non-null  int64  
 8   Departure/Arrival time convenient  50000 non-null  int64  
 9   Ease of Online booking             50000 non-null  int64  
 10  Gate location                      50000 non-null  int64  
 11  Food and drink                     50000 non-null  int

In [7]:
df.fillna(df["Arrival Delay in Minutes"].mode()[0], inplace=True)

In [8]:
df.isnull().sum()

id                                   0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64

ENCODING

In [9]:
cardinality=df.nunique()

In [10]:
cardinality

id                                   50000
Gender                                   2
Customer Type                            2
Age                                     75
Type of Travel                           2
Class                                    3
Flight Distance                       3462
Inflight wifi service                    6
Departure/Arrival time convenient        6
Ease of Online booking                   6
Gate location                            5
Food and drink                           6
Online boarding                          6
Seat comfort                             6
Inflight entertainment                   6
On-board service                         6
Leg room service                         6
Baggage handling                         5
Checkin service                          6
Inflight service                         6
Cleanliness                              6
Departure Delay in Minutes             394
Arrival Delay in Minutes               403
satisfactio

In [11]:
for col in df.columns:
    if df[col].dtype=='object':
        if cardinality[col]<=5:
            df=pd.get_dummies(df,columns=[col],dtype=int,drop_first=True)
        else:
            le=LabelEncoder()
            df[col]=le.fit_transform(df[col])


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 50000 non-null  int64  
 1   Age                                50000 non-null  int64  
 2   Flight Distance                    50000 non-null  int64  
 3   Inflight wifi service              50000 non-null  int64  
 4   Departure/Arrival time convenient  50000 non-null  int64  
 5   Ease of Online booking             50000 non-null  int64  
 6   Gate location                      50000 non-null  int64  
 7   Food and drink                     50000 non-null  int64  
 8   Online boarding                    50000 non-null  int64  
 9   Seat comfort                       50000 non-null  int64  
 10  Inflight entertainment             50000 non-null  int64  
 11  On-board service                   50000 non-null  int

In [13]:
num_col=df.select_dtypes(include=['int64','int32']).columns

In [14]:
num_col

Index(['id', 'Age', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Gender_Male',
       'Customer Type_disloyal Customer', 'Type of Travel_Personal Travel',
       'Class_Eco', 'Class_Eco Plus', 'satisfaction_satisfied'],
      dtype='object')

SCALING

In [15]:
scaler=StandardScaler()
df[num_col]=scaler.fit_transform(df[num_col])

In [16]:
df.head()

Unnamed: 0,id,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,-1.531676,0.555757,-0.353414,0.208014,-0.039917,0.171882,0.015807,1.353673,-0.187497,1.181918,...,1.144312,1.30482,-0.320342,5.0,1.009001,2.117607,-0.667211,-0.886261,-0.273972,-0.874878
1,-1.531641,-0.302937,-0.353414,-0.544461,-0.69515,-0.537493,-0.76364,-0.150859,1.292005,0.424463,...,1.144312,1.30482,0.25334,39.0,-0.99108,-0.472231,-0.667211,-0.886261,-0.273972,1.143016
2,-1.531605,0.093384,-0.321954,0.96049,0.615316,0.881257,0.795254,1.353673,1.292005,1.181918,...,-0.562954,1.30482,-0.368149,0.0,1.009001,-0.472231,-0.667211,-0.886261,-0.273972,1.143016
3,-1.531569,0.687864,0.712321,-0.544461,-0.69515,-0.537493,-0.76364,0.601407,0.552254,1.181918,...,1.144312,0.543992,-0.368149,0.0,1.009001,-0.472231,-0.667211,-0.886261,-0.273972,1.143016
4,-1.531534,0.621811,2.250953,0.208014,-0.039917,0.171882,0.015807,0.601407,1.292005,0.424463,...,-0.562954,1.30482,-0.368149,1.0,-0.99108,-0.472231,-0.667211,-0.886261,-0.273972,1.143016


In [17]:
x=df.drop(columns=['satisfaction_satisfied'])

In [18]:
y=df['satisfaction_satisfied']

In [19]:
x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.2,random_state=42)
x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [20]:
x_train.shape

(40000, 24)

In [21]:
y_train.shape

(40000,)

In [22]:
x_test.shape

(5000, 24)

In [23]:
y_test.shape

(5000,)

In [24]:
x_val.shape

(5000, 24)

In [25]:
y_val.shape

(5000,)

In [26]:
#Model Selection
model=LinearRegression()

In [27]:
linear_model=model.fit(x_train,y_train)

In [28]:
linear_model

In [29]:
y_pred=linear_model.predict(x_test)

In [30]:
y_pred

array([-0.15314972, -0.18260451, -0.92279661, ...,  0.50310077,
       -0.86506587,  1.0022026 ], shape=(5000,))

In [31]:
y

0       -0.874878
1        1.143016
2        1.143016
3        1.143016
4        1.143016
           ...   
49995    1.143016
49996   -0.874878
49997   -0.874878
49998   -0.874878
49999   -0.874878
Name: satisfaction_satisfied, Length: 50000, dtype: float64

In [32]:
mse_linear=mean_squared_error(y_test,y_pred)
r2_linear=r2_score(y_test,y_pred)

In [33]:
print(mse_linear)
print(r2_linear)

0.4242832870453284
0.5757258927344959


DECISION TREE

In [34]:
model=DecisionTreeRegressor()

In [35]:
model

In [36]:
dt_model=model.fit(x_train,y_train)

In [37]:
dt_model

In [38]:
y_pred=dt_model.predict(x_test)

In [39]:
y_pred

array([-0.8748783 , -0.8748783 , -0.8748783 , ..., -0.8748783 ,
       -0.8748783 ,  1.14301611], shape=(5000,))

In [40]:
y[:5]

0   -0.874878
1    1.143016
2    1.143016
3    1.143016
4    1.143016
Name: satisfaction_satisfied, dtype: float64

In [41]:
mse_tree=mean_squared_error(y_test,y_pred)
r2_tree=r2_score(y_test,y_pred)

In [42]:
print(mse_tree)
print(r2_tree)

0.2410563548126467
0.7589488606750474


In [43]:
print(mse_tree)
print(r2_tree)
print(mse_linear)
print(r2_linear)

0.2410563548126467
0.7589488606750474
0.4242832870453284
0.5757258927344959


In [44]:
print(mse_tree)
print(r2_tree)
print(mse_linear)
print(r2_linear)

0.2410563548126467
0.7589488606750474
0.4242832870453284
0.5757258927344959
