In [1]:
# Import packages
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import data
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# Check summary of dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
# Specify types of object columns
strings = ["PassengerId", "HomePlanet", "Destination", "CryoSleep", "VIP", "Transported"]
df[strings] = df[strings].astype("string")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   string 
 1   HomePlanet    8492 non-null   string 
 2   CryoSleep     8476 non-null   string 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   string 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   string 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   string 
dtypes: float64(6), object(2), string(6)
memory usage: 950.9+ KB


In [6]:
# Check for null values
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [7]:
# Delete rows with null values
df = df.dropna()
df.isna().sum().sum()

0

In [8]:
# Check for duplicates
df.duplicated().sum()

0

In [9]:
# Replace string values by numeric values
set(df["HomePlanet"])

{'Earth', 'Europa', 'Mars'}

In [10]:
df["HomePlanet"] = df["HomePlanet"].replace(["Earth", "Europa", "Mars"], ["1", "2", "3"])
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,2,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,1,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,2,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,2,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,1,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,2,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,1,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,1,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,2,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [11]:
set(df["Destination"])

{'55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e'}

In [12]:
df["Destination"] = df["Destination"].replace(["55 Cancri e", "PSO J318.5-22", "TRAPPIST-1e"], ["1", "2", "3"])
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,2,False,B/0/P,3,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,1,False,F/0/S,3,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,2,False,A/0/S,3,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,2,False,A/0/S,3,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,1,False,F/1/S,3,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,2,False,A/98/P,1,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,1,True,G/1499/S,2,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,1,False,G/1500/S,3,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,2,False,E/608/S,1,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [13]:
df["CryoSleep"] = df["CryoSleep"].replace(["True", "False"], ["1", "2"])
df["VIP"] = df["VIP"].replace(["True", "False"], ["1", "2"])
df["Transported"] = df["Transported"].replace(["True", "False"], ["1", "2"])

In [14]:
# Separate features from target variable
features = ["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
X = df[features]
X

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,2,2,3,39.0,2,0.0,0.0,0.0,0.0,0.0
1,1,2,3,24.0,2,109.0,9.0,25.0,549.0,44.0
2,2,2,3,58.0,1,43.0,3576.0,0.0,6715.0,49.0
3,2,2,3,33.0,2,0.0,1283.0,371.0,3329.0,193.0
4,1,2,3,16.0,2,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...
8688,2,2,1,41.0,1,0.0,6819.0,0.0,1643.0,74.0
8689,1,1,2,18.0,2,0.0,0.0,0.0,0.0,0.0
8690,1,2,3,26.0,2,0.0,0.0,1872.0,1.0,0.0
8691,2,2,1,32.0,2,0.0,1049.0,0.0,353.0,3235.0


In [15]:
X.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0
mean,28.894036,222.991674,478.958523,178.356494,313.16152,303.780048
std,14.533429,644.987936,1678.592291,576.328407,1144.016291,1127.142166
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,82.75,30.0,65.0,52.0
max,79.0,9920.0,29813.0,12253.0,22408.0,20336.0


---
**Observations:** We can see here that the different variables present different value ranges, therefore different magnitudes, as they spread over ranges of different widths.

Let's try s to rescale the features so that they’ll have the properties of a standard normal distribution with μ = 0 and σ = 1.

---

In [16]:
scaler = StandardScaler() 
X_scaled = scaler.fit_transform(X)

In [17]:
print(X_scaled.mean(axis = 0), "\n")
print(X_scaled.std(axis = 0))

[-3.87216750e-17 -3.49570677e-18 -1.93608375e-17  9.03505749e-17
  2.46312877e-16 -1.61340312e-18  3.33436646e-17 -1.93608375e-17
  2.90412562e-17  1.93608375e-17] 

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


---
**Observations:** As expected, the mean of each variable is now around zero and the standard deviation is set to 1. Thus, all the variable values lie within the same range.

---

In [18]:
print("Min values:", X_scaled.min(axis = 0), "\n")
print("Max values:", X_scaled.max(axis = 0))

Min values: [-0.83656077 -1.35379525 -1.80029905 -1.98825911 -6.30696264 -0.34575616
 -0.28535504 -0.30949367 -0.27375941 -0.26953392] 

Max values: [ 1.67141311  0.73866414  0.6330054   3.44789633  0.15855493 15.0355407
 17.47670483 20.95256346 19.31485694 17.77392146]


---
**Observations:** Unfortunately, the minimum and maximum values vary according to how spread out the variable was, and is highly influenced by the presence of outliers.

In this approach, the data is scaled to a fixed range (0 to 1). The cost of having this bounded range is that we will end up with smaller standard deviations, which can suppress the effect of outliers. Let's try instead to normalize our features with MinMax Scalar, as it is sensitive to outliers.

---

In [19]:
scaler = MinMaxScaler() 
X_scaled = scaler.fit_transform(X)

In [20]:
print("Means:", X_scaled.mean(axis = 0), "\n")
print("Std:", X_scaled.std(axis = 0))

Means: [0.3335604  0.64698759 0.73985771 0.36574729 0.97547684 0.022479
 0.01606543 0.01455615 0.01397543 0.01493804] 

Std: [0.39872823 0.47790653 0.41096378 0.18395353 0.15466666 0.06501402
 0.05629978 0.04703214 0.05105006 0.05542176]


---
**Observations:** After MinMaxScaling, the distributions are not centered at zero and the standard deviation is not 1.

---

In [21]:
print("Min:", X_scaled.min(axis = 0), "\n")
print("Max:", X_scaled.max(axis = 0))

Min: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 

Max: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


---
**Observations:** But the minimum and maximum values are standardized across variables, different from what occurs with standardization.

---

In [22]:
# Get target column
y = df["Transported"]

# Check its distribution
y.describe()

count     6606
unique       2
top          1
freq      3327
Name: Transported, dtype: object

---
**Observations:** We can see that the top value '1' (True) represents 50.4% of all values, indicating that the target variable is well-balanced since there are only two values in total.

---

In [23]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state = 1)

In [24]:
# Initialize 3 different classifiers
d_model = DecisionTreeClassifier(random_state = 1)
r_model = RandomForestClassifier(n_estimators = 500, max_depth = 9, random_state = 1)
lr_model = LogisticRegression()

In [25]:
# Fit models
d_model = d_model.fit(X_train, y_train)
r_model = r_model.fit(X_train, y_train)
lr_model = lr_model.fit(X_train, y_train)

In [26]:
# Test models
d_pred = d_model.predict(X_test)
r_pred = r_model.predict(X_test)
lr_pred = lr_model.predict(X_test)

In [27]:
# Get metrics
metrics = {
    "Decision Tree": {
    "Accuracy": accuracy_score(y_test, d_pred),
    "Precision": precision_score(y_test, d_pred, average = "weighted"),
    "Recall": recall_score(y_test, d_pred, average = "weighted"),
    "F1 Score": f1_score(y_test, d_pred, average = "weighted")
    },
    "Random Forest": {
    "Accuracy": accuracy_score(y_test, r_pred),
    "Precision": precision_score(y_test, r_pred, average = "weighted"),
    "Recall": recall_score(y_test, r_pred, average = "weighted"),
    "F1 Score": f1_score(y_test, r_pred, average = "weighted")
    },
    "Logistic Regression": {
    "Accuracy": accuracy_score(y_test, lr_pred),
    "Precision": precision_score(y_test, lr_pred, average = "weighted"),
    "Recall": recall_score(y_test, lr_pred, average = "weighted"),
    "F1 Score": f1_score(y_test, lr_pred, average = "weighted")
    }    
}

In [28]:
for classifier, metrics_dict in metrics.items():
    print(f"{classifier} Metrics:")
    for metric, value in metrics_dict.items():
        print(f"{metric}:{value: .3f}")
    print()

Decision Tree Metrics:
Accuracy: 0.739
Precision: 0.742
Recall: 0.739
F1 Score: 0.738

Random Forest Metrics:
Accuracy: 0.795
Precision: 0.795
Recall: 0.795
F1 Score: 0.795

Logistic Regression Metrics:
Accuracy: 0.766
Precision: 0.771
Recall: 0.766
F1 Score: 0.765



---
**Observations:** The three models have similar performances, Random Forest being the one with the highest accuracy and precision. 

Let's run a Grid Search to find out what the best parameters and score would be with this model.

---

In [29]:
grid_space = {"max_depth": [3, 5, 10, None],
              "n_estimators": [10, 100, 200],
              "max_features": [1, 3, 5, 7],
              "min_samples_leaf": [1, 2, 3],
              "min_samples_split": [1, 2, 3]
           }
grid = GridSearchCV(r_model, param_grid = grid_space, cv = 3, scoring = "accuracy")
model_grid = grid.fit(X_train, y_train)

In [30]:
print("The best hyperparameters are:", model_grid.best_params_)
print("The best score would be:", round(model_grid.best_score_, 3))

The best hyperparameters are: {'max_depth': 10, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
The best score would be: 0.802


---
**Observations:** With these hyperparameters, the model would render an accuracy score of approximately 0.80, which is a slight improvement compared to what we obtained with our first run.

Let’s use Random Search on the same data to see if we can get even better results.

---

In [31]:
rs_space = {"max_depth": list(np.arange(10, 100, step = 10)) + [None],
            "n_estimators": np.arange(10, 500, step = 50),
            "max_features": randint(1, 7),
            "criterion": ["gini", "entropy"],
            "min_samples_leaf": randint(1, 4),
            "min_samples_split": np.arange(2, 10, step = 2)
            }

In [32]:
# Run random search on the model
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(rf, rs_space, n_iter = 500, scoring = "accuracy", n_jobs = -1, cv = 3)
model_random = rf_random.fit(X_train, y_train)

In [33]:
print("The best hyperparameters are:", model_random.best_params_)
print("The best score would be:", round(model_random.best_score_, 3))

The best hyperparameters are: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 110}
The best score would be: 0.802


---
**Observations:** We can observe that both Grid Search and Random Search performed reasonably well on the dataset, as the highest accuracy of all the models built is 0.8.

However we did not manage to improve significantly the performance of our original model.

---