# Installing and Importing Pyforest

In [1]:
# !pip install pyforest

In [1]:
import pyforest

import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import accuracy_score

# Importing the train and test data sets

In [2]:
# Load the train and test CSV files into pandas DataFrames
train_csv = '../data/train.csv'
test_csv = '../data/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
# Preview the first 2 rows of the training data
train.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [4]:
# List the column names as a NumPy array
train.columns.to_numpy()

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [5]:
# Distinct values in the Embarked column (NaN is reported as its own value)
train.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

# Data Cleaning

In [6]:
# Distinct values in the Embarked column before any cleaning is applied
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
# Encode Sex numerically: male -> 1, female -> 2.
# An explicit mapping is used instead of "1 if male else 2" so that:
#   * re-running this cell is a no-op (values already encoded as 1/2 pass through
#     unchanged instead of all collapsing to 2), and
#   * unexpected or missing values are left as they are rather than being
#     silently labelled as 2.
sex_codes = {'male': 1, 'female': 2}
train['Sex'] = train['Sex'].map(lambda value: sex_codes.get(value, value))

In [8]:
# train['Embarked'] = train['Embarked'].apply(lambda x: 0 if x == 'nan' else 1 if x == 'S' else 2 if x == 'C' else 3)

In [9]:
# Remove the columns that are not kept for modeling
unused_columns = ['Name', 'Ticket', 'Cabin', 'PassengerId', 'Parch', 'Embarked']
train = train.drop(columns=unused_columns)

In [10]:
# Number of missing values in each remaining column
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Fare          0
dtype: int64

In [11]:
# Discard every row that still contains a missing value
train = train.dropna()

# Checking if Matplotlib and Seaborn were correctly imported

In [12]:
# Age distribution per sex, split by survival outcome.
# One panel per sex (left: female, right: male); each panel overlays the
# histogram of survivors and of non-survivors.
survived = 'survived'
not_survived = 'not survived'

sns.set_style('darkgrid')
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
women = train[train['Sex'] == 2]
men = train[train['Sex'] == 1]

# sns.histplot replaces the deprecated sns.distplot(..., kde=False); both draw
# plain count histograms. The bin counts (18 / 40) match the original cell.
for ax, group, title in zip(axes, (women, men), ('Female', 'Male')):
    for outcome, label, bins in ((1, survived, 18), (0, not_survived, 40)):
        ages = group.loc[group['Survived'] == outcome, 'Age'].dropna()
        sns.histplot(x=ages, bins=bins, label=label, ax=ax, kde=False)
    ax.legend(fontsize=14)
    ax.set_title(title, fontsize=14)
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<Figure size 1600x600 with 2 Axes>

In [13]:
# Confirm that no NaN values remain in any column
train.isna().sum(axis=0)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Fare        0
dtype: int64

In [14]:
# Turn positive and negative infinity into NaN.
# NOTE(review): nothing visible after this cell drops NaN again, so any value
# converted here would reach the models as NaN — confirm this is intended.
train = train.replace(to_replace=[np.inf, -np.inf], value=np.nan)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Train Test Split

In [15]:
# Target is the Survived column; every other remaining column is a feature
y = train['Survived']
X = train.drop(columns='Survived')

In [16]:
# Hold out 25% of the rows for evaluation (scikit-learn's default share, stated
# explicitly here); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

<IPython.core.display.Javascript object>

# Modeling

In [17]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [18]:
# Fit every classifier bundled with LazyPredict and show the resulting score table
clf = LazyClassifier(ignore_warnings=True, verbose=0)
models, predictions = clf.fit(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
models

100%|██████████| 30/30 [00:02<00:00, 14.39it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.81,0.79,0.79,0.81,0.02
ExtraTreeClassifier,0.79,0.78,0.78,0.79,0.02
AdaBoostClassifier,0.78,0.77,0.77,0.78,0.16
LGBMClassifier,0.79,0.77,0.77,0.78,0.44
QuadraticDiscriminantAnalysis,0.79,0.77,0.77,0.78,0.02
GaussianNB,0.78,0.77,0.77,0.78,0.02
RandomForestClassifier,0.78,0.76,0.76,0.78,0.21
BernoulliNB,0.77,0.76,0.76,0.77,0.03
KNeighborsClassifier,0.77,0.76,0.76,0.77,0.03
SGDClassifier,0.77,0.76,0.76,0.77,0.02


In [37]:
# Fit a random forest and predict on the held-out split.
# random_state is fixed because the forest is stochastic: without a seed the
# metrics printed below change on every run and cannot be reproduced.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

<IPython.core.display.Javascript object>

In [38]:
# Report accuracy and macro-averaged F1 for the random forest predictions
print('Evaluation Metrics – Random Forest:')
rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {rf_accuracy}')
rf_macro_f1 = metrics.f1_score(y_test, y_pred, average='macro')
print(f'F1 Score: {rf_macro_f1}')

Evaluation Metrics – Random Forest:
Accuracy: 0.776536312849162
F1 Score: 0.7641633728590251


In [39]:
# Fit a logistic regression and predict on the held-out split.
# The model gets its own name (log_reg) rather than reusing `rf`, so the
# fitted random forest is not silently overwritten by a different estimator.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

<IPython.core.display.Javascript object>

In [24]:
# Report accuracy and macro-averaged F1 for the logistic regression predictions
print('Evaluation Metrics – Logistic Regression:')
lr_accuracy = metrics.accuracy_score(y_test, y_pred_lr)
print(f'Accuracy: {lr_accuracy}')
lr_macro_f1 = metrics.f1_score(y_test, y_pred_lr, average='macro')
print(f'F1 Score: {lr_macro_f1}')

Evaluation Metrics – Logistic Regression:
Accuracy: 0.770949720670391
F1 Score: 0.7562198970270719
