# Importing Libraries:

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

### Importing the dataset:

In [70]:
# Load the Titanic training and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

### Exploring the dataset:

In [71]:
# Preview the first rows of the training frame to see the available columns.
train.head()

In [72]:
# (rows, columns) of the training frame.
train.shape

In [73]:
# Summary statistics for the training frame's columns.
train.describe()

In [74]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train.info()

### Exploratory Data Analysis:

In [75]:
sns.set_theme(style='whitegrid')
# Name the column explicitly: recent seaborn releases treat the first
# positional argument as `data`, not `x`, so a bare Series no longer yields
# per-category counts. The trailing `;` hides the Axes repr.
sns.countplot(x='Survived', data=train);

In [76]:
# Count of passengers per ticket class. The column is passed via `x=` because
# recent seaborn releases bind the first positional argument to `data`.
sns.countplot(x='Pclass', data=train);

In [77]:
# Count of passengers per sex. The column is passed via `x=` because recent
# seaborn releases bind the first positional argument to `data`.
sns.countplot(x='Sex', data=train);

In [78]:
# Count of passengers per SibSp value. The column is passed via `x=` because
# recent seaborn releases bind the first positional argument to `data`.
sns.countplot(x='SibSp', data=train);

In [79]:
# Count of passengers per Parch value. The column is passed via `x=` because
# recent seaborn releases bind the first positional argument to `data`.
sns.countplot(x='Parch', data=train);

In [80]:
# Count of passengers per embarkation port. The column is passed via `x=`
# because recent seaborn releases bind the first positional argument to `data`.
sns.countplot(x='Embarked', data=train);

### Feature Engineering:

In [81]:
# Feature engineering on the training frame:
#   * drop PassengerId, Name and Ticket (not used as features) and Cabin
#     (dropped because of its very high number of null values);
#   * fill missing Age values with the column mean and missing Embarked
#     values with the column mode;
#   * raise ages below 1 up to 1;
#   * based on the value counts, merge the sparse high values of Parch (> 2)
#     and SibSp (> 1) into the single labels '3+' and '2+'.
train = (
    train
    .drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket'])
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].mean()),
        Embarked=lambda frame: frame['Embarked'].fillna(frame['Embarked'].mode()[0]),
    )
    # Second pass runs on the already-imputed columns, as in a step-by-step update.
    .assign(
        Age=lambda frame: np.where(frame['Age'] < 1, 1, frame['Age']),
        Parch=lambda frame: np.where(frame['Parch'] > 2, '3+', frame['Parch']),
        SibSp=lambda frame: np.where(frame['SibSp'] > 1, '2+', frame['SibSp']),
    )
)

In [82]:
# Replace categorical values with integer codes.
# The result is assigned back to the column instead of calling
# `train.Sex.replace(..., inplace=True)`: an in-place method on a column
# accessor is chained assignment, which pandas deprecates and which does not
# modify `train` when Copy-on-Write is enabled. `.map` also yields a numeric
# column directly rather than relying on object-dtype downcasting.
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

### Applying the same transformations to the test dataset:

In [83]:
# Mirror the training-set preprocessing on the test frame.
# NOTE(review): missing values are imputed from the test set's own statistics
# rather than the training set's; consider reusing the training values so both
# frames are transformed identically.
test = test.drop(['PassengerId','Cabin','Name','Ticket'], axis = 1)
test['Age'] = test['Age'].fillna(test['Age'].mean())

# The test set has a missing Fare value; fill it with the column mean.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])
# Raise ages below 1 up to 1, and merge the sparse high Parch/SibSp values
# into single labels.
test['Age'] = np.where(test['Age'] < 1, 1, test['Age'])
test['Parch'] = np.where(test['Parch'] > 2, '3+', test['Parch'])
test['SibSp'] = np.where(test['SibSp'] > 1, '2+', test['SibSp'])

# Encode categoricals as integers. The result is assigned back to the column
# instead of calling `test.Sex.replace(..., inplace=True)`: an in-place method
# on a column accessor is chained assignment, which pandas deprecates and
# which does not modify `test` when Copy-on-Write is enabled.
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})
test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

### Random Forest Classification:

In [84]:
# Target vector: the survival label from the training frame.
y = train["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
# One-hot encode any non-numeric feature columns (e.g. the binned
# Parch/SibSp labels).
x = pd.get_dummies(train[features])
# The test frame is encoded independently, so a category missing from one
# split would produce a different set of columns. Align the test matrix to the
# training columns (absent dummies become 0, unseen ones are dropped) so that
# predict() always receives the features the model was fitted on.
x_test = pd.get_dummies(test[features]).reindex(columns=x.columns, fill_value=0)

# Fixed random_state keeps the fitted forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(x, y)

In [85]:
# Predict the survival label for every row of the encoded test matrix and
# display the resulting array.
y_pred = model.predict(x_test)
y_pred