In [None]:
# Import Related Packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training and test CSV files from the project directory
project_dir = '../Titanic/'
train_set, test_set = (
    pd.read_csv(project_dir + csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first five rows of the training set
train_set.head(n=5)

In [None]:
# Print a concise summary of the training frame: column names, non-null counts and dtypes
train_set.info()

In [None]:
# Count the missing values in every column of the training set
train_set.isna().sum()

In [None]:
# Count the number of distinct (non-null) values in each column
train_set.nunique()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the training set
train_set.describe()

In [None]:
# Class balance check: number of survivors and their share of the training set
survived_mask = train_set['Survived'] == 1
num_survived = train_set.loc[survived_mask, 'Survived'].count()
print(f'Total number of survived passangers is {num_survived}')
print(f'Survival rate is {num_survived/len(train_set)}')

In [None]:
# Draw the correlation matrix of the numeric training columns.
# In pandas >= 2.0, DataFrame.corr() no longer drops non-numeric columns silently and
# raises on text columns (e.g. Sex, Embarked), so select the numeric dtypes explicitly.
correlation_matrix = train_set.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation matrix (training set)')
plt.show()

In [None]:
# Choose the predictor columns and the label column, then split the frame accordingly
features = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked',
]
target = 'Survived'
X, y = train_set[features], train_set[target]

In [None]:
# One-hot encode the categorical variables.
# Encode straight from train_set[features] rather than from X, so re-running this cell
# does not fail with a KeyError on columns that a previous run already replaced.
X = pd.get_dummies(train_set[features], columns=['Pclass', 'Sex', 'Embarked'])
# Fill nulls with the column means (assignment instead of inplace keeps the step explicit).
# NOTE(review): the means are computed on the whole training set, so the cross-validation
# folds below share imputation statistics.
X = X.fillna(X.mean())

In [None]:
# Baseline: shuffled K-fold cross-validation of a random forest
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

accuracies = []
for fit_rows, held_out_rows in kf.split(X):
    # A fresh forest per fold, trained only on that fold's training rows
    fold_model = RandomForestClassifier(n_estimators=100, random_state=42)
    fold_model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    # Score the fold on the rows that were held out
    fold_predictions = fold_model.predict(X.iloc[held_out_rows])
    accuracies.append(accuracy_score(y.iloc[held_out_rows], fold_predictions))

average_accuracy = np.mean(accuracies)
print("Average accuracy is: ", average_accuracy)
print("Standard deviation of accuracy is: ", np.std(accuracies))

In [None]:
# Define the hyperparameters and their distributions
# (scipy's randint(low, high) samples integers from low up to, but excluding, high)
param_dist = {
    'n_estimators': stats.randint(100, 500),
    # Unlimited depth, or 10, 20, ..., 100; plain Python ints keep best_params_ readable
    'max_depth': [None] + list(range(10, 110, 10)),
    'min_samples_split': stats.randint(2, 11),
    'min_samples_leaf': stats.randint(1, 5)
}
# Create the classifier to tune; seeded so each candidate's forest is reproducible
clf = RandomForestClassifier(random_state=42)
# Instantiate the RandomizedSearchCV object; random_state fixes which candidates are sampled,
# so a fresh kernel run reports the same best hyperparameters and score
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
# Fit the random search to the training data
random_search.fit(X, y)
# Print the best hyperparameters and corresponding score
print("Best Hyperparameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

In [None]:
# Build the final model from the best hyperparameters found by the random search.
# Unpacking best_params_ passes every tuned parameter without listing each key by hand,
# and the fixed random_state makes the final fit (and its predictions) reproducible.
clf = RandomForestClassifier(**random_search.best_params_, random_state=42)

In [None]:
# Fit the final classifier (not the random search) on the full training data
clf.fit(X, y)

In [None]:
# Build the test feature matrix with the same preprocessing as the training data
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# One-hot encoding for categorical variables
X_test_set = pd.get_dummies(test_set[features], columns=['Pclass', 'Sex', 'Embarked'])
# Align to the training columns (same set, same order): a category missing from the test
# data gets an all-zero column and a category unseen in training is dropped, so the
# classifier receives exactly the features it was fitted on
X_test_set = X_test_set.reindex(columns=X.columns, fill_value=0)
# Fill remaining nulls with the training-set column means
X_test_set = X_test_set.fillna(X.mean())

In [None]:
# Predict the target for every row of the prepared test feature matrix
y_test_pred = clf.predict(X_test_set)

In [None]:
# Append the predicted results to the testset as a new 'Survived' column (modifies test_set in place)
test_set['Survived'] = y_test_pred

In [None]:
# Draw the correlation matrix of the numeric test columns, including the predicted 'Survived'.
# In pandas >= 2.0, DataFrame.corr() no longer drops non-numeric columns silently and
# raises on text columns (e.g. Sex, Embarked), so select the numeric dtypes explicitly.
correlation_matrix = test_set.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation matrix (test set with predictions)')
plt.show()

In [None]:
# Summarise the predictions: number of predicted survivors and their share of the test set
predicted_survivor_mask = test_set['Survived'] == 1
num_survived = test_set.loc[predicted_survivor_mask, 'Survived'].count()
print(f'Total number of predicted survived passangers is {num_survived}')
print(f'Survival rate is {num_survived/len(test_set)}')

In [None]:
# Keep only the passenger id and the predicted label
Output = test_set.loc[:, ['PassengerId', 'Survived']]

In [None]:
# Write the predictions to 'Prediction.csv' in the project directory, without the index column
Output.to_csv(project_dir + 'Prediction.csv', index = False)