In [1]:
# Titanic Dataset Analysis

In [2]:
import pandas as pd
import matplotlib as plt
import os

In [3]:
# Work from the local Kaggle project directory.
# The path is a raw string so the Windows backslashes stay literal: in a normal
# string '\P' and '\K' are invalid escape sequences, which Python reports as a
# DeprecationWarning (SyntaxWarning on newer versions). The resulting path is
# identical to the one used before.
os.chdir(r'D:\Python\Kaggle')

In [4]:
## Import Machine learning and Metric Modules

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [5]:
## Load the Titanic train and test CSV files into DataFrames

file_path_train = r'D:\Python\Kaggle\titanic\train.csv'
file_path_test = r'D:\Python\Kaggle\titanic\test.csv'

# Both files are read with pandas' default CSV settings; `df` holds the
# training rows and `df_test` the rows to predict on.
df, df_test = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)


In [6]:
## Data Cleanup
## Handle missing values and one-hot encode the categorical columns

# Impute missing ages with the median age and missing embarkation values with
# the most frequent one.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# One-hot encode Sex and Embarked. Dummy columns left over from a previous run
# of this cell are dropped before concatenating, so re-running the cell does
# not append duplicate 'gender_*' / 'Embarked_*' columns to df (duplicates
# would otherwise end up in the predictor matrix).
gender_dummies = pd.get_dummies(df['Sex'], prefix='gender')
df = pd.concat(
    [df.drop(columns=gender_dummies.columns, errors='ignore'), gender_dummies],
    axis=1,
)

embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat(
    [df.drop(columns=embarked_dummies.columns, errors='ignore'), embarked_dummies],
    axis=1,
)

# Predictors are every remaining column except the target, the
# Name/PassengerId/Ticket/Cabin columns (not used as features), and the raw
# Embarked/Sex columns that were just encoded.
tt_df_predictors = df.drop(
    columns=['Survived', 'Name', 'PassengerId', 'Ticket', 'Cabin', 'Embarked', 'Sex']
)
tt_df_target = df['Survived']

In [7]:
# Hold out 25% of the rows as a test split; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    tt_df_predictors,
    tt_df_target,
    random_state=123,
    test_size=0.25,
)

In [8]:
# Logistic Regression: estimate accuracy with 5-fold cross-validation on the
# training split. Only cross_val_score is run here; `logreg` is never fitted
# directly in this cell and the hold-out split (X_test / y_test) is not used.
logreg = LogisticRegression(max_iter=500)

cv_scores_log = cross_val_score(
    estimator=logreg,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-validation accuracy scores: {cv_scores_log}")

Cross-validation accuracy scores: [0.81343284 0.7238806  0.79104478 0.79699248 0.81203008]


In [9]:
# Decision Tree: estimate accuracy with 5-fold cross-validation on the
# training split. Only cross_val_score is run here; `tree_model` is never
# fitted directly in this cell and the hold-out split is not used.
#
# random_state is fixed (same seed as the train/test split and the random
# forest) so the scores are reproducible on Restart & Run All; an unseeded
# tree can give different scores from run to run.
tree_model = DecisionTreeClassifier(random_state=123)

cv_scores_dt = cross_val_score(tree_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy scores: {cv_scores_dt}")

Cross-validation accuracy scores: [0.85074627 0.75373134 0.79104478 0.79699248 0.7443609 ]


In [10]:
# Random Forest (100 trees, fixed seed): estimate accuracy with 5-fold
# cross-validation on the training split. `forest_model` is not fitted
# directly in this cell and the hold-out split is not used.
forest_model = RandomForestClassifier(random_state=123, n_estimators=100)

cv_scores_forest = cross_val_score(
    estimator=forest_model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-validation accuracy scores: {cv_scores_forest}")

Cross-validation accuracy scores: [0.82089552 0.73880597 0.79850746 0.81954887 0.78195489]


In [11]:
## Transform Test Dataset
## Apply the same cleanup and encoding steps that were applied to the training data

# Impute missing values using the test set's own median / mode.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())
df_test['Embarked'] = df_test['Embarked'].fillna(df_test['Embarked'].mode()[0])
# Fare is kept as a predictor but was not imputed before. The Kaggle Titanic
# test file is known to contain a missing Fare (NOTE(review): confirm against
# the local copy), and a NaN in the feature matrix can make predict() fail.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

# One-hot encode Sex and Embarked. Dummy columns left over from a previous run
# of this cell are dropped first so re-running does not duplicate them.
gender_dummies = pd.get_dummies(df_test['Sex'], prefix='gender')
df_test = pd.concat(
    [df_test.drop(columns=gender_dummies.columns, errors='ignore'), gender_dummies],
    axis=1,
)

embarked_dummies = pd.get_dummies(df_test['Embarked'], prefix='Embarked')
df_test = pd.concat(
    [df_test.drop(columns=embarked_dummies.columns, errors='ignore'), embarked_dummies],
    axis=1,
)

tt_df_test_predictors = df_test.drop(
    columns=['Name', 'PassengerId', 'Ticket', 'Cabin', 'Embarked', 'Sex']
)

# Align with the exact columns (and column order) of the training predictors.
# The dummies above are derived from the test data alone, so a category that
# is absent from the test file would otherwise leave a column missing; such
# columns are added as all-zero, and columns unseen in training are dropped.
tt_df_test_predictors = tt_df_test_predictors.reindex(
    columns=tt_df_predictors.columns, fill_value=0
)


In [12]:

# Train the final model on the entire labelled training dataset
# (tt_df_predictors / tt_df_target). Previously this fitted only on X_train,
# i.e. the 75% split, which contradicted the stated intent and left a quarter
# of the labelled rows unused for the model that produces the predictions.
forest_model.fit(tt_df_predictors, tt_df_target)


In [13]:
## Predict on the transformed test set

# Uses the fitted random forest and the test predictor frame built in the
# "Transform Test Dataset" cell.
predictions = forest_model.predict(X=tt_df_test_predictors)