<a href="https://colab.research.google.com/github/kavmut/Titanic-Survival-Prediction/blob/master/Titanic_Survival_Prediction_Using_Mashine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Description: This program predicts whether a passenger would survive on the Titanic

In [None]:
# Import The Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Load the 'titanic' example dataset through seaborn
titanic = sns.load_dataset('titanic')

# Show the first 10 rows of the data
titanic.head(10)

In [None]:
# Count the number of rows and columns in the data set, as (rows, columns)
titanic.shape

In [None]:
# Get some summary statistics of the data
titanic.describe()

In [None]:
# Get a count of each value in the 'survived' column (survivors vs. non-survivors)
titanic['survived'].value_counts()

In [None]:
# Visualize the count of survivors
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='survived', data=titanic)

In [None]:
# Visualize the count of survivors for the columns 'who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked'

cols = ['who', 'sex', 'pclass', 'sibsp', 'parch', 'embarked']

n_rows = 2
n_cols = 3

# The subplot grid and figure size of each graph
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3.2, n_rows * 3.2))

# axs.flat walks the grid row by row, so every column keeps the same position
# it had with the manual r * n_cols + c index arithmetic.
for ax, col in zip(axs.flat, cols):
  # Keyword arguments are used because recent seaborn releases treat the first
  # positional argument as `data`, not as the x variable.
  sns.countplot(x=col, hue='survived', data=titanic, ax=ax)
  ax.set_title(col)
  ax.legend(title='survived', loc='upper right')

plt.tight_layout()



In [None]:
# Look at survival rate by sex (the mean of the 'survived' column within each group)
titanic.groupby('sex')[['survived']].mean()

In [None]:
# Survival rate for every (sex, class) combination: rows are sex, columns are class
titanic.pivot_table(values='survived', index='sex', columns='class')

In [None]:
# Plot the same sex-by-class survival table, one line per class
titanic.pivot_table(values='survived', index='sex', columns='class').plot()

In [None]:
# Plot the survival rate of each passenger class
sns.barplot(data=titanic, x='class', y='survived')

In [None]:
# Survival rate broken down by sex, age band and class
# Ages are binned into two bands using the edges 0, 18 and 80
age = pd.cut(titanic['age'], bins=[0, 18, 80])
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

In [None]:
# Scatter the fare each passenger paid against their class
plt.scatter(
  x=titanic['fare'],
  y=titanic['class'],
  color='purple',
  label='Passenger Paid',
)
plt.title('Price of Each Class')
plt.xlabel('Price / Fare')
plt.ylabel('Class')
plt.legend()
plt.show()

In [None]:
# Count the empty (NaN) values in each column
titanic.isna().sum()

In [None]:
# Show how often each distinct value occurs, one column at a time,
# with a blank line after every column's counts
for column in titanic.columns:
  print(titanic[column].value_counts(), end='\n\n')

In [None]:
# Drop the columns that are not used as model features, then remove the rows
# that have a missing value in 'embarked' or 'age'
titanic = (
  titanic
  .drop(columns=['deck', 'embark_town', 'alive', 'class', 'who', 'alone', 'adult_male'])
  .dropna(subset=['embarked', 'age'])
)

In [None]:
# Count the new number of rows and columns in the data set, as (rows, columns)
titanic.shape

In [None]:
# Look at the data type of each remaining column
titanic.dtypes

In [None]:
# Print the unique values of the 'sex' and 'embarked' columns
for column in ('sex', 'embarked'):
  print(titanic[column].unique())

In [None]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()

# Encode the 'sex' and 'embarked' columns as integer labels.
# The columns are addressed by name rather than by position, so the encoding
# does not silently hit the wrong column if the column order ever changes.
# assign() builds new columns, so they take the integer dtype of the encoder
# output instead of being written into the existing object-dtype columns.
# fit_transform() re-fits the encoder on every call, so after this cell
# labelEncoder holds the classes of 'embarked' (the last column encoded).
titanic = titanic.assign(
  sex=labelEncoder.fit_transform(titanic['sex'].values),
  embarked=labelEncoder.fit_transform(titanic['embarked'].values),
)

In [None]:
# Print the unique values of the 'sex' and 'embarked' columns
for column in ('sex', 'embarked'):
  print(titanic[column].unique())

In [None]:
# Look at the data types again, after the encoding step
titanic.dtypes

In [None]:
# Split the data into independent 'X' and dependent 'Y' variables:
# column 0 is the target, columns 1 through 7 are the features
features = titanic.iloc[:, 1:8]
target = titanic.iloc[:, 0]
X, Y = features.values, target.values

In [None]:
# Split the dataset into 80% training and 20% testing
# random_state is fixed so the split is the same on every run
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Scale the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the mean and standard deviation from the training set only ...
X_train = sc.fit_transform(X_train)
# ... and apply those same statistics to the test set. Re-fitting on the test
# set would scale it differently from the data the models are trained on.
X_test = sc.transform(X_test)

In [None]:
# Create a function that trains many machine learning models
def models(X_train, Y_train):
  """Fit seven scikit-learn classifiers on the same training data.

  Args:
    X_train: Training feature matrix.
    Y_train: Training labels, one per row of X_train.

  Returns:
    A tuple of the fitted estimators in this order: logistic regression,
    k-nearest neighbours, linear-kernel SVC, RBF-kernel SVC, Gaussian naive
    Bayes, decision tree, random forest.

  Side effects:
    Prints the training-set accuracy of every model.
  """
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.naive_bayes import GaussianNB
  from sklearn.neighbors import KNeighborsClassifier
  from sklearn.svm import SVC
  from sklearn.tree import DecisionTreeClassifier

  # (report label, estimator) pairs; the position in this list is also the
  # position of the estimator in the returned tuple.
  candidates = [
    ('[0]Logistic Regression Training Accuracy: ',
     LogisticRegression(random_state=0)),
    ('[1]K Neighbors Training Accuracy: ',
     KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
    ('[2]SVC Linear Training Accuracy: ',
     SVC(kernel='linear', random_state=0)),
    ('[3]SVC RBF Training Accuracy: ',
     SVC(kernel='rbf', random_state=0)),
    ('[4]Gaussian NB Training Accuracy: ',
     GaussianNB()),
    ('[5]Decision Tree Training Accuracy: ',
     DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('[6]Random Forest Training Accuracy: ',
     RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
  ]

  # Train every model first ...
  for _, estimator in candidates:
    estimator.fit(X_train, Y_train)

  # ... then print the training accuracy for each model
  for label, estimator in candidates:
    print(label, estimator.score(X_train, Y_train))

  return tuple(estimator for _, estimator in candidates)



In [None]:
# Get and train all of the models; `model` is the tuple returned by models(),
# so individual classifiers are accessed by index (e.g. model[6] is the random forest)
model = models(X_train, Y_train)

In [None]:
# Show the confusion matrix and accuracy for all of the models on the test data
from sklearn.metrics import confusion_matrix

for i, clf in enumerate(model):
  # Predict and build the matrix once per model, then reuse it below.
  # labels=[0, 1] pins the 2x2 layout so ravel() always yields four values.
  cm = confusion_matrix(Y_test, clf.predict(X_test), labels=[0, 1])

  # Extract TN, FP, FN, TP
  TN, FP, FN, TP = cm.ravel()

  # Accuracy = correct predictions / all predictions
  test_score = (TP + TN) / (TP + TN + FN + FP)

  print(cm)
  print('Model[{}] Testing Accuracy = "{}"'.format(i, test_score))
  print()

In [None]:
# Rank the features by the random forest's importance scores
forest = model[6]
feature_names = titanic.columns[1:8]  # the same columns that were used to build X
importances = (
  pd.DataFrame({'feature': feature_names, 'importance': np.round(forest.feature_importances_, 3)})
  .sort_values('importance', ascending=False)
  .set_index('feature')
)
importances

In [None]:
# Draw the feature importances as a bar chart
importances.plot(kind='bar')

In [None]:
# Predict the test set with the random forest classifier and print the
# predictions, followed by a blank line
pred = model[6].predict(X_test)
print(pred, end='\n\n')

# Print the actual values for comparison
print(Y_test)

In [None]:
# My survival
# The values must follow the column order of titanic.iloc[:, 1:8], i.e. the
# features the models were trained on.
my_survival = [[3, 1, 12, 10, 8, 0, 1 ]]

# Scaling my survival
# Reuse the scaler `sc` that was already fitted earlier in the notebook.
# Fitting a new StandardScaler on this single row would subtract the row from
# itself, turning every feature into 0 and making the prediction ignore the input.
my_survival_scaled = sc.transform(my_survival)

# Print prediction of my survival using Random Forest Classifier
pred = model[6].predict(my_survival_scaled)
print(pred)

# pred is an array holding a single prediction, so compare its only element
if pred[0] == 0:
  print('Oh no! You died')
else:
  print('Good Job! You survived! And you did not Die!')

