<a href="https://colab.research.google.com/github/kirstyabhus/Python-with-Machine-Learning/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from google.colab import files

# Ask for an upload only when train.csv is not already in the working directory.
# Re-uploading a file that already exists makes Colab save it under a new name
# (e.g. "train (1).csv"), while the rest of the notebook keeps reading the older
# train.csv -- so a second upload would silently have no effect.
# `uploaded` is an empty dict when the upload is skipped.
uploaded = files.upload() if not Path("train.csv").exists() else {}

Saving train.csv to train (1).csv


In [None]:
# Load the Titanic training data into a DataFrame.
dataset = pd.read_csv('train.csv')
# The data has 891 rows and 12 columns (see the preview and describe() outputs of the following cells).

In [None]:
# Preview the loaded DataFrame (the notebook display truncates the middle rows).
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Age has a count of 714 rather than 891, i.e. it contains missing values.
dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Count the NaN (missing) values in each column:
dataset.isna().sum()
# The output tells us that, out of 891 rows:
  # the AGE column has 177 missing values
  # the CABIN column has 687 missing values
  # the EMBARKED column has 2 missing values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Remove the columns we will not use: Cabin because it has too many NaN values,
# and PassengerId, Name and Ticket because they are not important for our question.
for unused_column in ("Cabin", "PassengerId", "Name", "Ticket"):
  del dataset[unused_column]

In [None]:
# Largest value in the Age column (the smallest is computed in the next cell).
dataset["Age"].max()

80.0

In [None]:
# Smallest value in the Age column.
dataset["Age"].min()
# The ages cover a very wide range (0.42 to 80), so the mean could be pulled towards the extreme values.
# We therefore use the median, which gives the value right in the middle of the data.
# This is our reason for choosing the median, but any of the statistical measures (mean, median or mode) could be used.

0.42

In [None]:
# Calculate the median of the Age column (the value that will replace the missing ages).
dataset["Age"].median()

28.0

In [None]:
# Fill the empty (NaN) values of Age with the median of the Age column.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on dataset["Age"]: that chained in-place call works on
# an intermediate object, raises a warning in recent pandas 2.x releases and,
# with Copy-on-Write enabled, leaves `dataset` unchanged.
# The assignment overwrites the Age column, so the original NaN values are gone afterwards.
dataset["Age"] = dataset["Age"].fillna(dataset["Age"].median())

In [None]:
# After filling the NaN values, re-check the number of NaN values per column.
dataset.isna().sum()
# The Age column no longer has NaN values (they've been filled with the median of the Age column).

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [None]:
# Embarked (the town that people embarked from) holds CATEGORICAL data (either S, C or Q)
# and has only 2 missing values.
# We'll find the town that MOST people embarked from (the mode).

dataset["Embarked"].mode()[0]
# mode() returns a Series (there can be more than one most-frequent value), so [0] selects the FIRST one.
# The result tells us that the most common town that people embarked from is "S".

'S'

In [None]:
# We assume that the 2 passengers with a missing Embarked value embarked from the
# most common town ("S", the mode), and fill their NaN values with it.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on dataset["Embarked"]: that chained in-place call works
# on an intermediate object, raises a warning in recent pandas 2.x releases and,
# with Copy-on-Write enabled, leaves `dataset` unchanged.
dataset["Embarked"] = dataset["Embarked"].fillna(dataset["Embarked"].mode()[0])

In [None]:
# After filling the NaN values, re-check the number of NaN values per column.
dataset.isna().sum()
# There are no longer any NaN values in the Embarked column.

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [None]:
# Preview the cleaned DataFrame (the notebook display truncates the middle rows).
dataset

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,28.0,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


Next session - encoding

In [None]:
# Using ONE-HOT ENCODING - We will encode the SEX, EMBARKED and PCLASS column.
# Encoding Sex will give us 2 extra columns - one for male and one for female
# Encoding Embarked will give us 3 extra columns - one for S, one for C, one for Q
# Encoding Pclass will give 3 extra columns - one for 1, one for 2, one for 3.
# This gives us 8 NEW COLUMNS in total; afterwards we will delete the original Sex, Embarked and Pclass columns.

In [None]:
# Build the ONE-HOT ENCODED columns (called DUMMIES).
# The names of the columns we are encoding:
cols = ["Pclass", "Sex", "Embarked"]

# pd.get_dummies() converts one categorical column into several indicator columns,
# one per category. We apply it to each column listed above and collect the results.
dummies = [pd.get_dummies(dataset[col]) for col in cols]

# Join the per-column dummy DataFrames side by side (axis=1) into a single DataFrame.
all_dummies = pd.concat(dummies, axis=1)
all_dummies



Unnamed: 0,1,2,3,female,male,C,Q,S
0,0,0,1,0,1,0,0,1
1,1,0,0,1,0,1,0,0
2,0,0,1,1,0,0,0,1
3,1,0,0,1,0,0,0,1
4,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...
886,0,1,0,0,1,0,0,1
887,1,0,0,1,0,0,0,1
888,0,0,1,1,0,0,0,1
889,1,0,0,0,1,1,0,0


In [None]:
# Append the dummy columns to the dataset (axis=1 concatenates along the columns),
# then drop the original categorical columns, which the encoded ones now replace.
dataset = (
  pd.concat((dataset, all_dummies), axis=1)
  .drop(columns=["Pclass", "Sex", "Embarked"])
)

In [None]:
# Preview the encoded DataFrame: the dummy columns have replaced Pclass, Sex and Embarked.
dataset

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,1,2,3,female,male,C,Q,S
0,0,22.0,1,0,7.2500,0,0,1,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,1,26.0,0,0,7.9250,0,0,1,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,1,0,0,0,1
4,0,35.0,0,0,8.0500,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,0,1,0,0,1,0,0,1
887,1,19.0,0,0,30.0000,1,0,0,1,0,0,0,1
888,0,28.0,1,2,23.4500,0,0,1,1,0,0,0,1
889,1,26.0,0,0,30.0000,1,0,0,0,1,1,0,0
