In [87]:
#Loading all Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [88]:
# Load the training set from the CSV file into a DataFrame
Train_data = pd.read_csv("./Data/train.csv")
# Preview the first rows to check the columns and their values
Train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Data Analysis

In [89]:
# Column dtypes and non-null counts: a quick way to spot columns with missing data
Train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


- The info summary shows null values in the "Age", "Cabin" and "Embarked" columns.

In [90]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
Train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [91]:
# Dimensions of the DataFrame: the raw (rows, columns) tuple, then each axis on its own
n_rows, n_cols = Train_data.shape
print(Train_data.shape)
print("Number of Rows ->", n_rows)
print("Number of Columns ->", n_cols)

(891, 12)
Number of Rows -> 891
Number of Columns -> 12


In [92]:
# Number of missing (null / NaN) entries in every column
Train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

- Three features, "Age", "Cabin" and "Embarked", contain NaN values.

In [93]:
# Report the missing-value count for each column that contains nulls
for column in ("Age", "Cabin", "Embarked"):
    missing = Train_data[column].isnull().sum()
    print("Count of Null Values in " + column + " feature : ", missing)

Count of Null Values in Age feature :  177
Count of Null Values in Cabin feature :  687
Count of Null Values in Embarked feature :  2


In [94]:
# Distinct values present in the "Sex" feature
Train_data["Sex"].unique()

array(['male', 'female'], dtype=object)

In [95]:
# Rows whose "Age" value is missing
missing_age_mask = Train_data["Age"].isnull()
Train_data[missing_age_mask]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [96]:
# Mean of the non-null values in the "Age" feature.
# Series.count() and Series.mean() both skip NaN by default, so no manual
# null filtering or hand-rolled sum/count division is needed.
non_null_count = Train_data["Age"].count()
avg_age = Train_data["Age"].mean()
print(avg_age)

29.69911764705882


In [97]:
#finding the unique values in the "Cabin" feature
display(pd.unique(Train_data["Cabin"]))
print("Total unique values in the cabin feature" ,len(pd.unique(Train_data["Cabin"])))

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

Total unique values in the cabin feature 148


In [98]:
pd.unique(Train_data["Embarked"])  # finding the unique values in the "Embarked" feature (NaN is listed as a value)

array(['S', 'C', 'Q', nan], dtype=object)

In [99]:
# Count how often each value ('S', 'C', 'Q') and the missing value occur in "Embarked"
embarked = Train_data["Embarked"]
for value in ("S", "C", "Q"):
    print("Occurence of the value:'" + value + "' in the Embarked feature :", (embarked == value).sum())
# NaN never compares equal to anything, so it is counted with isna() instead of ==
print("Occurence of the value:'nan' in the Embarked feature :", embarked.isna().sum())



Occurence of the value:'S' in the Embarked feature : 644
Occurence of the value:'C' in the Embarked feature : 168
Occurence of the value:'Q' in the Embarked feature : 77
Occurence of the value:'nan' in the Embarked feature : 2


- The "Cabin" feature is not required: it contains many distinct values and a large number of null values, so we will drop it.
- The "Embarked" feature might be required, as it has only 3 unique values and 2 null values; if it turns out not to be needed, we will drop it later.
- The "Age" feature might affect the target "Survived", so we should try using it.
- The "Age" feature has 177 null (NaN) values, which can be filled with the average age.
- The "Sex" feature might also affect the target "Survived", so we can use it.