In [1]:
import pandas as pd
import numpy as np
import sys
import os
import random
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

In [2]:
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Overview

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
test.info

<bound method DataFrame.info of      PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  


In [6]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


# Some Statistical

## Missing Value

In [8]:
sum_null = lambda x: x.isnull().sum()
count_null = lambda x: x.isnull().count()
percentage = lambda x: (sum_null(x) / count_null(x) * 100)

def miss_data(x):
    total_percent = pd.concat([sum_null(x), percentage(x)], axis=1, keys=["Total", "Percent"])
    types = x.dtypes.rename("Types")
    result = pd.concat([total_percent, types], axis=1)
    return result


In [9]:
miss_data(train)

Unnamed: 0,Total,Percent,Types
PassengerId,0,0.0,int64
Survived,0,0.0,int64
Pclass,0,0.0,int64
Name,0,0.0,object
Sex,0,0.0,object
Age,177,19.86532,float64
SibSp,0,0.0,int64
Parch,0,0.0,int64
Ticket,0,0.0,object
Fare,0,0.0,float64


In [10]:
miss_data(test)

Unnamed: 0,Total,Percent,Types
PassengerId,0,0.0,int64
Pclass,0,0.0,int64
Name,0,0.0,object
Sex,0,0.0,object
Age,86,20.574163,float64
SibSp,0,0.0,int64
Parch,0,0.0,int64
Ticket,0,0.0,object
Fare,1,0.239234,float64
Cabin,327,78.229665,object


## Most Frequent data

In [11]:
def most_frequent_values(data):
    total = len(data)
    total_column = pd.DataFrame(data.count(), columns=['Total'])
    most_frequent_column = data.apply(lambda col: col.value_counts().idxmax())
    frequency_column = data.apply(lambda col: col.value_counts().max())
    percent_column = np.round(frequency_column / total * 100, 3)

    result = pd.DataFrame({
        'Total': total_column['Total'],
        'Most frequent item': most_frequent_column,
        'Frequency': frequency_column,
        'Percent from total': percent_column
    })

    return result

In [12]:
most_frequent_values(train)

Unnamed: 0,Total,Most frequent item,Frequency,Percent from total
PassengerId,891,1,1,0.112
Survived,891,0,549,61.616
Pclass,891,3,491,55.107
Name,891,"Braund, Mr. Owen Harris",1,0.112
Sex,891,male,577,64.759
Age,714,24.0,30,3.367
SibSp,891,0,608,68.238
Parch,891,0,678,76.094
Ticket,891,347082,7,0.786
Fare,891,8.05,43,4.826


In [13]:
most_frequent_values(test)

Unnamed: 0,Total,Most frequent item,Frequency,Percent from total
PassengerId,418,892,1,0.239
Pclass,418,3,218,52.153
Name,418,"Kelly, Mr. James",1,0.239
Sex,418,male,266,63.636
Age,332,21.0,17,4.067
SibSp,418,0,283,67.703
Parch,418,0,324,77.512
Ticket,418,PC 17608,5,1.196
Fare,417,7.75,21,5.024
Cabin,91,B57 B59 B63 B66,3,0.718


## Unique Values

In [14]:
def unique_values(data):
    total = len(data)
    uniques = data.apply(lambda col: col.nunique())
    
    result = pd.DataFrame({
        'Total': total,
        'Uniques': uniques
    }).transpose()

    return result

In [15]:
unique_values(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,891,891,891,891,891,891,891,891,891,891,891,891
Uniques,891,2,3,891,2,88,7,7,681,248,147,3


In [16]:
unique_values(test)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,418,418,418,418,418,418,418,418,418,418,418
Uniques,418,3,418,2,79,7,8,363,169,76,3
