# Data Filtering

## Titanic Dataset

In [164]:
import pandas as pd
import numpy as np
import time

In [165]:
df = pd.read_csv("../data/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [166]:
# checking the details of the dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [167]:
# checking the data types of the different fields/columns

df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Checking missing values

In [168]:
# checking if each cell is Null or not.
# it is True if it is Null and it is False if it is not null.

df.isna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [169]:
# True means 1 and False means 0
# if sum all the True values, it is basically the number of Null records
# the following command shows us how many missing values (Null) exist in the dataframe

df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [170]:
# it describes the dataframe, reporting mean, std, min, max, etc.
# it only reports numerical columns

df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Handling Missing Records

In [171]:
# removing records with the missing values

df = df.dropna(subset=["Embarked"])
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [172]:
df.shape

(889, 12)

In [173]:
# removing/dropping a column entirely

df.drop(columns=["Cabin"], inplace=True)
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64

In [174]:
# getting the average age

average_age = df["Age"].mean()
average_age

29.64209269662921

In [175]:
# replacing the missing ages with previous non-null value
# this is similar to random filling if the data is not sorted by Age

df["Age"].fillna(method="ffill", inplace=True)
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [176]:
df.shape

(889, 11)

In [177]:
# getting a new describe after handling the missing values
# now we can compare if the mean and std of the columns with missing values are changed

df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,446.0,0.382452,2.311586,29.535624,0.524184,0.382452,32.096681
std,256.998173,0.48626,0.8347,14.527483,1.103705,0.806761,49.697504
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,224.0,0.0,2.0,20.0,0.0,0.0,7.8958
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.0,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Saving Clean Dataset

In [178]:
df.to_csv("../data/titanic-clean.csv", index=False)

## DUMMY

In [179]:
df_dummy = df[["Survived", "Pclass", "Age", "Sex", "SibSp", "Fare"]]
df_dummy.head()

Unnamed: 0,Survived,Pclass,Age,Sex,SibSp,Fare
0,0,3,22.0,male,1,7.25
1,1,1,38.0,female,1,71.2833
2,1,3,26.0,female,0,7.925
3,1,1,35.0,female,1,53.1
4,0,3,35.0,male,0,8.05


In [180]:
df_dummy.dtypes

Survived      int64
Pclass        int64
Age         float64
Sex          object
SibSp         int64
Fare        float64
dtype: object

In [181]:
df_dummy = df_dummy.astype({"Pclass": "category",
                "Sex": "category"})
df_dummy.dtypes

Survived       int64
Pclass      category
Age          float64
Sex         category
SibSp          int64
Fare         float64
dtype: object

In [182]:
df_dummy = pd.get_dummies(df_dummy)
df_dummy.head()

Unnamed: 0,Survived,Age,SibSp,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,0,22.0,1,7.25,0,0,1,0,1
1,1,38.0,1,71.2833,1,0,0,1,0
2,1,26.0,0,7.925,0,0,1,1,0
3,1,35.0,1,53.1,1,0,0,1,0
4,0,35.0,0,8.05,0,0,1,0,1


## Reading Cells

In [183]:
df.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            881, 882, 883, 884, 885, 886, 887, 888, 889, 890],
           dtype='int64', length=889)

In [184]:
df.loc[890]

PassengerId                    891
Survived                         0
Pclass                           3
Name           Dooley, Mr. Patrick
Sex                           male
Age                           32.0
SibSp                            0
Parch                            0
Ticket                      370376
Fare                          7.75
Embarked                         Q
Name: 890, dtype: object

In [185]:
df.iloc[-1]

PassengerId                    891
Survived                         0
Pclass                           3
Name           Dooley, Mr. Patrick
Sex                           male
Age                           32.0
SibSp                            0
Parch                            0
Ticket                      370376
Fare                          7.75
Embarked                         Q
Name: 890, dtype: object

In [186]:
index_list = df.index
for i in index_list:
    df.loc[0]
    
for i in range(10):
    df.iloc[i]
    

In [187]:
# reading a cell
# row 2, column: Fare

fare = df.loc[2, "Fare"]
fare

7.925

In [188]:
# updating a cell
# row 2, column: Fare

df.loc[2, "Fare"] = 10

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


## Rows Iteration

In [189]:
df["NewFare"] = 0
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


In [190]:
for index, row in df.iterrows():
    if row["Pclass"] == 1:
        df.loc[index, "NewFare"] = row["Fare"] * 2
    else:
        df.loc[index, "NewFare"] = row["Fare"] * 0.5
    
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,3.625
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,142.5666
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,5.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,106.2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,4.025


## Updating a column data

In [191]:
df["NewFare"] = 0
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


In [192]:
df[df["Pclass"] == 1]["NewFare"] = df[df["Pclass"] == 1]["Fare"] * 2
df[df["Pclass"] != 1]["NewFare"] = df[df["Pclass"] != 1]["Fare"] * 0.5
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df["Pclass"] == 1]["NewFare"] = df[df["Pclass"] == 1]["Fare"] * 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df["Pclass"] != 1]["NewFare"] = df[df["Pclass"] != 1]["Fare"] * 0.5


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0


In [193]:
df["NewFare"] = np.where(df["Pclass"] == 1, df["Fare"] * 2, df["Fare"] * 0.5)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,3.625
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,142.5666
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,5.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,106.2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,4.025


In [194]:
df["NewFare"] = np.where(df["Pclass"] == 1, df["Fare"] * 2, df["NewFare"])
df["NewFare"] = np.where(df["Pclass"] == 2, df["Fare"] * 0.5, df["NewFare"])
df["NewFare"] = np.where(df["Pclass"] == 3, df["Fare"] * 0.5, df["NewFare"])

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,3.625
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,142.5666
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,5.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,106.2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,4.025


In [203]:
# Create a new column called 'Gender'
# If Sex = 'Male' Then Gender = 1
# Else Gender = 0

t0 = time.time()

df["Gender"] = 0

for index, row in df.iterrows():
    if row["Sex"] == 'male':
        df.loc[index, "Gender"] = 1
    else:
        df.loc[index, "Gender"] = 0

t1 = time.time()
print(f"It took {t1 - t0:.5f} seconds.")

df.head()

It took 0.21736 seconds.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare,Gender
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,3.625,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,142.5666,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,5.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,106.2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,4.025,1


In [209]:
t0 = time.time()

df["Gender"] = np.where(df["Sex"] == 'male', 1, 0)

t1 = time.time()
print(f"It took {t1 - t0:.5f} seconds.")

df.head()

It took 0.00101 seconds.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,NewFare,Gender
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,3.625,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,142.5666,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,10.0,S,5.0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,106.2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,4.025,1


## Age Cleaning

In [211]:
# there are two methods to get the quantiles of a column in a dataframe

q1_1 = np.quantile(df["Age"], 0.25)
q1_2 = df["Age"].quantile(0.25)

print(q1_1)
print(q1_2)

20.0
20.0


## IQR - Outlier Removal

In [213]:
q1 = np.quantile(df["Age"], 0.25)
q3 = np.quantile(df["Age"], 0.75)

iqr = q3 - q1
lower = q1 - (1.5 * iqr)
upper = q3 + (1.5 * iqr)

print(f"Lower Bound: {lower}")
print(f"Upper Bound: {upper}")

Lower Bound: -7.0
Upper Bound: 65.0


In [214]:
df.shape

(889, 13)

In [216]:
# removing the ages below or over a certain threshold

df = df[(df["Age"] <= upper) & (df["Age"] >= lower)]
df.shape

(881, 13)

## Getting Unique Values

In [226]:
df["Sex"].unique()

array(['male', 'female'], dtype=object)