# Pandas Coding Questions on Seaborn's Titanic Dataset

In [1]:
import pandas as pd

### Question 1: Import the pandas library and load the Titanic dataset from Seaborn into a DataFrame. Display the first 5 rows of the DataFrame.

In [16]:
import seaborn as sns

# Load the Titanic dataset
titanic = sns.load_dataset('titanic')

# Display the first 5 rows of the DataFrame
print(titanic.head()) # important
titanic.head()

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Question 2: How many passengers are there in each class?

In [3]:
titanic.groupby('class')['class'].count()

class
First     216
Second    184
Third     491
Name: class, dtype: int64

In [4]:
titanic['class'].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

### Question 3: What is the average age of passengers who survived?

In [5]:
titanic[titanic['survived'] == 1]['age'].mean()

28.343689655172415

### Question 4: Display the survival count of passengers based on gender and class.

In [10]:
titanic.groupby(['class', 'sex'])['survived'].count()

class   sex   
First   female    91
        male      45
Second  female    70
        male      17
Third   female    72
        male      47
Name: survived, dtype: int64

In [12]:
titanic.groupby(['sex', 'class'])['survived'].sum()

sex     class 
female  First     91
        Second    70
        Third     72
male    First     45
        Second    17
        Third     47
Name: survived, dtype: int64

### Question 5: What is the average fare paid by passengers in each class?

In [13]:
titanic.groupby(['class'])['fare'].mean()

class
First     84.154687
Second    20.662183
Third     13.675550
Name: fare, dtype: float64

### Question 6: Create a new column 'child' that indicates whether the passenger is a child (age < 18).

In [23]:
titanic['child'] = titanic['age'] < 18
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,child
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,False


### Question 7: What is the survival rate for children versus adults?

In [37]:
titanic.groupby('child')['survived'].mean()

child
False    0.361183
True     0.539823
Name: survived, dtype: float64

### Question 8: How many passengers embarked from each port?

In [40]:
titanic['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [41]:
titanic['embark_town'].value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

### Question 9: Find the number of missing values in each column.

In [44]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
child            0
dtype: int64

### Question 10: Display the age distribution of passengers.

In [46]:
titanic['age'].describe() # review

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

### Question 11: Calculate the survival rate based on the fare quartiles.

In [52]:
# use pd.qcut to divide the 'fare' column into quartiles
# quartiles is the original data divelided into 4 equal space block
fare_survival = titanic[['fare', 'survived']].copy()
fare_survival['fare_quartiles'] = pd.qcut(titanic['fare'], q=4)
fare_survival.groupby('fare_quartiles')['survived'].mean()

fare_quartiles
(-0.001, 7.91]     0.197309
(7.91, 14.454]     0.303571
(14.454, 31.0]     0.454955
(31.0, 512.329]    0.581081
Name: survived, dtype: float64

### Question 12: Does the cabin type affect the survival rate? Assume the first letter of the cabin indicates the cabin type.

In [53]:
# .str: The .str accessor is used to treat the column as a series of strings,
# allowing you to perform string operations on it.
titanic['carbin_type'] = titanic['cabin'].str[0]
titanic.groupby('cabin_type')['survived'].mean()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,child
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,False
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,False


In [54]:
titanic.groupby('embarked')['survived'].mean()

embarked
C    0.553571
Q    0.389610
S    0.336957
Name: survived, dtype: float64

### Question 13: Find the average age of passengers, grouped by survival and class.

In [56]:
titanic.groupby(['survived', 'class'])['age'].mean()

survived  class 
0         First     43.695312
          Second    33.544444
          Third     26.555556
1         First     35.368197
          Second    25.901566
          Third     20.646118
Name: age, dtype: float64

### Question 14: Calculate the number of surviving and non-surviving passengers, grouped by gender and class.

In [60]:
titanic.groupby(['sex', 'class', 'survived']).size() # review

sex     class   survived
female  First   0             3
                1            91
        Second  0             6
                1            70
        Third   0            72
                1            72
male    First   0            77
                1            45
        Second  0            91
                1            17
        Third   0           300
                1            47
dtype: int64

### Question 15: Identify the most common embarkation port among the survivors.

In [75]:
titanic[titanic['survived'] == 1]['embark_town'].mode()[0] # review

'Southampton'

### Question 16: Determine the proportion of passengers by gender.

In [81]:
titanic['sex'].value_counts(normalize=True) # review

male      0.647587
female    0.352413
Name: sex, dtype: float64

In [82]:
titanic['sex'].value_counts() / len(titanic)

male      0.647587
female    0.352413
Name: sex, dtype: float64

### Question 17: Calculate the average fare and age, grouped by survival and gender.

In [84]:
titanic.groupby(['survived', 'sex'])['fare', 'age'].mean() # easier

  titanic.groupby(['survived', 'sex'])['fare', 'age'].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,fare,age
survived,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
0,female,23.024385,25.046875
0,male,21.960993,31.618056
1,female,51.938573,28.847716
1,male,40.821484,27.276022


In [85]:
titanic.groupby(['survived', 'sex']).agg({'fare': 'mean', 'age': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,age
survived,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
0,female,23.024385,25.046875
0,male,21.960993,31.618056
1,female,51.938573,28.847716
1,male,40.821484,27.276022


### Question 18: Identify the passenger with the highest fare.

In [87]:
titanic[titanic['fare'] == titanic['fare'].max()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,child
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True,False
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False,False
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True,False


### Question 19: Calculate the total number of family members onboard for each passenger and create a new column 'family_size'.

In [88]:
# Pclass (Passenger Class):
# SibSp (Siblings/Spouses Aboard)
# "Parch" stands for "Parents/Children Aboard."
titanic['family_size'] = titanic['sibsp'] + titanic['parch']
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,child,family_size
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,False,1
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,False,0
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,False,1
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,False,0
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,False,0
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,False,3
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,False,0


### Question 20: Find the survival rate of passengers based on the number of family members onboard.

In [89]:
titanic.groupby('family_size')['survived'].mean()

family_size
0     0.303538
1     0.552795
2     0.578431
3     0.724138
4     0.200000
5     0.136364
6     0.333333
7     0.000000
10    0.000000
Name: survived, dtype: float64

In [91]:
# Question 20: Survival rate of passengers with family vs. alone
# titanic['alone'] = titanic['family_size'] == 0
titanic.groupby('alone')['survived'].mean()

alone
False    0.505650
True     0.303538
Name: survived, dtype: float64