In [9]:
import pandas as pd

# Raw GitHub URL of the Titanic CSV (datasciencedojo/datasets repository).
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Fetch the CSV and parse it into the DataFrame every later step reads from.
df = pd.read_csv(filepath_or_buffer=url)
# Print column names, non-null counts, dtypes and memory usage.
df.info()

##### Selecting columns

# NOTE: a notebook cell only renders its *last* expression automatically, so
# each intermediate preview below goes through display() to actually show up.

# 1. Create fare DataFrame with only the Fare column.
# Double brackets keep the result a one-column DataFrame instead of a Series.
fare = df[["Fare"]]
display(fare.head())


# 2. Create class_age DataFrame with Pclass and Age
class_age = df[["Pclass", "Age"]]
display(class_age.head())


# 3. Create survived_gender DataFrame with Survived and Sex
survived_gender = df[["Survived", "Sex"]]
display(survived_gender.head())





###### Subsetting


### Subsetting Rows

# NOTE: these previews sit in the middle of a cell, where a bare expression is
# evaluated and silently discarded; display() makes each result visible.
# Only the first rows are shown to keep the output short.

# 1. Filter the Titanic dataset for cases where the passenger's fare is greater than 100, assigning it to fare_gt_100. View the printed result.

fare_gt_100 = df[df["Fare"] > 100]
display(fare_gt_100.head())


# 2. Filter the Titanic dataset for cases where the passenger's class (Pclass) is 1, assigning it to first_class. View the printed result.

first_class = df[df["Pclass"] == 1]
display(first_class.head())


# 3. Filter the Titanic dataset for cases where the passenger's age is less than 18 and the passenger is female (Sex is "female"), assigning it to female_under_18. View the printed result.

# Each condition needs its own parentheses because & binds tighter than < and ==.
female_under_18 = df[(df["Age"] < 18) & (df["Sex"] == "female")]
display(female_under_18.head())


## Subsetting Rows by Categorical variable:

# 1. Filter the Titanic dataset for passengers whose Embarked port is either "C" (Cherbourg) or "S" (Southampton), assigning the result to embarked_c_or_s. View the printed result.

embarked_c_or_s = df[df["Embarked"].isin(["C", "S"])]
# Not the last expression of the cell, so it must be displayed explicitly.
display(embarked_c_or_s.head())


# 2. Filter the Titanic dataset for passengers whose Pclass is in the list [1, 2] (indicating first or second class), assigning the result to first_second_class. View the printed result.

first_second_class = df[df["Pclass"].isin([1, 2])]
# Last expression of the cell: rendered automatically by the notebook.
first_second_class





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [24]:
# 3.2 Exploratory Data Analysis Practice Exercise - 1.

# 1. Which passenger had the highest fare paid relative to their age?

# Replace missing ages with the median age.
# NOTE: this overwrites df["Age"] in place, so every cell executed afterwards
# sees the imputed ages rather than the raw ones.
median_age = df["Age"].median()
df["Age"] = df["Age"].fillna(median_age)

# Fare paid per year of age, stored as a new column on df.
df["fare_per_year"] = df["Fare"].div(df["Age"])

# Keep only passengers whose fare-per-year ratio exceeds 5.
high_fare_age = df.loc[df["fare_per_year"] > 5]

# Largest ratio first.
high_fare_age_srt = high_fare_age.sort_values("fare_per_year", ascending=False)

result = high_fare_age_srt.loc[:, ["Name", "fare_per_year"]]
# print() because this is not the last expression of the cell.
print(result.head())





# 2. Which adult male passenger (age ≥ 18 and Sex is 'male') paid the highest fare relative to their class?

# Fare divided by the passenger's Pclass value, stored as a new column on df.
df["fare_per_class"] = df["Fare"].div(df["Pclass"])

# Row mask: male passengers aged 18 or older.
is_adult_male = df["Sex"].eq("male") & df["Age"].ge(18)
adult_males = df.loc[is_adult_male]

# Largest fare-per-class ratio first.
adult_males_srt = adult_males.sort_values("fare_per_class", ascending=False)

result = adult_males_srt.loc[:, ["Name", "Age", "fare_per_class"]]
result.head()



                                Name  fare_per_year
305   Allison, Master. Hudson Trevor     164.728261
297     Allison, Miss. Helen Loraine      75.775000
386  Goodwin, Master. Sidney Leonard      46.900000
164     Panula, Master. Eino Viljami      39.687500
183        Becker, Master. Richard F      39.000000


Unnamed: 0,Name,Age,fare_per_class
737,"Lesurer, Mr. Gustave J",35.0,512.3292
679,"Cardeza, Mr. Thomas Drake Martinez",36.0,512.3292
27,"Fortune, Mr. Charles Alexander",19.0,263.0
438,"Fortune, Mr. Mark",64.0,263.0
118,"Baxter, Mr. Quigg Edmond",24.0,247.5208


In [19]:
# 3.3 Exploratory Data Analysis with Group-by Method Practice Exercise:

# 1. What percent of the total fare revenue came from each passenger class?

# Fare summed over all passengers; the denominator for every share below.
total_fare = df["Fare"].sum()

# Summed fare for Pclass 1, 2 and 3, in that order.
fare_list = [df.loc[df["Pclass"] == pclass, "Fare"].sum() for pclass in (1, 2, 3)]
fare_1, fare_2, fare_3 = fare_list

# Convert each class total into a percentage of the overall total.
revenue_percent = []
for class_fare in fare_list:
    revenue_percent.append((class_fare / total_fare) * 100)
print(revenue_percent)



# 2. What percent of the total number of passengers on the Titanic belonged to each age group (e.g., child, adult, senior)?

def age_group_func(age):
    """Classify an age in years into a coarse age group.

    Parameters
    ----------
    age : float or int
        Age in years. May be missing (NaN or None).

    Returns
    -------
    str
        "child" for ages below 18, "adult" for ages from 18 up to but not
        including 65, "senior" for 65 and above, and "unknown" when the age
        is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through both checks and be labelled "senior".
    if pd.isna(age):
        return "unknown"
    if age < 18:
        return "child"
    if age < 65:
        return "adult"
    return "senior"

# Label every passenger with an age group, stored as a new column on df.
df["age_group"] = df["Age"].map(age_group_func)

# Number of rows in df; the denominator for the percentages.
total_passengers = df.shape[0]

# Passengers per age group, most frequent group first.
group_counts = df["age_group"].value_counts()

# Share of all passengers in each group, in percent.
group_percent = group_counts.div(total_passengers).mul(100)
print(group_percent)





[np.float64(63.349287718996564), np.float64(13.24962855496507), np.float64(23.401083726038365)]
age_group
adult     86.083053
child     12.682379
senior     1.234568
Name: count, dtype: float64
