<a href="https://colab.research.google.com/github/kasthurikasthurit96-svg/titanic-dataset/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

In [None]:
file_path="/content/drive/MyDrive/Titanic-Dataset.csv"

In [None]:
data=pd.read_csv(file_path)

In [None]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
df = pd.read_csv(file_path)
print("Dataset Loaded Successfully!\n")

Dataset Loaded Successfully!



In [None]:
# Creating Series
age_series = df["Age"]
fare_series = df["Fare"]
survived_series = df["Survived"]

print("\nAge Series Sample:")
print(age_series.head())



Age Series Sample:
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64


In [None]:
print("\nDataset Info:")
print(df.info())

print("\nStatistical Summary:")
print(df.describe())



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

Statistical Summary:
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.52

In [None]:
print("\nMissing Values:")
print(df.isnull().sum())

# Fill missing Age with mean
df["Age"].fillna(df["Age"].mean(), inplace=True)

# Fill missing Embarked with mode
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)

# Optional: Drop rows if still null
df.dropna(inplace=True)

print("\nMissing Values After Handling:")
print(df.isnull().sum())


Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing Values After Handling:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)


In [None]:
# Using loc (modified to iloc for first row access)
print("\nPassenger Details using iloc (first row):")
print(df.iloc[0])

# Using iloc
print("\nFirst 5 rows using iloc:")
print(df.iloc[0:5])

# Extract specific columns
print("\nSelected Columns (Age, Fare, Survived):")
print(df[["Age", "Fare", "Survived"]].head())


Passenger Details using iloc (first row):
PassengerId                                                    2
Survived                                                       1
Pclass                                                         1
Name           Cumings, Mrs. John Bradley (Florence Briggs Th...
Sex                                                       female
Age                                                         38.0
SibSp                                                          1
Parch                                                          0
Ticket                                                  PC 17599
Fare                                                     71.2833
Cabin                                                        C85
Embarked                                                       C
Name: 1, dtype: object

First 5 rows using iloc:
    PassengerId  Survived  Pclass  \
1             2         1       1   
3             4         1       1   
6             7  

In [None]:
# Rename column
df.rename(columns={"Sex": "Gender"}, inplace=True)

# Pivot Table
pivot_table = df.pivot_table(values="Survived", index="Gender", columns="Pclass", aggfunc="mean")
print("\nPivot Table (Survival Rate by Gender & Class):")
print(pivot_table)


Pivot Table (Survival Rate by Gender & Class):
Pclass         1         2         3
Gender                              
female  0.962963  0.900000  0.666667
male    0.410526  0.666667  0.333333


In [None]:
# Filtering
age_above_30 = df[df["Age"] > 30]
fare_above_50 = df[df["Fare"] > 50]
female_survived = df[(df["Gender"] == "female") & (df["Survived"] == 1)]

print("\nPassengers Age > 30:")
print(age_above_30.head())

print("\nPassengers Fare > 50:")
print(fare_above_50.head())

print("\nFemale Passengers Who Survived:")
print(female_survived.head())

# Sorting
print("\nSorted by Fare:")
print(df.sort_values(by="Fare", ascending=False).head())

print("\nSorted by Age:")
print(df.sort_values(by="Age").head())

print("\nSorted by Survival Status:")
print(df.sort_values(by="Survived", ascending=False).head())


Passengers Age > 30:
    PassengerId  Survived  Pclass  \
1             2         1       1   
3             4         1       1   
6             7         0       1   
11           12         1       1   
21           22         1       2   

                                                 Name  Gender   Age  SibSp  \
1   Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
3        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
6                             McCarthy, Mr. Timothy J    male  54.0      0   
11                           Bonnell, Miss. Elizabeth  female  58.0      0   
21                              Beesley, Mr. Lawrence    male  34.0      0   

    Parch    Ticket     Fare Cabin Embarked  
1       0  PC 17599  71.2833   C85        C  
3       0    113803  53.1000  C123        S  
6       0     17463  51.8625   E46        S  
11      0    113783  26.5500  C103        S  
21      0    248698  13.0000   D56        S  

Passenger

In [None]:
# Survival rate by Gender
print("\nSurvival Rate by Gender:")
print(df.groupby("Gender")["Survived"].mean())

# Average Age by Passenger Class
print("\nAverage Age by Passenger Class:")
print(df.groupby("Pclass")["Age"].mean())

# Survival count by Embarked location
print("\nSurvival Count by Embarked:")
print(df.groupby("Embarked")["Survived"].sum())


Survival Rate by Gender:
Gender
female    0.938144
male      0.420561
Name: Survived, dtype: float64

Average Age by Passenger Class:
Pclass
1    37.014806
2    25.543695
3    22.449853
Name: Age, dtype: float64

Survival Count by Embarked:
Embarked
C    52
Q     2
S    82
Name: Survived, dtype: int64


In [None]:
# Create additional DataFrame
class_info = pd.DataFrame({
    "Pclass": [1, 2, 3],
    "Class_Category": ["Upper", "Middle", "Lower"]
})

# Merge
merged_df = pd.merge(df, class_info, on="Pclass")
print("\nMerged DataFrame:")
print(merged_df.head())

# Join (example)
df_joined = df.join(class_info.set_index("Pclass"), on="Pclass")
print("\nJoined DataFrame:")
print(df_joined.head())

# Concat (example)
concat_df = pd.concat([df.head(), df.tail()])
print("\nConcatenated DataFrame (Head + Tail):")
print(concat_df)


Merged DataFrame:
   PassengerId  Survived  Pclass  \
0            2         1       1   
1            4         1       1   
2            7         0       1   
3           11         1       3   
4           12         1       1   

                                                Name  Gender   Age  SibSp  \
0  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
2                            McCarthy, Mr. Timothy J    male  54.0      0   
3                    Sandstrom, Miss. Marguerite Rut  female   4.0      1   
4                           Bonnell, Miss. Elizabeth  female  58.0      0   

   Parch    Ticket     Fare Cabin Embarked Class_Category  
0      0  PC 17599  71.2833   C85        C          Upper  
1      0    113803  53.1000  C123        S          Upper  
2      0     17463  51.8625   E46        S          Upper  
3      1   PP 9549  16.7000    G6        S          Lower  
4

In [None]:
# Average age of survivors vs non-survivors
print("\nAverage Age (Survivors vs Non-Survivors):")
print(df.groupby("Survived")["Age"].mean())

# Highest and lowest fare
print("\nHighest Fare Paid:", df["Fare"].max())
print("Lowest Fare Paid:", df["Fare"].min())

# Survival Percentage
survival_percentage = (df["Survived"].mean()) * 100
print("\nOverall Survival Percentage: {:.2f}%".format(survival_percentage))

print("\nProject Completed Successfully!")


Average Age (Survivors vs Non-Survivors):
Survived
0    39.979308
1    32.897870
Name: Age, dtype: float64

Highest Fare Paid: 512.3292
Lowest Fare Paid: 0.0

Overall Survival Percentage: 66.67%

Project Completed Successfully!
