Step 1: Loading the Data



In [None]:
import pandas as pd

# Read the two Titanic CSV files from the Kaggle input directory
train_data = pd.read_csv('/kaggle/input/titanic-trial-4/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic-trial-4/test.csv')

# Preview the top rows of each dataset under its own heading, training first
for heading, frame in (("Training Data:", train_data), ("\nTest Data:", test_data)):
    print(heading)
    print(frame.head())


Step 2: Data Cleaning

In [None]:
# Report the number of missing values per column before any cleaning
print("Missing Values in Training Data:")
print(train_data.isnull().sum())

print("\nMissing Values in Test Data:")
print(test_data.isnull().sum())

# Fill missing numeric values ('Age', 'Fare') with each dataset's own median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained form warns in
# recent pandas 2.x releases and does not update the DataFrame when
# Copy-on-Write is enabled.
# NOTE(review): imputing the test set with the *training* medians would keep
# preprocessing consistent between the two sets -- confirm which is intended.
for frame in (train_data, test_data):
    for column in ('Age', 'Fare'):
        frame[column] = frame[column].fillna(frame[column].median())

# Fill missing 'Embarked' values in the training data with the most frequent value
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

# 'Cabin' has a lot of missing values, so the column is dropped entirely.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
train_data = train_data.drop(columns='Cabin', errors='ignore')
test_data = test_data.drop(columns='Cabin', errors='ignore')

# Check again for missing values
print("\nMissing Values after handling:")
print(train_data.isnull().sum())
print(test_data.isnull().sum())


Step 3: Exploratory Data Analysis (EDA)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Descriptive statistics of the training data
print("\nSummary Statistics for Numerical Features:")
print(train_data.describe())


def _show_countplot(column, title, hue=None):
    """Show one bar chart of row counts per value of `column` in train_data.

    When `hue` is given, the bars are additionally split by that column.
    """
    fig, ax = plt.subplots()
    sns.countplot(data=train_data, x=column, hue=hue, ax=ax)
    ax.set_title(title)
    plt.show()


# Overall survivor counts, then survival broken down by class and by sex
_show_countplot('Survived', 'Count of Survivors')
_show_countplot('Pclass', 'Survival Count by Pclass', hue='Survived')
_show_countplot('Sex', 'Survival Count by Sex', hue='Survived')

# Histogram of passenger ages with a KDE curve overlaid
age_fig, age_ax = plt.subplots()
sns.histplot(data=train_data, x='Age', kde=True, ax=age_ax)
age_ax.set_title('Age Distribution of Passengers')
age_ax.set_xlabel('Age')
plt.show()

# Survival broken down by the 'Embarked' column
_show_countplot('Embarked', 'Survival Count by Embarked', hue='Survived')
