Individual Assignment - S2197634

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

# Load the dataset in this cell: `df` was otherwise only created by a later cell,
# so this cell raised NameError when the notebook was run top to bottom on a
# fresh kernel.
df = pd.read_csv('titanic.csv')

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Missing values per column:\n{df.isnull().sum()}")

# Overall survival rate: mean of 'Survived', scaled to a percentage
survival_rate = df['Survived'].mean() * 100
print(f"\nOverall survival rate: {survival_rate:.2f}%")

# Survival rate (%) per value of 'Sex'
survival_by_sex = df.groupby('Sex')['Survived'].mean() * 100
print(f"\nSurvival rate by gender:\n{survival_by_sex}")

# Survival rate (%) per value of 'Pclass'
survival_by_class = df.groupby('Pclass')['Survived'].mean() * 100
print(f"\nSurvival rate by class:\n{survival_by_class}")

# Analyze age distribution (missing ages are excluded from the histogram)
mean_age = df['Age'].mean()  # computed once; used for both the marker line and its label
plt.figure(figsize=(10, 6))
sns.histplot(df['Age'].dropna(), kde=True)
plt.title('Age Distribution of Titanic Passengers')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.axvline(mean_age, color='red', linestyle='--', label=f'Mean Age: {mean_age:.2f}')
plt.legend()
plt.show()

# Create a correlation heatmap
plt.figure(figsize=(10, 8))
# 'number' selects every numeric dtype; listing only float64/int64 would silently
# drop columns stored with another numeric width (e.g. int32 or float32).
correlation = df.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Numerical Features')
plt.tight_layout()
plt.show()

# Compare mean age from statistics with dataset 'Age'
# `age_stats` was otherwise only defined by a later cell, so this lookup raised
# NameError on a fresh kernel. describe() returns a Series that has a 'mean' entry.
age_stats = df['Age'].describe()
print(f"\nMean age from age_stats: {age_stats['mean']:.2f}")
print(f"Mean 'Age' in the dataset: {df['Age'].mean():.2f}")

Q1

In [None]:
def calculate_age_in_100(current_age, current_year=None):
    """Return the calendar year in which a person of the given age turns 100.

    Parameters:
        current_age (int): The person's current age in years.
        current_year (int, optional): The year to count from. When omitted,
            the current year from the system clock is used, so the result
            does not go stale the way a hard-coded year does.

    Returns:
        int: ``current_year + (100 - current_age)``. For ages above 100 this
        is a year in the past.
    """
    if current_year is None:
        # Local import keeps the function usable without touching the import cell.
        from datetime import date
        current_year = date.today().year

    return current_year + (100 - current_age)

# Get user input
user_name = input("Enter your name: ")

# Keep asking until the age parses as an integer; a bare int(input(...)) would
# abort the cell with ValueError on input such as "twenty" or an empty string.
while True:
    try:
        user_age = int(input("Enter your age: "))
        break
    except ValueError:
        print("Please enter your age as a whole number.")

# Calculate when the user will turn 100
year_100 = calculate_age_in_100(user_age)

# Print the result
print(f"Hello {user_name}, you will turn 100 years old in {year_100}.")

In [None]:
def generate_summary(numbers):
    """
    Generate a statistical summary of a collection of numbers.

    Parameters:
        numbers (iterable): Numbers to summarise. Any iterable is accepted
            (list, tuple, generator, numpy array, pandas Series, ...).
            ``None`` is treated as empty.

    Returns:
        dict or str: Dictionary with the keys 'mean', 'median' and
                    'standard_deviation' (population standard deviation,
                    i.e. the variance is divided by the number of values),
                    or the message "List is empty." when there is nothing
                    to summarise.

    Raises:
        TypeError: If ``numbers`` is neither ``None`` nor iterable.
    """
    # Materialise the input once. This lets generators be measured with len()
    # and avoids `not numbers`, which raises for multi-element arrays/Series.
    values = [] if numbers is None else list(numbers)
    if not values:
        return "List is empty."

    count = len(values)

    # Calculate mean
    mean = sum(values) / count

    # Calculate median: middle value, or the average of the two middle values
    ordered = sorted(values)
    middle = count // 2
    if count % 2 == 0:
        median = (ordered[middle - 1] + ordered[middle]) / 2
    else:
        median = ordered[middle]

    # Calculate (population) standard deviation
    variance = sum((x - mean) ** 2 for x in values) / count
    std_dev = variance ** 0.5

    return {
        'mean': mean,
        'median': median,
        'standard_deviation': std_dev
    }


# Use `test_numbers` if it already exists, otherwise fall back to a sample list.
# Only the name lookup sits inside the try block, so a NameError raised by the
# call itself is no longer misreported as "test_numbers variable is not defined".
try:
    test_numbers
except NameError:
    print("Error: test_numbers variable is not defined.")
    # Create a sample list for demonstration
    test_numbers = [10, 20, 30, 40, 50]
    print(f"Using sample numbers instead: {test_numbers}")

result = generate_summary(test_numbers)
print(result)

Q2

In [None]:
import pandas as pd
import numpy as np

# Load the dataset; `df` is a second name for the same DataFrame object as `titanic`
titanic = pd.read_csv('titanic.csv')
df = titanic

# Display the first 10 rows of the dataset
print("First 10 rows of the Titanic dataset:")
print(df.head(10))

# Display dataset information
print("\nDataset Information:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() added a stray "None" line to the output.
df.info()

# Display descriptive statistics
print("\nDescriptive Statistics:")
print(df.describe())

# Check for any missing values and display as percentage
missing_data = (df.isnull().sum() / len(df)) * 100
print("\nMissing Values (%):")
print(missing_data[missing_data > 0].sort_values(ascending=False))

In [None]:
# Task 3: Check data types and identify potential issues
print("Data types for each column:")
print(df.dtypes)

# Identify columns that are expected to be numeric but are not stored that way.
# Checking for "not numeric" rather than "== object" also catches text held in
# one of pandas' string dtypes, which an object-only comparison would miss.
numeric_cols = ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
for col in numeric_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        print(f"Warning: {col} is stored as {df[col].dtype} type but should be numeric")

# Task 4: Analyze missing values and suggest strategies
print("\nMissing values count and percentage:")
null_counts = df.isnull().sum()
missing = pd.DataFrame({
    'Count': null_counts,
    'Percentage': (null_counts / len(df) * 100).round(2)
})
print(missing.loc[missing['Count'] > 0].sort_values('Percentage', ascending=False))

# Summary of Age, used to pick an imputation strategy
print("\nAge statistics:")
print(df['Age'].describe())

# Strategy 1: per-row median Age of the passenger's (Pclass, Sex) group
print("\nStrategy: Impute Age with median grouped by Pclass and Sex")
age_median = df.groupby(['Pclass', 'Sex'])['Age'].transform('median')

# Strategy 2: most frequent Embarked value
most_common_embarked = df['Embarked'].mode()[0]

# Strategy 3: Cabin is not imputed; only its presence is recorded
cabin_missing = df['Cabin'].isnull().sum()
print(f"Cabin has {cabin_missing} missing values ({cabin_missing / len(df) * 100:.2f}%).")
print("Strategy for Cabin: Keep as is or create a binary feature 'Has_Cabin'")

# Build the imputed frame as a new object so the raw `df` stays untouched
df_clean = df.assign(
    Age=df['Age'].fillna(age_median),
    Embarked=df['Embarked'].fillna(most_common_embarked),
    Has_Cabin=df['Cabin'].notna().astype(int),  # 1 when a cabin value is present, else 0
)

print("\nMissing values after imputation:")
remaining_nulls = df_clean.isnull().sum()
print(remaining_nulls[remaining_nulls > 0])

Q3

In [None]:
# Build the cleaned frame from the raw data in one pass.
# The earlier cell had already filled Embarked in `df_clean`, so dropping rows
# with missing Embarked from that frame removed nothing. Starting from `df`
# makes the drop effective and lets this cell be re-run with the same result.
df_clean = (
    df.copy()
    .assign(
        # Task 1.1: Fill missing values in Age with its (overall) median
        Age=lambda d: d['Age'].fillna(d['Age'].median()),
        # Keep the cabin indicator column that the earlier df_clean carried
        Has_Cabin=lambda d: d['Cabin'].notna().astype(int),
    )
    # Task 1.2: Drop rows with missing values in Embarked
    .dropna(subset=['Embarked'])
    .assign(
        # Task 2: Convert Sex column to numerical format
        Sex_numeric=lambda d: d['Sex'].map({'male': 0, 'female': 1}),
        # Task 3: FamilySize = siblings/spouses + parents/children + the passenger
        FamilySize=lambda d: d['SibSp'] + d['Parch'] + 1,
        # Task 4.1: FarePerPerson (assign evaluates in order, so FamilySize exists here)
        FarePerPerson=lambda d: d['Fare'] / d['FamilySize'],
    )
    # Task 4.2: Remove duplicate rows based on PassengerId
    .drop_duplicates(subset=['PassengerId'])
)

# Display the shape of the cleaned dataset
print(f"Shape of the cleaned dataset: {df_clean.shape}")

# Display first few rows of the cleaned dataset
print("\nFirst 5 rows of the cleaned dataset:")
print(df_clean.head())

# Check if there are any remaining missing values
missing_after = df_clean.isnull().sum()
print("\nRemaining missing values:")
print(missing_after[missing_after > 0])

Q4

In [None]:
# Task 1: Histogram of Age using 10-year bins
bins = range(0, 81, 10)  # bin edges 0, 10, ..., 80
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_clean['Age'], bins=bins, edgecolor='black', alpha=0.7)
ax.set_title('Age Distribution of Titanic Passengers (10-year intervals)')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Passengers')
ax.set_xticks(bins)  # put a tick on every bin edge
ax.grid(axis='y', alpha=0.3)
plt.show()

# Task 2: Bar chart of survival rate (%) per embarkation port
survival_by_port = df.groupby('Embarked')['Survived'].mean() * 100
fig, ax = plt.subplots(figsize=(8, 6))
survival_by_port.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Survival Rate by Embarkation Port')
ax.set_xlabel('Port of Embarkation (C=Cherbourg, Q=Queenstown, S=Southampton)')
ax.set_ylabel('Survival Rate (%)')
ax.tick_params(axis='x', labelrotation=0)  # keep the port labels horizontal
# Write each rate slightly above its bar
for i, rate in enumerate(survival_by_port):
    ax.text(i, rate + 1, f'{rate:.1f}%', ha='center')
ax.grid(axis='y', alpha=0.3)
plt.show()

# Task 3: Fare against Age, one colour per survival status
colors = {0: 'red', 1: 'green'}
labels = {0: 'Did Not Survive', 1: 'Survived'}

fig, ax = plt.subplots(figsize=(12, 8))

# One scatter call per status so each gets its own legend entry
for survival_status, status_label in labels.items():
    subset = df_clean.loc[df_clean['Survived'] == survival_status]
    ax.scatter(
        subset['Age'],
        subset['Fare'],
        c=colors[survival_status],
        alpha=0.6,
        label=status_label,
    )

ax.set_title('Fare vs Age by Survival Status')
ax.set_xlabel('Age')
ax.set_ylabel('Fare ($)')
ax.legend()
ax.grid(alpha=0.3)

# Cap the y-axis at the 99th percentile of Fare so extreme fares do not
# compress the rest of the points
ax.set_ylim(0, df_clean['Fare'].quantile(0.99))
plt.show()

# Task 4: Box plot of Fare for each passenger class
fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): recent seaborn releases may warn about `palette` without `hue` — confirm against the installed version
sns.boxplot(data=df_clean, x='Pclass', y='Fare', palette='viridis', ax=ax)
ax.set_title('Fare Distribution by Passenger Class')
ax.set_xlabel('Passenger Class (1 = 1st, 2 = 2nd, 3 = 3rd)')
ax.set_ylabel('Fare ($)')
ax.grid(axis='y', alpha=0.3)
plt.show()

Q5

In [None]:
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

# `df` is expected to exist from an earlier cell. Probe it with a cheap call;
# if the name is missing (NameError) or is not DataFrame-like (AttributeError),
# read the CSV again and rebind both `titanic` and `df`.
try:
    df.head(1)
except (NameError, AttributeError):
    import pandas as pd
    df = titanic = pd.read_csv('titanic.csv')
    print("Reloaded the Titanic dataset")

# Task 1: Calculate descriptive statistics for Age and Fare
def _spread_summary(series):
    """Return mean, median, std_dev, range, IQR and cv for one column.

    'cv' is the coefficient of variation (std_dev divided by mean), which
    allows the spread of columns on different scales to be compared.
    """
    summary = {
        'mean': series.mean(),
        'median': series.median(),
        'std_dev': series.std(),
        'range': series.max() - series.min(),
        'IQR': series.quantile(0.75) - series.quantile(0.25),
    }
    summary['cv'] = summary['std_dev'] / summary['mean']
    return summary


age_stats = _spread_summary(df['Age'])
fare_stats = _spread_summary(df['Fare'])

# Print both summaries in the same "key: value" layout
for heading, summary in (("Age Statistics:", age_stats), ("\nFare Statistics:", fare_stats)):
    print(heading)
    for key, value in summary.items():
        print(f"{key}: {value:.2f}")

# The column with the larger coefficient of variation has the greater relative spread
wider_column = 'Fare' if fare_stats['cv'] > age_stats['cv'] else 'Age'
print(f"\nColumn with greater relative spread: {wider_column}")

# Task 2: T-test for Fare between male and female passengers
# Missing fares are dropped first: a single NaN would otherwise propagate to a
# NaN p-value, and `nan < 0.05` is False, which would be reported as "not significant".
# .loc selects rows and column in one step instead of chained indexing.
male_fares = df.loc[df['Sex'] == 'male', 'Fare'].dropna()
female_fares = df.loc[df['Sex'] == 'female', 'Fare'].dropna()

# equal_var=False: the two groups are not assumed to share a variance
t_stat, p_value = stats.ttest_ind(male_fares, female_fares, equal_var=False)

print("\nT-test for Fare between male and female passengers:")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Mean Fare for males: ${male_fares.mean():.2f}")
print(f"Mean Fare for females: ${female_fares.mean():.2f}")
print(f"{'There is' if p_value < 0.05 else 'There is not'} a significant difference in fares between genders (α=0.05).")

# Task 3: Chi-square test for association between Survived and Pclass
# Cross-tabulate the two columns: rows are Survived values, columns are Pclass values
contingency = pd.crosstab(df['Survived'], df['Pclass'])
chi2, p_chi2, dof, expected = stats.chi2_contingency(contingency)

# Decide the wording once, using the 0.05 significance level
association_verdict = 'There is' if p_chi2 < 0.05 else 'There is not'

print("\nChi-square test for association between Survived and Pclass:")
print(f"Contingency table:\n{contingency}")
print(f"Chi-square statistic: {chi2:.4f}")
print(f"p-value: {p_chi2:.4f}")
print(f"{association_verdict} a significant association between survival and passenger class (α=0.05).")

# Task 4: Correlation between Age and Fare
# Keep only rows where both Age and Fare are present
temp_df = df.dropna(subset=['Age', 'Fare'])

# Use pearsonr for Pearson correlation coefficient and p-value
correlation, p_corr = stats.pearsonr(temp_df['Age'], temp_df['Fare'])

print("\nCorrelation between Age and Fare:")
print(f"Pearson correlation coefficient: {correlation:.4f}")
print(f"p-value: {p_corr:.4f}")

if p_corr < 0.05:
    relationship = "positive" if correlation > 0 else "negative"
    significance = "statistically significant"
    article = "a "
else:
    relationship = "no clear"
    significance = "not statistically significant"
    # No article here: the sentence previously read "There is a no clear correlation"
    article = ""

print(f"There is {article}{relationship} correlation between Age and Fare, which is {significance}.")

# Visualize the relationship with a scatter plot and regression line
plt.figure(figsize=(10, 6))
sns.regplot(x='Age', y='Fare', data=temp_df, scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.title('Correlation between Age and Fare')
plt.xlabel('Age')
plt.ylabel('Fare ($)')
plt.ylim(0, temp_df['Fare'].quantile(0.99))  # Limit y-axis to exclude extreme outliers
plt.grid(alpha=0.3)
plt.show()