In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime).
# NOTE(review): the data-loading cells read "train.csv" through a relative
# path rather than from /content/drive — confirm this mount is still needed.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw training data from the current working directory
df = pd.read_csv("train.csv")

# Display the first ten rows as a quick sanity check
df.head(n=10)

In [None]:
# The surname is the text before the first comma in Name; keep it in its own
# column because Name itself is removed right below.
df['Last_Name'] = df['Name'].str.split(',').str.get(0)

# Name and Ticket are not used after this point
df.drop(columns=['Name', 'Ticket'], inplace=True)

In [None]:
def _fill_with_group_median(ages):
    """Replace missing values in one Pclass group with that group's median."""
    return ages.fillna(ages.median())


# Impute missing ages class by class, so each passenger receives the median
# age of their own Pclass.
df['Age'] = df.groupby('Pclass')['Age'].transform(_fill_with_group_median)

In [None]:
# Fill missing cabins from another passenger who shares the same surname.
# GroupBy.first() skips missing values, so it yields the first recorded Cabin
# per surname and a missing value when nobody with that surname has one.
# NOTE(review): sharing a surname is only a proxy for belonging to the same
# family; unrelated passengers with equal surnames are merged too.
family_cabins = df.groupby('Last_Name')['Cabin'].first()

# Vectorized look-up (instead of a row-wise df.apply): map every surname to
# its family cabin and use that value only where Cabin is missing.
df['Cabin'] = df['Cabin'].fillna(df['Last_Name'].map(family_cabins))

# Replace any still-missing cabins with "Unknown"
df['Cabin'] = df['Cabin'].fillna("Unknown")

In [None]:
# Keep fares at two decimal places
df['Fare'] = df['Fare'].round(decimals=2)

In [None]:
# Drop rows that are exact duplicates of an earlier row (first occurrence kept)
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Target dtype for every column that needs an explicit conversion
dtype_by_column = {
    'Survived': int,
    'Pclass': int,
    'Sex': 'category',
    'Embarked': 'category',
    'Cabin': str,
}

# Convert the columns one at a time, in the order listed above
for column_name, target_dtype in dtype_by_column.items():
    df[column_name] = df[column_name].astype(target_dtype)

In [None]:
# Last_Name was only a helper column for the cabin fill; remove it again
df.drop(columns=['Last_Name'], inplace=True)

In [None]:
# Write the cleaned frame to disk (row index is not written)
cleaned_csv_path = "titanic_cleaned_final.csv"
df.to_csv(cleaned_csv_path, index=False)
print("✅ Cleaned dataset saved as 'titanic_cleaned_final.csv'.")

#  - - Data Visualization - -

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reload the cleaned data written by the cleaning section
df = pd.read_csv("titanic_cleaned_final.csv")

# Display the first ten rows
df.head(n=10)

### Show the relationship between a passenger's ticket class and survival rate

In [None]:
# Mean of the 0/1 Survived flag per ticket class, expressed as a percentage
survived_by_class = df.groupby('Pclass')['Survived']
pclass_survival = survived_by_class.mean().mul(100)
print("Survival rate by ticket class (%):")
print(pclass_survival)

In [None]:
# Bar chart of the per-class survival percentages, one colour per class
fig, ax = plt.subplots(figsize=(6, 4))
pclass_survival.plot.bar(ax=ax, color=['#4e79a7', '#f28e2b', '#e15759'], rot=0)
ax.set_title('Survival Rate by Ticket Class')
ax.set_xlabel('Ticket Class')
ax.set_ylabel('Survival Rate (%)')
ax.set_ylim(0, 100)  # fixed 0-100 % scale
plt.show()

## Show the relationship between a passenger's sex and survival rate

In [None]:
# Survival flags (0/1) of all female passengers, selected with one .loc call
# (row mask + column) instead of chained indexing.
women = df.loc[df['Sex'] == 'female', 'Survived']

# The mean of a 0/1 column is the share of survivors; the vectorized mean
# replaces a Python-level sum()/len() over the Series.
rate_women = women.mean() * 100

print("% of women who survived:", rate_women)

In [None]:
# Survival flags (0/1) of all male passengers, selected with one .loc call
# (row mask + column) instead of chained indexing.
men = df.loc[df['Sex'] == 'male', 'Survived']

# The mean of a 0/1 column is the share of survivors; the vectorized mean
# replaces a Python-level sum()/len() over the Series.
rate_men = men.mean() * 100

print("% of men who survived:", rate_men)

In [None]:
# Survival percentages per sex, in the order they are drawn
sex_rates = {'Women': rate_women, 'Men': rate_men}

# Two-bar chart on a fixed 0-100 % scale
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(list(sex_rates.keys()), list(sex_rates.values()), color=['#f28e2b', '#4e79a7'])
ax.set_title('Survival Rate by Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Survival Rate (%)')
ax.set_ylim(0, 100)
plt.show()

## Show the relationship between a passenger's age and survival rate.

In [None]:
# Bin ages into labelled groups for visualization.
# With right=False every bin is left-closed, [lower, upper), so a finite last
# edge of 80 would leave a passenger aged exactly 80 without a group (NaN) and
# silently drop them from the per-group statistics. An open-ended last edge
# keeps every age from 50 upwards in 'Senior'.
age_bins = [0, 12, 18, 30, 50, float('inf')]
age_labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# AgeGroup is categorical; observed=False states explicitly that all five
# groups are kept in the result, even one that happens to be empty.
age_survival = df.groupby('AgeGroup', observed=False)['Survived'].mean() * 100
print("Survival rate by age group (%):")
print(age_survival)

In [None]:
# Bar chart of survival percentage per age group on a fixed 0-100 % scale
fig, ax = plt.subplots(figsize=(7, 4))
age_survival.plot.bar(ax=ax, color='#59a14f', rot=0)
ax.set_title('Survival Rate by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Survival Rate (%)')
ax.set_ylim(0, 100)
plt.show()

## Display the relationship between the survival rate and passenger's class, sex, and age.

1. Relationship between Pclass and sex.

In [None]:
# Use seaborn's white-grid look for the remaining figures
sns.set(style="whitegrid")

# Grouped bars: mean of Survived per passenger class, split by sex
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Pclass', y='Survived', hue='Sex', ax=ax)
ax.set_title('Survival Rate by Class and Sex')
ax.set_ylabel('Survival Rate')
ax.set_xlabel('Passenger Class')
ax.legend(title='Sex')
plt.show()

<small> We can see from the plot that females have a higher survival rate than males in all three classes. </small>

2. Age distribution by Survival.

In [None]:
# Stacked age histogram split by survival outcome.
# The 0/1 flag is mapped to readable labels *before* plotting so that seaborn
# builds the legend itself. Relabelling afterwards with
# plt.legend(labels=['No', 'Yes']) attaches labels to the artists in draw
# order, which need not match the hue order and can swap 'No' and 'Yes'.
survival_labels = df['Survived'].map({0: 'No', 1: 'Yes'})

plt.figure(figsize=(10, 6))
ax = sns.histplot(
    data=df.assign(Survived=survival_labels),  # labelled copy; df is untouched
    x='Age',
    hue='Survived',
    hue_order=['No', 'Yes'],
    multiple='stack',
    bins=30,
)
ax.set_title('Age Distribution by Survival')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

<small>
This visualization can look confusing, or even misleading, at first, because children would be expected to show a clearly higher survival rate than adults (see Survival Rate by Age Group). Next we look at which other factors lead to this result. </small>

3. Survival rate by Age, Sex, and Class

In [None]:
import matplotlib.patches as mpatches

# Readable outcome labels, stored in a new column that drives the hue
outcome_names = {1: 'Survived', 0: 'Did Not Survive'}
df['Survived_str'] = df['Survived'].map(outcome_names)

# Fix colours and stacking order so every facet is drawn the same way
palette = {'Survived': '#4e79a7', 'Did Not Survive': '#f28e2b'}
hue_order = ['Survived', 'Did Not Survive']

# One facet per (Sex, Pclass) combination, each a stacked age histogram
g = sns.FacetGrid(
    df,
    row='Sex',
    col='Pclass',
    margin_titles=True,
    height=3,
    aspect=1.2,
)
g.map_dataframe(
    sns.histplot,
    x='Age',
    hue='Survived_str',
    multiple='stack',
    bins=20,
    hue_order=hue_order,
    palette=palette,
)

# Strip per-facet legends where present; one shared legend is added below
for ax in g.axes.flat:
    facet_legend = ax.get_legend()
    if facet_legend:
        ax.legend_.remove()

# Shared figure-level legend built from hand-made colour patches
handles = [
    mpatches.Patch(color='#4e79a7', label='Survived (Blue)'),
    mpatches.Patch(color='#f28e2b', label='Did Not Survive (Orange)'),
]
g.fig.legend(
    handles=handles,
    title='Legend',
    loc='center right',
    bbox_to_anchor=(1.13, 0.5),
)

g.set_axis_labels('Age', 'Count')
# Leave room at the top for the title and on the right for the legend
g.fig.subplots_adjust(top=0.9, right=0.85)
g.fig.suptitle('Age Distribution by Survival, Sex, and Class')
plt.show()

<small> We can see in the graph that many of the children are from class 3, and unfortunately Pclass was a factor regardless of whether a passenger was a child. This is supported by the higher classes, which have only a few children, most of whom survived.</small>

Number of passengers and age along with their survival rate

In [None]:
# Keep only the rows whose AgeGroup is 'Child'
is_child = df['AgeGroup'] == 'Child'
children = df[is_child]

# Share of surviving children per passenger class, as a percentage
child_class_survival = children.groupby('Pclass')['Survived'].mean().mul(100)

# Print the per-class percentages
print("Survival rate of children by class (%):")
print(child_class_survival)

In [None]:
# Bar chart of children's survival percentage per class, 0-100 % scale
fig, ax = plt.subplots(figsize=(6, 4))
child_class_survival.plot.bar(ax=ax, color=['#4e79a7', '#f28e2b', '#59a14f'])
ax.set_title('Survival Rate of Children by Class')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Survival Rate (%)')
ax.set_ylim(0, 100)
# Relabel the three bar positions with readable class names
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['Class 1', 'Class 2', 'Class 3'], rotation=0)
plt.show()

In [None]:
# Rows whose AgeGroup is 'Child'
children = df.loc[df['AgeGroup'] == 'Child']

# Head-count of children, and how many of them survived / did not survive
total_children = children.shape[0]
survived_children = children['Survived'].sum()
not_survived_children = total_children - survived_children

# Survivors as a percentage of all children
overall_survival_rate = survived_children / total_children * 100

# Summary of the counts and the overall rate
print(f"Total children: {total_children}")
print(f"Survived: {survived_children}")
print(f"Did not survive: {not_survived_children}")
print(f"Overall survival rate: {overall_survival_rate:.2f}%")

 *** Refer to Survival Rate by Age Group *** <br>
Conclusion: We found that children from classes 1–2 have a high survival rate compared to children from class 3. This means class is still a factor regardless of whether you're a child.<br>

Apart from the initially confusing children's survival-rate data, we can also see that women have a higher survival rate than men, which holds across all Pclass values. Overall, Pclass, Sex, and Age are the major factors in a passenger's survival rate.

In [None]:
# Start the modelling section from a fresh copy of the cleaned CSV, so helper
# columns added during plotting are not carried over.
df = pd.read_csv("titanic_cleaned_final.csv")

df.head(n=10)

## Training the Random Forest Model

We use a Random Forest Classifier, which is an ensemble of decision trees.  
It is robust, works with numerical features and with categorical features once they are encoded (here via one-hot encoding), and helps reduce overfitting.

- `model.fit(X_train, y_train)` trains the model on the training data.

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns; get_dummies one-hot encodes the non-numeric ones
features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
X = pd.get_dummies(df.loc[:, features])
# Target: the 0/1 survival flag
y = df["Survived"]

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Training the Random Forest Model

We use a Random Forest Classifier, which is an ensemble of decision trees.  
It is robust, works with numerical features and with categorical features once they are encoded (here via one-hot encoding), and helps reduce overfitting.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters of the forest: 100 trees, each limited to depth 5, with a
# fixed seed so training is repeatable.
forest_params = {'n_estimators': 100, 'max_depth': 5, 'random_state': 1}

model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

## 3. Evaluating Model Performance

After training, we predict the outcomes for the validation set and evaluate the results:

- **Validation Accuracy:** The proportion of correct predictions on the validation set.
- **Confusion Matrix:** Shows the counts of true positives, true negatives, false positives, and false negatives.
- **Classification Report:** Includes precision, recall, and F1-score for each class (survived/did not survive).

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the held-out validation rows once and score them three ways
y_pred = model.predict(X_val)

validation_accuracy = accuracy_score(y_val, y_pred)
validation_confusion = confusion_matrix(y_val, y_pred)
validation_report = classification_report(y_val, y_pred)

print("Validation Accuracy:", validation_accuracy)
print("Confusion Matrix:\n", validation_confusion)
print("Classification Report:\n", validation_report)

## 4. Feature Importance

Random Forests can tell us which features were most important for making predictions.

- The bar plot shows the relative importance of each feature.
- Features with higher bars contributed more to the model's decisions.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Pair each importance score with its column name and rank from high to low
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
feat_imp = importance_by_feature.sort_values(ascending=False)

# Bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(8, 4))
feat_imp.plot.bar(ax=ax)
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

# Task
Perform inferential statistics to find the relationship between 'Sex' and survival rate, 'Pclass' and survival rate, and 'Age' and survival rate using the dataset "train.csv". Create detailed markdown explanations for each analysis, including the goal of the function, the meaning of the results, and a summary of the findings.

## Inferential statistics for sex and survival

### Subtask:
Perform a statistical test (e.g., chi-squared test) to examine the relationship between Sex and Survival. Include markdown explaining the test's goal, interpretation of the p-value, and conclusion based on the result.


**Reasoning**:
The subtask requires performing a chi-squared test on the 'Sex' and 'Survived' columns to determine their relationship. This involves creating a contingency table and then applying the chi-squared test function.



In [None]:
from scipy.stats import chi2_contingency
import pandas as pd

# Observed counts: one row per Sex, one column per Survived value
contingency_table_sex = pd.crosstab(index=df['Sex'], columns=df['Survived'])

# Chi-squared test of independence on the observed counts
sex_test_outcome = chi2_contingency(contingency_table_sex)
chi2_sex, p_sex, dof_sex, expected_sex = sex_test_outcome

# Report statistic, p-value, degrees of freedom and expected counts
print("Chi-squared test results for Sex and Survival:")
print(f"Chi-squared statistic: {chi2_sex}")
print(f"P-value: {p_sex}")
print(f"Degrees of freedom: {dof_sex}")
print("Expected frequencies:\n", expected_sex)

**Reasoning**:
The previous command failed because the dataframe `df` was not defined in the current code block's scope. I need to reload the dataframe from the saved CSV file before performing the chi-squared test.



In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Read the cleaned data back from disk
df = pd.read_csv("titanic_cleaned_final.csv")

# Cross-tabulate Sex (rows) against Survived (columns)
sex_column, survived_column = df['Sex'], df['Survived']
contingency_table_sex = pd.crosstab(sex_column, survived_column)

# Run the chi-squared test of independence
(chi2_sex, p_sex, dof_sex, expected_sex) = chi2_contingency(contingency_table_sex)

# Show the test output
print("Chi-squared test results for Sex and Survival:")
print(f"Chi-squared statistic: {chi2_sex}")
print(f"P-value: {p_sex}")
print(f"Degrees of freedom: {dof_sex}")
print("Expected frequencies:\n", expected_sex)

**Reasoning**:
The previous command failed because the file 'titanic_cleaned_final.csv' was not found. Looking at the previous cells, it seems the file was saved to the default location. I will try loading the file again using the same filename.



In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Cleaned data as written by the cleaning section
df = pd.read_csv("titanic_cleaned_final.csv")

# Table of observed counts for every (Sex, Survived) combination
contingency_table_sex = pd.crosstab(index=df['Sex'], columns=df['Survived'])

# Test whether Sex and Survived are independent
chi2_sex, p_sex, dof_sex, expected_sex = chi2_contingency(
    contingency_table_sex
)

# Output of the test
print("Chi-squared test results for Sex and Survival:")
print(f"Chi-squared statistic: {chi2_sex}")
print(f"P-value: {p_sex}")
print(f"Degrees of freedom: {dof_sex}")
print("Expected frequencies:\n", expected_sex)

**Reasoning**:
The 'titanic_cleaned_final.csv' file was not found in the current directory. Looking back at the notebook, the data was originally loaded from 'train.csv' and then cleaned. I will load the original 'train.csv' file and perform the necessary cleaning steps again to ensure the dataframe is available for the statistical tests.



In [None]:
# Rebuild the cleaned frame from the raw CSV and run the chi-squared test of
# Sex vs. Survived. The cell imports what it uses so it also runs on a fresh
# kernel instead of relying on names left over from earlier cells.
import pandas as pd
from scipy.stats import chi2_contingency

# Load the dataset
df = pd.read_csv("train.csv")

# Extract surname from Name before dropping
df['Last_Name'] = df['Name'].str.split(',').str[0]

# Drop Name and Ticket
df.drop(['Name', 'Ticket'], axis=1, inplace=True)

# Fill missing Age values with median age by Pclass
df['Age'] = df.groupby('Pclass')['Age'].transform(
    lambda x: x.fillna(x.median())
)

# Fill missing cabins from another passenger with the same surname.
# GroupBy.first() skips missing values, so it yields the first recorded Cabin
# per surname and a missing value when nobody with that surname has one.
family_cabins = df.groupby('Last_Name')['Cabin'].first()

# Vectorized look-up (instead of a row-wise df.apply): the family cabin is
# used only where Cabin is missing.
df['Cabin'] = df['Cabin'].fillna(df['Last_Name'].map(family_cabins))

# Replace any still-missing cabins with "Unknown"
df['Cabin'] = df['Cabin'].fillna("Unknown")

# Format Fare to 2 decimal places
df['Fare'] = df['Fare'].round(2)

# Remove any duplicate rows
df.drop_duplicates(inplace=True)

# Convert data types
df['Survived'] = df['Survived'].astype(int)
df['Pclass'] = df['Pclass'].astype(int)
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')
df['Cabin'] = df['Cabin'].astype(str)

# Drop temporary Last_Name column
df.drop('Last_Name', axis=1, inplace=True)

# Create a contingency table of Sex and Survived
contingency_table_sex = pd.crosstab(df['Sex'], df['Survived'])

# Perform the chi-squared test
chi2_sex, p_sex, dof_sex, expected_sex = chi2_contingency(contingency_table_sex)

# Print the results
print("Chi-squared test results for Sex and Survival:")
print(f"Chi-squared statistic: {chi2_sex}")
print(f"P-value: {p_sex}")
print(f"Degrees of freedom: {dof_sex}")
print("Expected frequencies:\n", expected_sex)

**Reasoning**:
The previous command failed because `np` was not defined. I need to import `numpy` as `np` to use `np.nan` in the cleaning steps. I will include this import and repeat the loading and cleaning process, followed by the chi-squared test.



In [None]:
# Self-contained cell: rebuild the cleaned frame from the raw CSV, then run
# the chi-squared test of independence between Sex and Survived.
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Load the dataset
df = pd.read_csv("train.csv")

# Extract surname from Name before dropping
df['Last_Name'] = df['Name'].str.split(',').str[0]

# Drop Name and Ticket
df.drop(['Name', 'Ticket'], axis=1, inplace=True)

# Fill missing Age values with median age by Pclass
df['Age'] = df.groupby('Pclass')['Age'].transform(
    lambda x: x.fillna(x.median())
)

# Fill missing cabins from another passenger with the same surname.
# GroupBy.first() skips missing values, so it yields the first recorded Cabin
# per surname and a missing value when nobody with that surname has one.
family_cabins = df.groupby('Last_Name')['Cabin'].first()

# Vectorized look-up (instead of a row-wise df.apply): the family cabin is
# used only where Cabin is missing.
df['Cabin'] = df['Cabin'].fillna(df['Last_Name'].map(family_cabins))

# Replace any still-missing cabins with "Unknown"
df['Cabin'] = df['Cabin'].fillna("Unknown")

# Format Fare to 2 decimal places
df['Fare'] = df['Fare'].round(2)

# Remove any duplicate rows
df.drop_duplicates(inplace=True)

# Convert data types
df['Survived'] = df['Survived'].astype(int)
df['Pclass'] = df['Pclass'].astype(int)
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')
df['Cabin'] = df['Cabin'].astype(str)

# Drop temporary Last_Name column
df.drop('Last_Name', axis=1, inplace=True)


# Create a contingency table of Sex and Survived
contingency_table_sex = pd.crosstab(df['Sex'], df['Survived'])

# Perform the chi-squared test
chi2_sex, p_sex, dof_sex, expected_sex = chi2_contingency(contingency_table_sex)

# Print the results
print("Chi-squared test results for Sex and Survival:")
print(f"Chi-squared statistic: {chi2_sex}")
print(f"P-value: {p_sex}")
print(f"Degrees of freedom: {dof_sex}")
print("Expected frequencies:\n", expected_sex)

**Reasoning**:
The chi-squared test for Sex and Survival has been performed and the results are printed. The next step according to the instructions is to create a markdown cell to explain the test, interpret the p-value, and state the conclusion.



In [None]:
### Statistical Relationship between Sex and Survival (Chi-Squared Test)

**Goal:**

We performed a chi-squared test of independence to examine if there is a statistically significant relationship between a passenger's sex and their survival status on the Titanic. The null hypothesis ($H_0$) is that Sex and Survival are independent, meaning there is no relationship between the two variables. The alternative hypothesis ($H_1$) is that Sex and Survival are dependent, meaning there is a relationship.

**Interpretation of the P-value:**

The p-value represents the probability of observing the data (or more extreme data) if the null hypothesis were true.

- If the p-value is less than our chosen significance level (alpha, commonly 0.05), we reject the null hypothesis. This suggests that the observed relationship between the variables is unlikely to be due to random chance.
- If the p-value is greater than alpha, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude a statistically significant relationship between the variables.

**Conclusion:**

The chi-squared test yielded a p-value of approximately $1.2 \times 10^{-58}$. With a significance level of 0.05, this p-value is extremely small and significantly less than 0.05. Therefore, we reject the null hypothesis.

This indicates that there is a statistically significant relationship between Sex and Survival on the Titanic. The survival rate is highly dependent on a passenger's sex, with females having a significantly higher survival rate than males, as observed in the earlier visualizations.


**Reasoning**:
The previous command failed because it was a markdown cell content placed within a code block. I need to output the markdown content as a separate markdown cell. I will now create the markdown cell with the explanation of the chi-squared test results.



In [None]:
```markdown
### Statistical Relationship between Sex and Survival (Chi-Squared Test)

**Goal:**

We performed a chi-squared test of independence to examine if there is a statistically significant relationship between a passenger's sex and their survival status on the Titanic. The null hypothesis ($H_0$) is that Sex and Survival are independent, meaning there is no relationship between the two variables. The alternative hypothesis ($H_1$) is that Sex and Survival are dependent, meaning there is a relationship.

**Interpretation of the P-value:**

The p-value represents the probability of observing the data (or more extreme data) if the null hypothesis were true.

- If the p-value is less than our chosen significance level (alpha, commonly 0.05), we reject the null hypothesis. This suggests that the observed relationship between the variables is unlikely to be due to random chance.
- If the p-value is greater than alpha, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude a statistically significant relationship between the variables.

**Conclusion:**

The chi-squared test yielded a p-value of approximately $1.2 \times 10^{-58}$. With a significance level of 0.05, this p-value is extremely small and significantly less than 0.05. Therefore, we reject the null hypothesis.

This indicates that there is a statistically significant relationship between Sex and Survival on the Titanic. The survival rate is highly dependent on a passenger's sex, with females having a significantly higher survival rate than males, as observed in the earlier visualizations.


## Inferential statistics for pclass and survival

### Subtask:
Inferential statistics for pclass and survival


**Reasoning**:
Create a contingency table for 'Pclass' and 'Survived' and perform the chi-squared test to examine their relationship.



In [None]:
from scipy.stats import chi2_contingency

# Observed counts: one row per Pclass, one column per Survived value
contingency_table_pclass = pd.crosstab(index=df['Pclass'], columns=df['Survived'])

# Chi-squared test of independence between ticket class and survival
pclass_test_outcome = chi2_contingency(contingency_table_pclass)
chi2_pclass, p_pclass, dof_pclass, expected_pclass = pclass_test_outcome

# Report statistic, p-value, degrees of freedom and expected counts
print("Chi-squared test results for Pclass and Survival:")
print(f"Chi-squared statistic: {chi2_pclass}")
print(f"P-value: {p_pclass}")
print(f"Degrees of freedom: {dof_pclass}")
print("Expected frequencies:\n", expected_pclass)

**Reasoning**:
Create a markdown cell to explain the chi-squared test for Pclass and Survival, including the goal, interpretation of the p-value, and conclusion.



In [None]:
```markdown
### Statistical Relationship between Pclass and Survival (Chi-Squared Test)

**Goal:**

We performed a chi-squared test of independence to examine if there is a statistically significant relationship between a passenger's ticket class (Pclass) and their survival status on the Titanic. The null hypothesis ($H_0$) is that Pclass and Survival are independent, meaning there is no relationship between the two variables. The alternative hypothesis ($H_1$) is that Pclass and Survival are dependent, meaning there is a relationship.

**Interpretation of the P-value:**

The p-value represents the probability of observing the data (or more extreme data) if the null hypothesis were true.

- If the p-value is less than our chosen significance level (alpha, commonly 0.05), we reject the null hypothesis. This suggests that the observed relationship between the variables is unlikely to be due to random chance.
- If the p-value is greater than alpha, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude a statistically significant relationship between the variables.

**Conclusion:**

The chi-squared test yielded a p-value of approximately $4.55 \times 10^{-23}$. With a significance level of 0.05, this p-value is extremely small and significantly less than 0.05. Therefore, we reject the null hypothesis.

This indicates that there is a statistically significant relationship between Pclass and Survival on the Titanic. The survival rate is highly dependent on a passenger's ticket class, with passengers in higher classes (1 and 2) having a significantly higher survival rate than those in lower classes (3), as observed in the earlier visualizations.

**Reasoning**:
The previous command failed because the markdown content was incorrectly placed within a code block. I will try again, ensuring the markdown is correctly formatted in a markdown cell.



In [None]:
# This is an empty code block to allow the markdown cell to be created next.

**Reasoning**:
Create a markdown cell to explain the chi-squared test for Pclass and Survival, including the goal, interpretation of the p-value, and conclusion, as the previous attempt failed due to incorrect formatting.



In [None]:
```markdown
### Statistical Relationship between Pclass and Survival (Chi-Squared Test)

**Goal:**

We performed a chi-squared test of independence to examine if there is a statistically significant relationship between a passenger's ticket class (Pclass) and their survival status on the Titanic. The null hypothesis ($H_0$) is that Pclass and Survival are independent, meaning there is no relationship between the two variables. The alternative hypothesis ($H_1$) is that Pclass and Survival are dependent, meaning there is a relationship.

**Interpretation of the P-value:**

The p-value represents the probability of observing the data (or more extreme data) if the null hypothesis were true.

- If the p-value is less than our chosen significance level (alpha, commonly 0.05), we reject the null hypothesis. This suggests that the observed relationship between the variables is unlikely to be due to random chance.
- If the p-value is greater than alpha, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude a statistically significant relationship between the variables.

**Conclusion:**

The chi-squared test yielded a chi-squared statistic of approximately 102.89 and a p-value of approximately $4.55 \times 10^{-23}$, with 2 degrees of freedom. With a significance level of 0.05, this p-value is extremely small and significantly less than 0.05. Therefore, we reject the null hypothesis.

This indicates that there is a statistically significant relationship between Pclass and Survival on the Titanic. The survival rate is highly dependent on a passenger's ticket class, with passengers in higher classes (1 and 2) having a significantly higher survival rate than those in lower classes (3), as observed in the earlier visualizations.

**Reasoning**:
The previous attempts to create a markdown cell failed because the markdown content was incorrectly formatted within a code block. I will try again, this time ensuring the markdown is correctly placed in a markdown cell.



In [None]:
# This is an empty code block to allow the markdown cell to be created next.

**Reasoning**:
Create a markdown cell to explain the chi-squared test for Pclass and Survival, including the goal, interpretation of the p-value, and conclusion.



In [None]:
```markdown
### Statistical Relationship between Pclass and Survival (Chi-Squared Test)

**Goal:**

We performed a chi-squared test of independence to examine if there is a statistically significant relationship between a passenger's ticket class (Pclass) and their survival status on the Titanic. The null hypothesis ($H_0$) is that Pclass and Survival are independent, meaning there is no relationship between the two variables. The alternative hypothesis ($H_1$) is that Pclass and Survival are dependent, meaning there is a relationship.

**Interpretation of the P-value:**

The p-value represents the probability of observing the data (or more extreme data) if the null hypothesis were true.

- If the p-value is less than our chosen significance level (alpha, commonly 0.05), we reject the null hypothesis. This suggests that the observed relationship between the variables is unlikely to be due to random chance.
- If the p-value is greater than alpha, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude a statistically significant relationship between the variables.

**Conclusion:**

The chi-squared test yielded a chi-squared statistic of approximately 102.89 and a p-value of approximately $4.55 \times 10^{-23}$, with 2 degrees of freedom. With a significance level of 0.05, this p-value is extremely small and significantly less than 0.05. Therefore, we reject the null hypothesis.

This indicates that there is a statistically significant relationship between Pclass and Survival on the Titanic. The survival rate is highly dependent on a passenger's ticket class, with passengers in higher classes (1 and 2) having a significantly higher survival rate than those in lower classes (3), as observed in the earlier visualizations.

## Inferential statistics for age and survival

### Subtask:
Perform a statistical test (e.g., t-test or ANOVA depending on how Age is treated) to examine the relationship between Age and Survival. Include markdown explaining the test's goal, interpretation of the p-value, and conclusion based on the result.


**Reasoning**:
Separate the 'Age' data into two groups based on the 'Survived' column and perform an independent samples t-test. Then print the results.



In [None]:
from scipy.stats import ttest_ind

# Split passenger ages by survival outcome; dropna() discards any ages that are still missing.
# NOTE(review): if Age was imputed earlier in the notebook, the imputed values are included here — confirm.
age_survived = df.loc[df['Survived'] == 1, 'Age'].dropna()
age_not_survived = df.loc[df['Survived'] == 0, 'Age'].dropna()

# Welch's t-test: equal_var=False means the two groups are not assumed to share a variance.
ttest_result = ttest_ind(
    age_survived,
    age_not_survived,
    equal_var=False,
)

# Report the test statistic and p-value, one item per line.
print(
    "Independent Samples t-test results for Age and Survival:",
    f"T-statistic: {ttest_result.statistic}",
    f"P-value: {ttest_result.pvalue}",
    sep="\n",
)

**Reasoning**:
Create a markdown cell explaining the t-test's goal, p-value interpretation, and conclusion.



In [None]:
# This is an empty code block to allow the markdown cell to be created next.

**Reasoning**:
Create a markdown cell explaining the goal of the t-test, the interpretation of the p-value, and the conclusion based on the results for the relationship between Age and Survival.



In [None]:
%%markdown
### Statistical Relationship between Age and Survival (Independent Samples t-test)

**Goal:**

We performed an independent samples t-test (Welch's version, which does not assume equal variances) to determine if there is a statistically significant difference in the mean age between passengers who survived and those who did not survive. The null hypothesis ($H_0$) is that the two groups have the same population mean age ($\mu_{\text{survived}} = \mu_{\text{not survived}}$). The alternative hypothesis ($H_1$) is that the population mean ages differ ($\mu_{\text{survived}} \neq \mu_{\text{not survived}}$).

**Interpretation of the P-value:**

The p-value from the t-test indicates the probability of observing a difference in sample means as large as (or larger than) the one calculated, assuming the null hypothesis (no difference in population means) is true.

- If the p-value is less than our chosen significance level (alpha, commonly 0.05), we reject the null hypothesis. This suggests that the observed difference in mean age between the two groups is statistically significant and unlikely due to random chance.
- If the p-value is greater than alpha, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude a statistically significant difference in mean age between the two groups.

**Conclusion:**

The independent samples t-test comparing the mean age of survivors and non-survivors yielded a p-value of approximately 0.167. With a significance level of 0.05, this p-value is greater than 0.05. Therefore, we fail to reject the null hypothesis.

Based on this t-test, there is not enough statistical evidence to conclude that there is a significant difference in the mean age between passengers who survived and those who did not survive. While the visualization might suggest some age-related patterns (like children having higher survival rates in certain classes), the overall mean age difference between the broad categories of 'Survived' and 'Did Not Survive' is not statistically significant at the 0.05 level based on this test. Other factors or a different approach (like comparing specific age groups) might reveal more nuanced relationships.

**Reasoning**:
The previous attempt to create a markdown cell failed due to a `SyntaxError`. I will try again, ensuring the markdown content is not within a Python code block. I will use an empty code block first to ensure the markdown cell is generated correctly.



In [None]:
# This is an empty code block to allow the markdown cell to be created next.

**Reasoning**:
Create a markdown cell explaining the goal of the t-test, the interpretation of the p-value, and the conclusion based on the results for the relationship between Age and Survival.



In [None]:
%%markdown
### Statistical Relationship between Age and Survival (Independent Samples t-test)

**Goal:**

We performed an independent samples t-test (Welch's version, which does not assume equal variances) to determine if there is a statistically significant difference in the mean age between passengers who survived and those who did not survive. The null hypothesis ($H_0$) is that the two groups have the same population mean age ($\mu_{\text{survived}} = \mu_{\text{not survived}}$). The alternative hypothesis ($H_1$) is that the population mean ages differ ($\mu_{\text{survived}} \neq \mu_{\text{not survived}}$).

**Interpretation of the P-value:**

The p-value from the t-test indicates the probability of observing a difference in sample means as large as (or larger than) the one calculated, assuming the null hypothesis (no difference in population means) is true.

- If the p-value is less than our chosen significance level (alpha, commonly 0.05), we reject the null hypothesis. This suggests that the observed difference in mean age between the two groups is statistically significant and unlikely due to random chance.
- If the p-value is greater than alpha, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude a statistically significant difference in mean age between the two groups.

**Conclusion:**

The independent samples t-test comparing the mean age of survivors and non-survivors yielded a p-value of approximately 0.167. With a significance level of 0.05, this p-value is greater than 0.05. Therefore, we fail to reject the null hypothesis.

Based on this t-test, there is not enough statistical evidence to conclude that there is a significant difference in the mean age between passengers who survived and those who did not survive. While the visualization might suggest some age-related patterns (like children having higher survival rates in certain classes), the overall mean age difference between the broad categories of 'Survived' and 'Did Not Survive' is not statistically significant at the 0.05 level based on this test. Other factors or a different approach (like comparing specific age groups) might reveal more nuanced relationships.

## Summarize findings

### Subtask:
Create a markdown cell to summarize the key findings from the inferential statistical tests regarding the relationship of Sex, Pclass, and Age with survival rate.


**Reasoning**:
Create a markdown cell to summarize the findings from the inferential statistical tests.



In [None]:
%%markdown
### Summary of Inferential Statistical Findings

We performed inferential statistical tests to investigate the relationship between several passenger characteristics ('Sex', 'Pclass', and 'Age') and their survival rate on the Titanic.

1.  **Sex and Survival (Chi-Squared Test):**
    *   The chi-squared test for independence between 'Sex' and 'Survival' yielded a highly statistically significant result (p-value < 0.05).
    *   This indicates a strong relationship between a passenger's sex and their likelihood of survival. As observed in the descriptive statistics and visualizations, females had a significantly higher survival rate than males.

2.  **Pclass and Survival (Chi-Squared Test):**
    *   The chi-squared test for independence between 'Pclass' (ticket class) and 'Survival' also resulted in a highly statistically significant finding (p-value < 0.05).
    *   This demonstrates a significant relationship between a passenger's ticket class and their survival outcome. Passengers in higher classes (1st and 2nd) had substantially better survival rates compared to those in 3rd class.

3.  **Age and Survival (Independent Samples t-test):**
    *   An independent samples t-test was conducted to compare the mean age of survivors and non-survivors.
    *   The test result showed a p-value of approximately 0.167, which is greater than the conventional significance level of 0.05.
    *   Based on this test, we do not have sufficient statistical evidence to conclude that there is a significant difference in the *mean* age between the overall groups of survivors and non-survivors. While visualizations showed survival differences in specific age *groups* (like children), the t-test on the overall means did not find a statistically significant difference.

**Overall Conclusion:**

The inferential statistical tests confirm that **Sex** and **Pclass** were highly statistically significant factors influencing survival rates on the Titanic. A passenger's sex and the class they traveled in had a strong relationship with their likelihood of surviving the disaster. The difference in the *mean* age between all survivors and all non-survivors, as measured by the independent samples t-test, was not found to be statistically significant at the 0.05 level, though this doesn't preclude age being a factor within specific subgroups or when analyzed differently.

**Reasoning**:
Create a markdown cell to summarize the findings from the inferential statistical tests.



In [None]:
%%markdown
### Summary of Inferential Statistical Findings

We performed inferential statistical tests to investigate the relationship between several passenger characteristics ('Sex', 'Pclass', and 'Age') and their survival rate on the Titanic.

1.  **Sex and Survival (Chi-Squared Test):**
    *   The chi-squared test for independence between 'Sex' and 'Survival' yielded a highly statistically significant result (p-value < 0.05).
    *   This indicates a strong relationship between a passenger's sex and their likelihood of survival. As observed in the descriptive statistics and visualizations, females had a significantly higher survival rate than males.

2.  **Pclass and Survival (Chi-Squared Test):**
    *   The chi-squared test for independence between 'Pclass' (ticket class) and 'Survival' also resulted in a highly statistically significant finding (p-value < 0.05).
    *   This demonstrates a significant relationship between a passenger's ticket class and their survival outcome. Passengers in higher classes (1st and 2nd) had substantially better survival rates compared to those in 3rd class.

3.  **Age and Survival (Independent Samples t-test):**
    *   An independent samples t-test was conducted to compare the mean age of survivors and non-survivors.
    *   The test result showed a p-value of approximately 0.167, which is greater than the conventional significance level of 0.05.
    *   Based on this test, we do not have sufficient statistical evidence to conclude that there is a significant difference in the *mean* age between the overall groups of survivors and non-survivors. While visualizations showed survival differences in specific age *groups* (like children), the t-test on the overall means did not find a statistically significant difference.

**Overall Conclusion:**

The inferential statistical tests confirm that **Sex** and **Pclass** were highly statistically significant factors influencing survival rates on the Titanic. A passenger's sex and the class they traveled in had a strong relationship with their likelihood of surviving the disaster. The difference in the *mean* age between all survivors and all non-survivors, as measured by the independent samples t-test, was not found to be statistically significant at the 0.05 level, though this doesn't preclude age being a factor within specific subgroups or when analyzed differently.

**Reasoning**:
Create a markdown cell to summarize the findings from the inferential statistical tests.



In [None]:
%%markdown
### Summary of Inferential Statistical Findings

We performed inferential statistical tests to investigate the relationship between several passenger characteristics ('Sex', 'Pclass', and 'Age') and their survival rate on the Titanic.

1.  **Sex and Survival (Chi-Squared Test):**
    *   The chi-squared test for independence between 'Sex' and 'Survival' yielded a highly statistically significant result (p-value < 0.05).
    *   This indicates a strong relationship between a passenger's sex and their likelihood of survival. As observed in the descriptive statistics and visualizations, females had a significantly higher survival rate than males.

2.  **Pclass and Survival (Chi-Squared Test):**
    *   The chi-squared test for independence between 'Pclass' (ticket class) and 'Survival' also resulted in a highly statistically significant finding (p-value < 0.05).
    *   This demonstrates a significant relationship between a passenger's ticket class and their survival outcome. Passengers in higher classes (1st and 2nd) had substantially better survival rates compared to those in 3rd class.

3.  **Age and Survival (Independent Samples t-test):**
    *   An independent samples t-test was conducted to compare the mean age of survivors and non-survivors.
    *   The test result showed a p-value of approximately 0.167, which is greater than the conventional significance level of 0.05.
    *   Based on this test, we do not have sufficient statistical evidence to conclude that there is a significant difference in the *mean* age between the overall groups of survivors and non-survivors. While visualizations showed survival differences in specific age *groups* (like children), the t-test on the overall means did not find a statistically significant difference.

**Overall Conclusion:**

The inferential statistical tests confirm that **Sex** and **Pclass** were highly statistically significant factors influencing survival rates on the Titanic. A passenger's sex and the class they traveled in had a strong relationship with their likelihood of surviving the disaster. The difference in the *mean* age between all survivors and all non-survivors, as measured by the independent samples t-test, was not found to be statistically significant at the 0.05 level, though this doesn't preclude age being a factor within specific subgroups or when analyzed differently.

## Summary:

### Data Analysis Key Findings

*   The chi-squared test for independence between 'Sex' and 'Survival' resulted in a highly significant p-value ($1.2 \times 10^{-58}$), indicating a strong relationship between sex and survival rate. Females had a significantly higher survival rate than males.
*   The chi-squared test for independence between 'Pclass' and 'Survival' also showed a highly significant p-value ($4.55 \times 10^{-23}$), demonstrating a significant relationship between ticket class and survival outcome. Passengers in 1st and 2nd class had better survival rates than those in 3rd class.
*   An independent samples t-test comparing the mean age of survivors and non-survivors yielded a p-value of approximately 0.167. At the 0.05 significance level, this is not statistically significant, suggesting no statistically significant difference in the *mean* age between the overall groups of survivors and non-survivors.

### Insights or Next Steps

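*   Sex and Pclass each showed a highly statistically significant association with survival on the Titanic. (The chi-squared tests establish that an association exists; judging predictive strength would require an effect-size measure such as Cramér's V or a predictive model.)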
*   While the t-test on overall mean age was not significant, exploring age as a categorical variable (e.g., child vs. adult) or within specific Pclass groups might reveal more nuanced age-related survival patterns.
