# Titanic Dataset Analysis

这个notebook用于探索和分析Titanic数据集。我们将：
1. 加载并查看数据
2. 进行基本的数据探索
3. 分析特征之间的关系
4. 进行数据可视化

In [None]:
# Smoke-test cell: prints a fixed greeting and nothing else.
print('Hello, World!')

In [1]:
# 导入需要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 设置显示选项
pd.set_option('display.max_columns', None)
plt.style.use('seaborn')
%matplotlib inline

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Load the training and test splits from CSV files under the local data/ directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Print the training set's (rows, columns) shape, then preview its first rows.
# The trailing expression is what the notebook renders as the cell output.
print('Training Data Shape:', train_data.shape)
train_data.head()

In [None]:
# Summarize the training frame: column names, non-null counts, dtypes and memory usage.
train_data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_data.describe()

In [None]:
# Missing-value audit: count the nulls in each column and express each count
# as a percentage of the total number of rows.
missing_values = train_data.isna().sum()
missing_percentages = (missing_values / train_data.shape[0]) * 100

# Put both measures side by side, indexed by column name.
missing_df = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percentages}
)

# Report only the columns that actually contain missing entries.
print('\nMissing Values Analysis:')
print(missing_df.loc[missing_df['Missing Values'] > 0])

In [None]:
# Overall survival rate: the mean of the 'Survived' column, scaled to a percentage.
survival_rate = 100 * train_data['Survived'].mean()
print(f'Overall survival rate: {survival_rate:.2f}%')

# Survival rate per sex, also as a percentage.
survival_by_sex = train_data.groupby('Sex')['Survived'].mean().mul(100)
print('\nSurvival rate by sex:')
print(survival_by_sex)

# Bar chart of survival by sex; barplot's default estimator is the mean,
# so each bar's height is that group's survival fraction.
plt.figure(figsize=(8, 6))
sns.barplot(data=train_data, x='Sex', y='Survived')
plt.title('Survival Rate by Sex')
plt.show()

In [None]:
# Age distribution: histogram of 'Age' with the bars for each 'Survived'
# value stacked on top of one another.
plt.figure(figsize=(10, 6))
sns.histplot(x='Age', hue='Survived', multiple='stack', data=train_data)
plt.title('Age Distribution by Survival Status')
plt.show()

In [None]:
# Survival by passenger class: barplot's default estimator is the mean,
# so each bar's height is the survival fraction within that 'Pclass' value.
plt.figure(figsize=(8, 6))
sns.barplot(data=train_data, x='Pclass', y='Survived')
plt.title('Survival Rate by Passenger Class')
plt.show()