In [3]:
import pandas as pd

In [17]:
# Read the training and test CSV files from the local data/ directory
# into two separate DataFrames.
train_data, test_data = (
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/test.csv'),
)

In [18]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training set.
numeric_summary = train_data.describe()
print(numeric_summary)

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [19]:
# DataFrame.info() writes its report (columns, non-null counts, dtypes) to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [20]:
# Class balance of the target: number of rows for each Survived value.
survival_counts = train_data['Survived'].value_counts()
print(survival_counts)

Survived
0    549
1    342
Name: count, dtype: int64


In [21]:
# Preview the first rows of the raw training data to see what each column holds.
first_rows = train_data.head()
print(first_rows)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Start with a small, hand-picked set of features that intuitively relate to
# the target: with only 891 training rows, extra columns are more likely to
# add noise than signal.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Feature matrix and target vector.
X, y = train_data[features], train_data['Survived']

# Continuous columns: fill missing values with the column mean, then
# standardize each column to zero mean and unit variance.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Discrete columns: fill missing values with the most frequent category, then
# one-hot encode so every category gets its own indicator column
# (e.g. Pclass_1 | Pclass_2 | Pclass_3). The step name 'onehot' is looked up
# later to recover the generated column names.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer; the output keeps
# the numeric columns first, followed by the one-hot columns.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Split the data into a training set and a validation set (80% / 20%),
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
# Sanity check of the preprocessor: fit it on the training split and look at
# what the transformed matrix contains.
X_train_transformed = preprocessor.fit_transform(X_train)

# Numeric columns keep their original names.
num_features_names = numerical_features

# One-hot columns get their names from the fitted encoder inside the 'cat' pipeline.
onehot_encoder = preprocessor.named_transformers_['cat']['onehot']
cat_features_names = onehot_encoder.get_feature_names_out(categorical_features)

# Column order: numeric features first, then the one-hot columns.
all_features_names = [*num_features_names, *cat_features_names]

# Wrap the transformed array in a labelled DataFrame purely for display.
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    columns=all_features_names,
)

print("Transformed Training Data Head:")
print(X_train_transformed_df.head())

Transformed Training Data Head:
        Age     SibSp     Parch      Fare  Pclass_1  Pclass_2  Pclass_3  \
0  1.232263 -0.470722 -0.479342 -0.078684       1.0       0.0       0.0   
1 -0.500482 -0.470722 -0.479342 -0.377145       0.0       1.0       0.0   
2  0.192616 -0.470722 -0.479342 -0.474867       0.0       0.0       1.0   
3 -0.269449  0.379923 -0.479342 -0.476230       0.0       0.0       1.0   
4 -1.809667  2.931860  2.048742 -0.025249       0.0       0.0       1.0   

   Sex_female  Sex_male  Embarked_C  Embarked_Q  Embarked_S  
0         0.0       1.0         0.0         0.0         1.0  
1         0.0       1.0         0.0         0.0         1.0  
2         0.0       1.0         0.0         0.0         1.0  
3         0.0       1.0         0.0         0.0         1.0  
4         1.0       0.0         0.0         0.0         1.0  


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Build the full pipeline (preprocessing followed by logistic regression)
# and train it on the training split in one go; fit() returns the pipeline.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]).fit(X_train, y_train)

# Predict on the held-out validation split.
y_pred = model.predict(X_val)

# Evaluate: fraction of validation rows predicted correctly.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'验证集准确率: {accuracy:.2f}')

验证集准确率: 0.80


In [25]:
# Select the same feature columns from the test set; the preprocessing itself
# happens inside the fitted pipeline.
X_test = test_data[features]

# Predict survival for every test passenger.
test_preds = model.predict(X_test)

# Build the submission table (PassengerId, Survived) and write it to disk
# without the index column.
output = test_data[['PassengerId']].assign(Survived=test_preds)
output.to_csv('submission.csv', index=False)
print("提交文件已创建！")

提交文件已创建！
