In [None]:
import pandas as pd

In [None]:
# Load the raw training and test splits from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each report.
train_df.info()
test_df.info()

##### 查看各列属性的基本统计信息

In [None]:
# Summary statistics of the numeric columns, training split first.
for split_df in (train_df, test_df):
    print(split_df.describe())

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Peek at the first five training rows.
train_df.head(n=5)

In [None]:
# Distinct values (including NaN, if any) found in the HomePlanet column.
train_df.loc[:, 'HomePlanet'].unique()

In [None]:
# Explore how passenger counts are distributed over the continuous features:
# one 50-bin histogram per column, stacked vertically.
hist_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, ax = plt.subplots(len(hist_columns), 1, figsize=(10, 10))
plt.subplots_adjust(top=2)

for hist_ax, hist_column in zip(ax, hist_columns):
    sns.histplot(train_df[hist_column], color='b', bins=50, ax=hist_ax)

In [None]:
# Name of the target column; store it as integers instead of its original dtype.
label = "Transported"
train_df = train_df.astype({label: int})

In [None]:
# Drop the passenger id and name columns.
train_df = train_df.drop(columns=['PassengerId', 'Name'])
train_df.head(n=5)

In [None]:
# Split the "Deck/Cabin_num/Side" cabin string into three separate columns,
# appended after the existing ones, and discard the combined column.
cabin_parts = train_df["Cabin"].str.split("/", expand=True)
cabin_parts.columns = ["Deck", "Cabin_num", "Side"]
train_df = train_df.drop(columns='Cabin').join(cabin_parts)
train_df.head(n=5)

In [None]:
# The split leaves cabin numbers as strings; store them as floats.
train_df = train_df.astype({'Cabin_num': float})
train_df.head(n=5)

In [None]:
# How each feature relates to the Transported outcome: one panel per feature.
# Categorical features get a stacked bar chart of counts, continuous features
# get one density curve per outcome.

# Age buckets for the bar chart: left-closed 10-year bins [0, 10), [10, 20), ...
# whose last edge lies strictly above the oldest passenger. The previous bins
# stopped below the maximum age and excluded age 0, so those passengers were
# silently missing from the chart.
age_upper = (int(np.floor(train_df['Age'].max() / 10)) + 1) * 10
# Age_group is stored on the frame because a later cell drops it again.
train_df['Age_group'] = pd.cut(train_df['Age'], bins=range(0, age_upper + 1, 10), right=False)

# (plot kind, column, name used in the bar-chart title), in display order.
panels = [
    ('bar', 'HomePlanet', 'HomePlanet'),
    ('bar', 'CryoSleep', 'CryoSleep'),
    ('bar', 'Destination', 'Destination'),
    ('bar', 'Age_group', 'Age group'),
    ('bar', 'VIP', 'VIP'),
    ('kde', 'RoomService', None),
    ('kde', 'FoodCourt', None),
    ('kde', 'ShoppingMall', None),
    ('kde', 'Spa', None),
    ('kde', 'VRDeck', None),
    ('bar', 'Deck', 'Deck'),
    ('kde', 'Cabin_num', None),
    ('bar', 'Side', 'Side'),
]

fig, axs = plt.subplots(len(panels), 1, figsize=(10, 14))
plt.subplots_adjust(top=5)

transported = train_df['Transported']
for panel_ax, (panel_kind, panel_column, title_name) in zip(axs, panels):
    if panel_kind == 'bar':
        # Stacked counts of transported / not transported per category.
        pd.crosstab(train_df[panel_column], transported).plot(kind='bar', stacked=True, ax=panel_ax)
        panel_ax.set_title(f'Transport distribution by {title_name}')
    else:
        # One density curve per outcome.
        for outcome, legend_label in ((1, 'Transported'), (0, 'Not Transported')):
            sns.kdeplot(data=train_df.loc[transported == outcome, panel_column],
                        label=legend_label, ax=panel_ax)
        panel_ax.legend()
        panel_ax.set_xlabel(panel_column)
        panel_ax.set_ylabel('Density')

# Render the figure; the previous plt.plot() call only added an empty line
# artist and echoed "[]" as the cell output.
plt.show()

我们通过上面的分布可以看出：HomePlanet、CryoSleep、RoomService、Spa、VRDeck 与 Transported 的相关性较强；
Cabin_num、Side、Destination、VIP、Age、FoodCourt、ShoppingMall、Deck 与 Transported 的相关性中等。

In [None]:
# Fraction of missing values per column, highest first (test split, then train).
for split_df in (test_df, train_df):
    null_counts = split_df.isnull().sum().sort_values(ascending=False)
    print(null_counts / split_df.shape[0])


In [None]:
# Age_group was only needed for plotting; remove it before modelling.
train_df = train_df.drop(columns='Age_group')

In [None]:
# Fill the moderately correlated features with their mode and the spending
# columns with 0.
#
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on a column selection is chained assignment: it
# raises a FutureWarning in pandas 2.x and leaves the frame untouched once
# Copy-on-Write is enabled.
mode_filled_columns = ['Destination', 'VIP', 'Age', 'Deck', 'Cabin_num', 'Side']
zero_filled_columns = ['FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']

for fill_column in mode_filled_columns:
    train_df[fill_column] = train_df[fill_column].fillna(train_df[fill_column].mode()[0])
train_df[zero_filled_columns] = train_df[zero_filled_columns].fillna(0)
train_df.head()

In [None]:
# Feature preprocessing: turn the boolean / categorical columns into numbers.
HomePlanet2num = {'Europa': 0, 'Earth': 1, 'Mars': 2}
Destination2num = {'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2}

train_df = train_df.assign(
    # Existing columns are cast in place; NaN stays NaN.
    VIP=train_df['VIP'].astype(float),
    CryoSleep=train_df['CryoSleep'].astype(float),
    # New numeric codes are appended after the existing columns, in this order.
    HomePlanet_num=train_df['HomePlanet'].map(HomePlanet2num),
    Destination_num=train_df['Destination'].map(Destination2num),
    # Deck letter -> offset from 'A'; missing decks stay NaN.
    Deck_num=train_df['Deck'].apply(
        lambda deck: ord(deck) - ord('A') if pd.notnull(deck) else np.nan
    ),
).drop(columns=['HomePlanet', 'Destination', 'Deck'])
train_df.head()

In [None]:
# Encode the cabin side: P -> 0, S -> 1.
Side2num = {'P': 0, 'S': 1}
train_df = train_df.assign(Side=train_df['Side'].map(Side2num))
train_df.head()

In [None]:
# Shape of the feature matrix (without CryoSleep) for rows lacking HomePlanet_num.
homeplanet_missing = train_df['HomePlanet_num'].isnull()
train_df.drop(columns='CryoSleep').loc[homeplanet_missing].values.shape

In [None]:
#使用随机森林填补强相关性分类变量 HomePlanet_num,CryoSleep
from sklearn.ensemble import RandomForestRegressor

def set_missing_HomePlanet(df, random_state=42):
    """Impute missing ``HomePlanet_num`` codes with a random-forest classifier.

    The model is trained on the rows whose ``HomePlanet_num`` is known, using
    every other column except ``CryoSleep`` as features, and its predictions
    are written into the rows where the code is missing.

    Columns are selected by name rather than by position, so the function
    works regardless of where ``HomePlanet_num`` sits in the frame (for
    example with or without a ``Transported`` column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``HomePlanet_num`` and ``CryoSleep``. All remaining
        columns are used as features and must be numeric without missing values.
    random_state : int or None, default 42
        Seed passed to the forest so the imputation is reproducible.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``HomePlanet_num`` is filled in place.
    """
    target = 'HomePlanet_num'
    missing = df[target].isnull()
    # Nothing to impute: return early, because asking the model to predict
    # on zero rows raises an error.
    if not missing.any():
        return df

    # Local import: the classifier is not among the names imported by the
    # surrounding cells.
    from sklearn.ensemble import RandomForestClassifier

    # CryoSleep is left out of the features, as in the original column slicing.
    # NOTE(review): presumably because it still contains NaN at this point -- confirm.
    features = df.drop(columns=['CryoSleep', target])

    # HomePlanet_num is a category code, so a classifier is used: a regressor
    # would fill the gaps with fractional values that match no planet.
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(features[~missing].to_numpy(), df.loc[~missing, target].astype(int))
    predicted_values = rf_model.predict(features[missing].to_numpy())

    # Write the predictions back into the original frame.
    df.loc[missing, target] = predicted_values.astype(float)

    return df
# Fill the missing HomePlanet_num values of the training frame.
train_df = set_missing_HomePlanet(df=train_df)

In [None]:
def set_missing_CryoSleep(df, random_state=42):
    """Impute missing ``CryoSleep`` flags with a random-forest classifier.

    The model is trained on the rows whose ``CryoSleep`` is known, using all
    other columns as features, and its predictions are written into the rows
    where the flag is missing.

    The target is selected by name, so ``CryoSleep`` does not have to be the
    first column of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``CryoSleep``. All remaining columns are used as
        features and must be numeric without missing values.
    random_state : int or None, default 42
        Seed passed to the forest so the imputation is reproducible.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``CryoSleep`` is filled in place.
    """
    target = 'CryoSleep'
    missing = df[target].isnull()
    # Nothing to impute: return early, because asking the model to predict
    # on zero rows raises an error.
    if not missing.any():
        return df

    # Local import: the classifier is not among the names imported by the
    # surrounding cells.
    from sklearn.ensemble import RandomForestClassifier

    features = df.drop(columns=target)

    # CryoSleep is a yes/no flag, so a classifier is used: a regressor would
    # fill the gaps with fractional values between 0 and 1.
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(features[~missing].to_numpy(), df.loc[~missing, target].astype(int))
    predicted_values = rf_model.predict(features[missing].to_numpy())

    # Write the predictions back into the original frame.
    df.loc[missing, target] = predicted_values.astype(float)

    return df
# Fill the missing CryoSleep values of the training frame.
train_df = set_missing_CryoSleep(df=train_df)

In [None]:
# Fraction of values still missing per column after imputation, highest first.
remaining_nulls = train_df.isnull().sum().sort_values(ascending=False)
print(remaining_nulls / train_df.shape[0])

In [None]:
# Pairwise correlations between all columns of the processed training frame.
correlation_matrix = train_df.corr()

# Draw the matrix as an annotated heatmap centred on zero.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
#特征归一化
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Standardise the continuous columns, then persist the processed training frame.
cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num', 'Deck_num']
scaler = StandardScaler().fit(train_df[cols])
train_df[cols] = scaler.transform(train_df[cols])

train_df.to_csv('processed_train.csv', index=False)

In [None]:
# Preprocess the test set: start by dropping the id and name columns.
test_df = test_df.drop(columns=['PassengerId', 'Name'])

In [None]:
# Split the "Deck/Cabin_num/Side" cabin string into three separate columns.
test_df[["Deck", "Cabin_num", "Side"]] = test_df["Cabin"].str.split("/", expand=True)
test_df = test_df.drop('Cabin', axis=1)
# str.split leaves the cabin numbers as strings. Cast them to float, as is
# done for the training frame, so the mode used below and the values fed to
# the later steps are numeric rather than text.
test_df['Cabin_num'] = test_df['Cabin_num'].astype(float)

# Fill categorical / moderately correlated features with their mode and the
# spending columns with 0. The filled Series is assigned back to the frame:
# fillna(..., inplace=True) on a column selection is chained assignment, which
# warns in pandas 2.x and has no effect once Copy-on-Write is enabled.
test_mode_filled = ['Destination', 'VIP', 'Age', 'Deck', 'Cabin_num', 'Side']
test_zero_filled = ['FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']

for fill_column in test_mode_filled:
    test_df[fill_column] = test_df[fill_column].fillna(test_df[fill_column].mode()[0])
test_df[test_zero_filled] = test_df[test_zero_filled].fillna(0)
test_df.head()

In [None]:
# Same numeric encoding as for the training frame, applied to the test frame.
HomePlanet2num = {'Europa': 0, 'Earth': 1, 'Mars': 2}
Destination2num = {'TRAPPIST-1e': 0, 'PSO J318.5-22': 1, '55 Cancri e': 2}
Side2num = {'P': 0, 'S': 1}

test_df = test_df.assign(
    # Existing columns are cast in place; NaN stays NaN.
    VIP=test_df['VIP'].astype(float),
    CryoSleep=test_df['CryoSleep'].astype(float),
    # New numeric codes are appended after the existing columns, in this order.
    HomePlanet_num=test_df['HomePlanet'].map(HomePlanet2num),
    Destination_num=test_df['Destination'].map(Destination2num),
    # Deck letter -> offset from 'A'; missing decks stay NaN.
    Deck_num=test_df['Deck'].apply(
        lambda deck: ord(deck) - ord('A') if pd.notnull(deck) else np.nan
    ),
).drop(columns=['HomePlanet', 'Destination', 'Deck'])
test_df = test_df.assign(Side=test_df['Side'].map(Side2num))
test_df.head()

In [None]:
def set_missing_HomePlanet(df, random_state=42):
    """Impute missing ``HomePlanet_num`` codes with a random-forest classifier.

    The model is trained on the rows whose ``HomePlanet_num`` is known, using
    every other column except ``CryoSleep`` as features, and its predictions
    are written into the rows where the code is missing.

    Columns are selected by name rather than by a hard-coded position, so the
    function does not depend on the column layout of the frame it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``HomePlanet_num`` and ``CryoSleep``. All remaining
        columns are used as features and must be numeric without missing values.
    random_state : int or None, default 42
        Seed passed to the forest so the imputation is reproducible.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; ``HomePlanet_num`` is filled in place.
    """
    target = 'HomePlanet_num'
    missing = df[target].isnull()
    # Nothing to impute: return early, because asking the model to predict
    # on zero rows raises an error.
    if not missing.any():
        return df

    # Local import: the classifier is not among the names imported by the
    # surrounding cells.
    from sklearn.ensemble import RandomForestClassifier

    # CryoSleep is left out of the features, as in the original column slicing.
    # NOTE(review): presumably because it still contains NaN at this point -- confirm.
    features = df.drop(columns=['CryoSleep', target])

    # HomePlanet_num is a category code, so a classifier is used: a regressor
    # would fill the gaps with fractional values that match no planet.
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(features[~missing].to_numpy(), df.loc[~missing, target].astype(int))
    predicted_values = rf_model.predict(features[missing].to_numpy())

    # Write the predictions back into the original frame.
    df.loc[missing, target] = predicted_values.astype(float)

    return df
# Fill the missing HomePlanet_num values of the test frame.
test_df = set_missing_HomePlanet(df=test_df)

In [None]:
# Fill the missing CryoSleep values of the test frame.
test_df = set_missing_CryoSleep(df=test_df)

In [None]:
#特征归一化
import pandas as pd
from sklearn.preprocessing import StandardScaler
cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num', 'Deck_num']
# Reuse `scaler`, which was fitted on the training frame in the earlier
# normalisation cell. Fitting a fresh StandardScaler on the test frame would
# standardise the two splits with different means and variances, so the same
# raw value would map to different features in train and test.
test_df[cols] = scaler.transform(test_df[cols])
test_df.to_csv('processed_test.csv', index=False)

In [None]:
# Peek at the first five rows of the processed test frame.
test_df.head(n=5)