In [None]:
#Titanic Kaggle competition
from sklearn import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the three competition CSVs: the training split, the test split and the
# sample submission file.
train, test, submit = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)
# Print the column dtypes and non-null counts of both splits.
for _split in (train, test):
    _split.info()

In [None]:
# Merge train and test so that the whole data set can be inspected and
# engineered together. Both source frames carry their own 0-based index, so
# ignore_index=True rebuilds a single 0..n-1 index for the merged frame.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
data = pd.concat([train, test], ignore_index=True)
data.head()

In [None]:
# %matplotlib inline
# Each chart gets its own axes; drawing both on the implicit current axes would
# stack the second chart on top of the first.
# The plotted vectors are passed as x= keywords: recent seaborn releases accept
# only `data` positionally, so a positional Series is not read as the x variable.
fig, (ax_balance, ax_pclass) = plt.subplots(1, 2, figsize=(12, 4))
# Check whether the two Survived classes are strongly imbalanced.
sns.countplot(x=data['Survived'], ax=ax_balance)
# Relationship between ticket class and survival.
sns.countplot(x=data['Pclass'], hue=data['Survived'], ax=ax_pclass)
plt.show()

In [None]:
# Relationship between age and survival: one histogram panel per Survived value.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm
# the installed version before upgrading, then consider sns.histplot.
g = sns.FacetGrid(data=data, col='Survived').map(sns.distplot, 'Age', kde=False)
plt.show()

In [None]:
# Define a new feature: Family_Size = Parch + SibSp.
data['Family_Size'] = data['Parch'].add(data['SibSp'])
# Histogram of Family_Size, one panel per Survived value.
g = sns.FacetGrid(data=data, col='Survived').map(sns.distplot, 'Family_Size', kde=False)
plt.show()

In [None]:
# Feature engineering: the name itself cannot be used for prediction, but the
# title (honorific) embedded in it may be related to survival.
# Step 1: split the name on ", " and keep the second piece.
name_parts = data['Name'].str.split(", ", expand=True)
# Step 2: split that piece on "." and keep the first piece, i.e. the title.
data['Title1'] = name_parts[1].str.split(".", expand=True)[0]

# Show the distinct titles that were found.
data['Title1'].unique()

In [None]:
# Feature engineering: analyse the title against another feature.
# Count passengers per (title, sex) pair, then transpose so that sexes are the
# rows and titles the columns, and shade the cells by count.
title_sex_counts = pd.crosstab(index=data['Title1'], columns=data['Sex'])
title_sex_counts.T.style.background_gradient(cmap='summer_r')

In [None]:
# Some titles are held by very few passengers, so fold them into the common
# Miss / Mrs / Mr groups. Titles that are not listed are left unchanged.
title_merge_map = {
    'Mlle': 'Miss',
    'Mme': 'Mrs',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'the Countess': 'Mrs',
    'Jonkheer': 'Mr',
    'Col': 'Mr',
    'Rev': 'Mr',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
    'Dona': 'Mrs',
}
data['Title2'] = data['Title1'].replace(title_merge_map)
data['Title2'].unique()

In [None]:
# Reduce each ticket to its leading alphabetic code (which the original author
# takes to indicate the room location) and drop the trailing number; tickets
# that consist of digits only are labelled 'X'.
def _ticket_prefix(ticket):
    """Return the first space-separated token of `ticket` with '.' and '/' removed.

    A ticket made up of digits only yields the placeholder 'X'.
    """
    if ticket.isdigit():
        return 'X'
    return ticket.replace(".", "").replace("/", "").strip().split(' ')[0]


data['Ticket_info'] = data['Ticket'].apply(_ticket_prefix)
# The plotted vector is passed as x=: recent seaborn releases accept only `data`
# positionally, so a positional Series is not read as the x variable.
sns.countplot(x=data['Ticket_info'], hue=data['Survived'])

In [None]:
# Handle missing values.
# Embarked: only a few values are missing (per the original note); fill them with
#   'S', which the original author identifies as the most frequent port.
# Fare: only one value is missing (per the original note); fill it with the mean.
# Cabin: keep just the leading letter; missing cabins are labelled 'NoCabin'.
data['Embarked'] = data['Embarked'].fillna('S')
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Cabin'] = data['Cabin'].apply(
    lambda cabin: 'NoCabin' if pd.isnull(cabin) else str(cabin)[0]
)
# The plotted vector is passed as x=: recent seaborn releases accept only `data`
# positionally, so a positional Series is not read as the x variable.
sns.countplot(x=data['Cabin'], hue=data['Survived'])
data.info()

In [None]:
# Convert the categorical columns to integers: each column is cast to pandas'
# category dtype and replaced by its integer category codes.
for _column in ['Sex', 'Embarked', 'Pclass', 'Title1', 'Title2', 'Cabin', 'Ticket_info']:
    data[_column] = data[_column].astype('category').cat.codes

In [None]:
# Use a random forest to estimate the missing ages.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


def _within_n_std(values, n_std=4):
    """Return a boolean mask that is True where `values` is at most `n_std`
    standard deviations away from its own mean."""
    return (values - values.mean()).abs() <= n_std * values.std()


# .copy() gives two independent frames, so writing the predicted ages into
# dataAgeNull below is a plain assignment rather than a write to a slice of `data`.
dataAgeNull = data[data['Age'].isnull()].copy()
dataAgeNotNull = data[data["Age"].notnull()].copy()

# Training rows with outliers removed: keep a row only when BOTH Fare and
# Family_Size lie within 4 standard deviations of their means. (Selecting rows
# where either value is beyond 4 std would train on the outliers alone.)
remove_outlier = dataAgeNotNull[
    _within_n_std(dataAgeNotNull["Fare"])
    & _within_n_std(dataAgeNotNull["Family_Size"])
]
rfModel_age = RandomForestRegressor(n_estimators=2000, random_state=42)
ageColumns = ['Embarked', 'Fare', 'Pclass', 'Sex', 'Family_Size', 'Title1', 'Title2', 'Cabin', 'Ticket_info']
rfModel_age.fit(remove_outlier[ageColumns], remove_outlier["Age"])

# Skip prediction when nothing is missing (e.g. when this cell is re-run after
# the ages were already filled); predicting on an empty frame raises an error.
if not dataAgeNull.empty:
    ageNullValues = rfModel_age.predict(X=dataAgeNull[ageColumns])
    dataAgeNull.loc[:, "Age"] = ageNullValues

# Re-assemble the full data set (imputed rows first, then the rows that already
# had an age) with a fresh 0..n-1 index. pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
data = pd.concat([dataAgeNull, dataAgeNotNull], ignore_index=True)

In [None]:
# Keep only the features that are needed.
# Rows with a Survived value form the training set and rows without one form the
# test set; both are ordered by PassengerId before the columns are selected.
hasLabel = data['Survived'].notnull()
featureColumns = ['Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'Family_Size', 'Title2', 'Ticket_info', 'Cabin']
dataTrain = data[hasLabel].sort_values(by=["PassengerId"])[['Survived'] + featureColumns]
dataTest = data[~hasLabel].sort_values(by=["PassengerId"])[featureColumns]
dataTrain

In [None]:
# Use a random forest to predict survival.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=1000,
    min_samples_split=12,
    min_samples_leaf=1,
    oob_score=True,   # keep the out-of-bag score so it can be reported below
    random_state=1,
    n_jobs=-1,
)
# Column 0 of dataTrain is the Survived label; the remaining columns are features.
rf.fit(dataTrain.iloc[:, 1:], dataTrain.iloc[:, 0])
# Report the out-of-bag score with four decimals. The string-formatting operator
# is '%'; '&' between a str and a float raises TypeError.
print("%.4f" % rf.oob_score_)

In [None]:
# Produce the submission file.
# Rank the features by importance (top 20). The table is bound to a name and
# printed, because a bare expression in the middle of a cell is never displayed.
featureImportance = pd.DataFrame({
    'variable': dataTrain.columns[1:],
    'importance': rf.feature_importances_,
}).sort_values(by='importance', ascending=False)[:20]
print(featureImportance)

# Predict the test rows and store the labels as integers.
# The predictions are assigned by position — assumes the rows of
# gender_submission.csv are in the same PassengerId order as dataTest; TODO confirm.
rf_res = rf.predict(dataTest)
submit['Survived'] = rf_res.astype(int)
submit.to_csv('submit.csv', index=False)