# 👻 Goal: work with the Titanic data
## Build a model and submit it to Kaggle

### Load the packages

In [1]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# machine learning stack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

### Load the data

In [2]:
# NOTE(review): absolute, machine-specific path — this cell only runs where
# this exact location exists; consider a path relative to the notebook.
TITANIC_TRAIN_CSV = "/Users/wukaiti/Spiced/my_encounter_notes/week_02/Feature_Engineering_Intro/train_titanic.csv"
titanic = pd.read_csv(TITANIC_TRAIN_CSV)

### Preprocessing the data

In [3]:
# Preview the distinct titles found in the passenger names.
def _title_from_name(full_name):
    """Return the title in a name: the second comma-separated piece,
    cut at its first '.', lower-cased and stripped of whitespace."""
    after_comma = full_name.split(',')[1]
    before_dot = after_comma.split('.')[0]
    return before_dot.lower().strip()

titanic["Name"].apply(_title_from_name).unique()

array(['mr', 'mrs', 'miss', 'master', 'don', 'rev', 'dr', 'mme', 'ms',
       'major', 'lady', 'sir', 'mlle', 'col', 'capt', 'the countess',
       'jonkheer'], dtype=object)

In [4]:
# hint: mean age for every (Pclass, Sex) combination
age_by_group = titanic.groupby(['Pclass', 'Sex'])['Age']
age_by_group.mean()

Pclass  Sex   
1       female    34.611765
        male      41.281386
2       female    28.722973
        male      30.740707
3       female    21.750000
        male      26.507589
Name: Age, dtype: float64

In [13]:
# Build the title column in this cell so it also works on a fresh kernel
# ("Restart & Run All"); the column is otherwise only assigned in a later cell.
# The title is the second comma-separated piece of the name, cut at its
# first ".", lower-cased and stripped.
titanic["title"] = titanic["Name"].apply(
    lambda name: name.split(',')[1].split('.')[0].lower().strip()
)
# Bracket access avoids any clash between a column name and a DataFrame attribute
titanic["title"].unique()

array(['mr', 'mrs', 'miss', 'master', 'don', 'rev', 'dr', 'mme', 'ms',
       'major', 'lady', 'sir', 'mlle', 'col', 'capt', 'the countess',
       'jonkheer'], dtype=object)

In [15]:
# Title extraction: take the second comma-separated piece of the name and cut
# it at its first ".", e.g. "Braund, Mr. Owen Harris" -> "mr".
titanic["title"] = titanic["Name"].apply(lambda x: x.split(',')[1].split('.')[0].lower().strip())

# 3.2 binning: handling of rare titles
# hint
# 1. find the list of unique titles
# 2. write a function that does the following transformations:
## ['mrs','mr','miss','master','dr','rev'] remain the same
## ['mlle','ms'] become 'miss'
## 'mme' becomes 'mrs'
## ['col','major','capt'] become 'army'
## ['don','lady','the countess','sir','the count','madam','lord'] become 'nobl'
## other titles become 'unknown'
# 3. use the .apply() method for binning the title column
title_dict = {
    # common titles stay as they are
    'mrs': 'mrs',
    'mr': 'mr',
    'miss': 'miss',
    'master': 'master',
    'dr': 'dr',
    'rev': 'rev',
    # spelling variants are folded into the common titles
    'mlle': 'miss',
    'ms': 'miss',
    'mme': 'mrs',
    # military ranks
    'col': 'army',
    'major': 'army',
    'capt': 'army',
    # nobility ('jonkheer' is grouped here in addition to the hinted list)
    'don': 'nobl',
    'lady': 'nobl',
    'the countess': 'nobl',
    'jonkheer': 'nobl',
    'sir': 'nobl',
    'the count': 'nobl',
    'madam': 'nobl',
    'lord': 'nobl'
}
# Any title missing from the mapping falls back to 'unknown'
titanic['title_class'] = titanic['title'].apply(lambda x: title_dict.get(x, 'unknown'))

# 3.3 imputation of age
# Fill each missing age with the mean age of the passenger's (Pclass, Sex)
# group. transform('mean') yields one value per row, aligned with the frame's
# index, so it can be passed straight to fillna().
group_mean_age = titanic.groupby(['Pclass', 'Sex'])['Age'].transform('mean')
titanic["Age"] = titanic["Age"].fillna(group_mean_age)

# 3.4 imputation of embarkation
# Missing ports are filled with "S" (Southampton in the data dictionary)
titanic["Embarked"] = titanic["Embarked"].fillna("S")

# 3.5 imputation of cabin
titanic["Cabin"] = titanic["Cabin"].fillna("missing cabin")

# 3.6 engineer fare price
# Keep Fare numeric: a missing fare gets the median fare, never a text placeholder.
titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
# Fare per person: divide by the party size INCLUDING the passenger (+1),
# so passengers travelling alone are not divided by zero.
party_size = titanic["SibSp"] + titanic["Parch"] + 1
titanic["fare_per_person"] = titanic["Fare"] / party_size

In [16]:
# data analysis stack
import numpy as np
import pandas as pd

# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# miscellaneous
import warnings
warnings.filterwarnings("ignore")
# Read data: load the training CSV into its own frame `df` and preview the first rows
df = pd.read_csv(
    '/Users/wukaiti/Spiced/my_encounter_notes/week_02/Feature_Engineering_Intro/train_titanic.csv'
)
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
### 2.1 Train-Test split
# Hold out 20% of the rows (fixed seed for a repeatable split) and give both
# parts a fresh 0..n-1 index.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=101)
)


In [18]:
### 2.2 Quick exploration
# Only a cell's last expression is rendered automatically, so the preview is
# shown explicitly with display() (available as a builtin in Jupyter/IPython
# kernels); a bare train.head() in the middle of the cell shows nothing.
display(train.head())
# info() prints column dtypes and non-null counts
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          577 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        157 non-null    object 
 11  Embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.9+ KB


In [None]:
#EDA first

Data Dictionary

| Variable | Definition | Key |
|----------|------------|-----|
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex | |
| Age | Age in years | |
| sibsp | # of siblings / spouses aboard the Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

In [28]:
# Survival flags (0 = No, 1 = Yes) of all female passengers, selected with a
# single .loc call instead of chained indexing
women = titanic.loc[titanic.Sex == 'female', 'Survived']
# The mean of a 0/1 column is the share of survivors (a fraction in [0, 1])
rate_women = women.mean()
# Format the fraction as a percentage so the number matches the "%" label
print(f'% of women survived: {rate_women:.1%}')

% of women survived 0.7420382165605095


In [31]:
# Survival flags (0 = No, 1 = Yes) of all male passengers, selected with a
# single .loc call instead of chained indexing
men = titanic.loc[titanic.Sex == 'male', 'Survived']
# The mean of a 0/1 column is the share of survivors (a fraction in [0, 1])
rate_men = men.mean()
# Format the fraction as a percentage so the number matches the "%" label
print(f'% of men survived: {rate_men:.1%}')

% of men survived 0.18890814558058924


In [39]:
# Columns used as model inputs
features = ['Pclass', 'Sex', 'Parch']
feature_frame = titanic[features]

# One-hot encode the non-numeric columns
X = pd.get_dummies(feature_frame)
# NOTE(review): X_test is built from the same rows as X, so predictions made
# on it are for the training passengers, not for unseen data — confirm intent.
X_test = pd.get_dummies(feature_frame)

# Target labels
y = titanic['Survived']

In [40]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, depth capped at 7, and a fixed seed
rfr = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    random_state=42,
)

In [41]:
# Show the estimator's repr to check its configured hyperparameters
rfr

RandomForestClassifier(max_depth=7, random_state=42)

In [45]:
# Train on (X, y), then predict labels for X_test; fit() returns the fitted
# estimator itself, so the two calls can be chained.
predictions = rfr.fit(X, y).predict(X_test)


In [46]:
# Two-column table: passenger id next to its predicted survival label
passenger_ids = titanic['PassengerId']
result = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})

In [47]:
# index=False keeps the file to exactly the PassengerId and Survived columns;
# by default to_csv would also write the row index as an extra unnamed column.
result.to_csv("results.csv", index=False)

Unnamed: 0,PassengerId,Survived
0,1,0
1,2,1
2,3,1
3,4,1
4,5,0
...,...,...
886,887,0
887,888,1
888,889,0
889,890,0
