https://www.kaggle.com/rafaelvleite/titanic-artificial-neural-network-80-score/code

In [1]:
import numpy  as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import re

### Load data

In [2]:
# Read the three CSV files from the local data/ directory in one pass.
data_train, data_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv", "data/gender_submission.csv")
)

# Stack the training rows on top of the test rows under a fresh 0..N-1 index,
# so one frame carries both parts through the preparation steps.
data = pd.concat([data_train, data_test], ignore_index=True)

data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


### Complete missing data

In [3]:
# Fill the missing Age, Fare and Embarked values of the combined frame.
# fillna() addresses missing values directly; no regex matching is involved.
# Numeric columns receive their column mean (NaNs are skipped by mean()).
data['Age']      = data['Age'].fillna(data['Age'].mean())
data['Fare']     = data['Fare'].fillna(data['Fare'].mean())
# A missing port of embarkation becomes its own placeholder category "M".
data['Embarked'] = data['Embarked'].fillna("M")

### Add new data fields

In [4]:
# FamilySize adds SibSp and Parch plus one (presumably the passenger themself).
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# IsAlone is 1 unless FamilySize is greater than 1, in which case it is 0.
data['IsAlone'] = (~(data['FamilySize'] > 1)).astype('int64')

# Get the title: the text after the first ", " and before the following ".",
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
_after_comma = data['Name'].str.split(", ", expand=True)[1]
data['Title'] = _after_comma.str.split(".", expand=True)[0]

# Raw titles grouped by the coarse category each one is collapsed into.
_TITLE_GROUPS = {
    "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Royalty": ["Jonkheer", "Don", "Sir", "the Countess", "Lady", "Dona"],
    "Mr":      ["Mr"],
    "Mrs":     ["Mme", "Ms", "Mrs"],
    "Miss":    ["Miss", "Mlle"],
    "Master":  ["Master"],
}

# Flat lookup: raw title -> category.
Title_Dictionary = {
    raw_title: category
    for category, raw_titles in _TITLE_GROUPS.items()
    for raw_title in raw_titles
}

# map() leaves NaN for any raw title that is not a key of the lookup.
data['Title'] = data['Title'].map(Title_Dictionary)

data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,IsAlone,Title
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,2,0,Mr
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,2,0,Mrs
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,1,1,Miss
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,2,0,Mrs
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,1,1,Mr


### Remove some data fields

In [5]:
# Columns removed from the working frame before encoding
# (Title has already been derived from Name at this point).
drop_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket']
data.drop(columns=drop_columns, inplace=True)

data.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,FamilySize,IsAlone,Title
0,22.0,S,7.25,0,3,male,1,0.0,2,0,Mr
1,38.0,C,71.2833,0,1,female,1,1.0,2,0,Mrs
2,26.0,S,7.925,0,3,female,0,1.0,1,1,Miss
3,35.0,S,53.1,0,1,female,1,1.0,2,0,Mrs
4,35.0,S,8.05,0,3,male,0,0.0,1,1,Mr


### One hot encoding

In [12]:
# Column groups, named by how each group is prepared for the model.
target_fields = ['Survived']
normalize_fields = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch', 'FamilySize']
onehot_fields = ['Sex', 'Embarked', 'IsAlone', 'Title']  # 'Pclass' is in normalize_fields instead

# get_dummies expands the string-valued columns into one indicator column per
# category; IsAlone is already numeric, so it is passed through unchanged.
data_onehot = pd.get_dummies(data.loc[:, onehot_fields])
data_onehot.head()

Unnamed: 0,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_M,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,0,0,1,0,0,0,1,0,0,1,0,0,0
1,0,1,0,1,0,0,0,0,0,0,1,0,0
2,1,1,0,0,0,0,1,0,1,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,1,0,0
4,1,0,1,0,0,0,1,0,0,1,0,0,0


### Normalization

In [13]:
# x holds the columns listed in normalize_fields as a float array
x = data.loc[:, normalize_fields].astype(float).values

# Scaler objects; only the min-max scaler is applied below
# (std_scaler is created here but not used in this cell).
std_scaler     = preprocessing.StandardScaler()
min_max_scaler = preprocessing.MinMaxScaler()

# Learn each column's min/max from x, then rescale x with them.
# NOTE(review): x includes the test rows as well, so the scaling ranges are
# learned from train and test together — confirm that this is intended.
x_scaled = min_max_scaler.fit(x).transform(x)

# Wrap the scaled array in a DataFrame (columns are labelled 0..5)
data_normalized = pd.DataFrame(data=x_scaled)

data_normalized.head()

Unnamed: 0,0,1,2,3,4,5
0,0.273456,0.014151,1.0,0.125,0.0,0.1
1,0.473882,0.139136,0.0,0.125,0.0,0.1
2,0.323563,0.015469,1.0,0.0,0.0,0.0
3,0.436302,0.103644,0.0,0.125,0.0,0.1
4,0.436302,0.015713,1.0,0.0,0.0,0.0


In [22]:
# Feature matrix: the one-hot columns followed by the scaled numeric columns.
# ignore_index=True along axis=1 relabels the resulting columns 0..N-1.
input_data = pd.concat([data_onehot, data_normalized], axis=1, ignore_index=True)

# Splitting the dataset into Train and Test.
# `data` was built as the train rows followed by the test rows, so the first
# len(data_train) rows are the training part. Deriving the boundary from
# data_train keeps the split correct whatever the size of train.csv.
n_train = len(data_train)
train_x = input_data.iloc[:n_train, :]
test_x  = input_data.iloc[n_train:, :]

# Labels come from the untouched training frame, flattened to a 1-D array.
train_y = data_train[target_fields].values.ravel()

input_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0,0,1,0,0,0,1,0,0,1,0,0,0,0.273456,0.014151,1.0,0.125,0.0,0.1
1,0,1,0,1,0,0,0,0,0,0,1,0,0,0.473882,0.139136,0.0,0.125,0.0,0.1
2,1,1,0,0,0,0,1,0,1,0,0,0,0,0.323563,0.015469,1.0,0.0,0.0,0.0
3,0,1,0,0,0,0,1,0,0,0,1,0,0,0.436302,0.103644,0.0,0.125,0.0,0.1
4,1,0,1,0,0,0,1,0,0,1,0,0,0,0.436302,0.015713,1.0,0.0,0.0,0.0


### Model

In [23]:
# Multi-layer perceptron with one hidden layer of 150 ReLU units, trained
# with the L-BFGS solver; random_state is fixed for reproducible results.
# hidden_layer_sizes is given as a one-element tuple: (150) without the
# trailing comma is just the integer 150, not a tuple.
model = MLPClassifier(activation='relu', solver='lbfgs', hidden_layer_sizes=(150,), random_state=10)
model.fit(train_x, train_y)
pred = model.predict(train_x)

# NOTE: this is accuracy on the same rows the model was fitted on (training
# accuracy), not a held-out estimate.
# accuracy_score expects (y_true, y_pred) in that order.
accuracy_score(train_y, pred)

0.8664421997755332

## Submission

In [24]:
# Predict a label for every row of the test part of the feature matrix.
test_pred = model.predict(test_x)

# Two-column frame (PassengerId, Survived), written without the index column.
sub = data_test[["PassengerId"]].assign(Survived=test_pred)
sub.to_csv("submission.csv", index=False)

---

In [9]:
# Small toy table: one row per (patient, obs) pair, with a categorical score.
raw_data = dict(
    patient=[1, 1, 1, 2, 2],
    obs=[1, 2, 3, 1, 2],
    treatment=[0, 1, 0, 1, 0],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)

# The column order is fixed explicitly rather than taken from the dict.
df = pd.DataFrame(
    data=raw_data,
    columns=['patient', 'obs', 'treatment', 'score'],
)
df

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,strong
1,1,2,1,weak
2,1,3,0,normal
3,2,1,1,weak
4,2,2,0,strong


In [12]:
# Build a label (category) encoder and fit it to the 'score' column in one
# step; fit() returns the fitted encoder itself.
le = preprocessing.LabelEncoder().fit(df['score'])

# Encode every score as the integer index of its class.
le.transform(df['score'])

array([1, 2, 0, 2, 1])