In [1]:
import numpy as np 
import pandas as pd 

In [2]:
train = pd.read_csv('../input/titanic/train.csv').set_index("PassengerId")

In [None]:
train[-10:]

In [None]:
# We will star by creating the dataset 

X_train =  train.drop(["Survived"], axis=1)
y_train = train["Survived"].astype(int)

In [None]:
# We will star by  using the Logistic regression model 
from sklearn.linear_model import LogisticRegression 

# We have some categorical data we need to 
X_train = X_train.drop(['Name','Sex', 'Ticket','Cabin',"Embarked"], axis = 1)
X_train = X_train.fillna(0) # We fill the NaN


In [None]:
# We initiate the model 
model = LogisticRegression ()
model.fit(X_train, y_train)

In [None]:
# Now we wil evaluate 
from sklearn.metrics import accuracy_score

In [None]:
y_pred = model.predict(X_train)
accuracy_score(y_train, y_pred)

In [None]:
# We will use another model to improve the results 

from sklearn.tree import DecisionTreeClassifier 

model_1 = DecisionTreeClassifier()
model_1.fit(X_train, y_train)
y_pred = model_1.predict(X_train)
accuracy_score(y_train, y_pred)


In [None]:
# Now we will make a prediction for the test data 
# First we will shape the test data as the train

X_test = pd.read_csv("../input/titanic/test.csv").set_index("PassengerId")

X_test = X_test.drop(['Name','Sex', 'Ticket','Cabin',"Embarked"], axis = 1)
X_test = X_test.fillna(0) # We fill the NaN


In [None]:
# Now it's time to predict with model_1 on the test data 

y_pred_test = model_1.predict(X_test)


In [None]:
# We will create a function to make the csv file we need 

def make_submission(X_test, y_pred_test, title):
    submission = pd.DataFrame({
        "PassengerId" : X_test.index, 
        "Survived" : y_pred_test
    })
    submission.to_csv(title, index=False)

In [None]:
make_submission(X_test, y_pred_test, 'submission.csv')

The prediction we made was not good so we will try to improve by using the categorical data 


In [3]:
# We will star by creating the dataset 

X_train =  train.drop(["Survived"], axis=1)
y_train = train["Survived"].astype(int)

In [None]:
X_train 

We also need to check the test data so we can create a model that can be suited for both 

In [4]:
test = pd.read_csv('../input/titanic/test.csv').set_index("PassengerId")
train = pd.read_csv('../input/titanic/train.csv').set_index("PassengerId")

In [5]:
# First we can see the Name, Ticket No are not useful so we will drop this as well as the Cabin since a lot of data is missing 

X_train = train.drop(['Name','Ticket', 'Cabin','Survived'], axis=1)
y_train = (train['Survived'])
X_test = test.drop(['Name','Ticket', 'Cabin'], axis=1)


In [None]:
# Now we will fill the data missing, first we will check where we have data missing 

import missingno as msng 

msng.matrix(X_train)

In [6]:
# We need to fill the Age and Embarked columns. 

fill_age = X_train['Age'].mean()

X_train["Age"] = X_train['Age'].fillna(value= fill_age) # We fill the NaN
X_train['Embarked'] = X_train['Embarked'].fillna(0)

In [None]:
msng.matrix(X_train)

In [7]:
# We will check the unique values on the columns 
for col in X_train:
    print(X_train[col].unique())

In [8]:
# We observe the Embarked column is not correctly filled since we have a 0
# We will use the S value since the S is the most common value 
X_train['Embarked']=X_train['Embarked'].replace(0,"S")



In [9]:
# We review the problem is solved 
for col in X_train:
    print(X_train[col].unique())

In [None]:
# We will do the same thing for the test data 

msng.matrix(X_test)

In [10]:
# It seems our data can be filled easily since the missing data is all discrete type value 
fill_testa = X_test['Age'].mean()
fill_testf = X_test['Fare'].mean()
X_test ["Age"] = X_test["Age"].fillna(value = fill_testa) # We fill the NaN
X_test ['Fare'] = X_test['Fare'].fillna(value = fill_testf)

In [11]:
# We check the unique values 

for col in X_test:
    print(X_test[col].unique())

In [12]:
# We can se the ranges of SibsP goes from [0-8] on both cases 
# however Parch have different range values [0-6] and [0-9]

X_test['Parch'][X_test['Parch'] == 9]

In [13]:
# We are planning to use one hot encoding so we need to move this rows and recreate a result

X_test_toadd = X_test.loc[[1234,1257] , :]

In [14]:
# We will add the rows to the end of the X_train data 
X_train = X_train.append(X_test_toadd, ignore_index = True)

In [15]:
# We check the data at the end is added 
X_train

In [16]:
# Now we will  provide the result of this two rows at the end of y_train 
to_add =[0,0]
y_train = np.append(y_train, to_add)

In [28]:
# We check the values are inside the vector correctly 
len(y_train)

In [18]:
# Now it's time to 
for col in X_train:
    print(X_train[col].unique())

In [20]:
# Now that the train data have the existance of the 9 Parch we will onehot encode 
# We will normalize as well 

# We will borrow some classes from Scikit-Learn 
# to normalize the data and use onehot encoder for the categorical classes
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Create a column transformer to use the MinMaxScales and OnehotEncoder
ct= make_column_transformer(
    (MinMaxScaler(), ['Age', 'Fare']), #turn all values in these columns between 0 and 1
    (OneHotEncoder(handle_unknown="ignore"), ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'])
)



In [21]:
# We now fit the data 
ct.fit(X_train)

#We finally transform the X_train 
X_train_normal = ct.transform(X_train)

In [29]:
# We need to convert the train data to pd.DataFrame again 
X_train_normal = X_train_normal.toarray()
X_train_normal = pd.DataFrame(X_train_normal)

In [23]:
from sklearn.metrics import accuracy_score

In [25]:
# Now it's time to do it with the normalized test data 
# 
ct.fit(X_test)

#We finally transform the X_train 
X_test_normal = ct.transform(X_test)

In [26]:
# We need to convert the train data to pd.DataFrame again 
X_test_normal = X_test_normal.toarray()
X_test_normal = pd.DataFrame(X_test_normal)

In [34]:
len(X_train_normal), len(y_train)

In [35]:
# We will use rainforest classifier 
from sklearn.ensemble import RandomForestClassifier 

model_1 = RandomForestClassifier()
model_1.fit(X_train_normal, y_train)
y_pred = model_1.predict(X_train_normal)
accuracy_score(y_train, y_pred)

In [37]:
y_preds = model_1.predict(X_test_normal)

In [38]:
def make_submission(X_test_normal, y_preds, title):
    submission = pd.DataFrame({
        "PassengerId" : X_test.index, 
        "Survived" : y_preds
    })
    submission.to_csv(title, index=False)

In [40]:
make_submission(X_test_normal, y_preds, 'submission.csv')

With these result we improved a little obtaining 0.73684  now we will try with another model 


In [41]:
# We will use neural network with binary classifier 
import tensorflow as tf 
# Lets recreate a model to fit on the training data and evaluate on the train test 

# Create the seed 
tf.random.set_seed(42)

#Create the model (same as last model)

#1. Create a model with a non-linear activation 
model_2 = tf.keras.Sequential([
                                tf.keras.layers.Dense(7, activation=tf.keras.activations.relu),
                                tf.keras.layers.Dense(5, activation=tf.keras.activations.relu),
                                tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid)
])

#2. Compile the model 
model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(lr=0.01),
                metrics=["Accuracy"])

#3. Fit the model 
model_2.fit(X_train_normal,y_train, epochs=100)

In [45]:
y_pred = model_2.predict(X_test_normal)
y_pred = tf.round(y_pred)
y_pred = tf.squeeze(y_pred)

In [None]:
make_submission(X_test_normal, y_preds, 'submission.csv')

We've got a lower score of 0.55263

In [46]:
import catboost as cb 
model_3 = cb.CatBoostClassifier()
model_3.fit(X_train_normal, y_train)
y_pred = model_3.predict(X_train_normal)
accuracy_score(y_train, y_pred)

In [47]:
y_preds = model_3.predict(X_test_normal)

In [48]:
y_preds

In [49]:
make_submission(X_test_normal, y_preds, 'submission.csv')

The third model scored 0.77511 using bootcast 