# Import test dataset

In [23]:
import numpy as np
import pandas as pd

In [24]:
# Read the test-set CSV (path is relative to the current working directory) into a DataFrame.
test_data = pd.read_csv('test.csv')

In [25]:
# Show the loaded frame as the cell output (pandas truncates the middle rows of long frames).
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [26]:
# Print per-column dtypes and non-null counts to see which columns contain missing values.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


### Preprocess the data the same way as the training set

In [27]:
# Build the working frame without the identifier and free-text columns;
# `test_data` itself is left untouched (drop returns a new frame).
data = test_data.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket'])

In [28]:
from sklearn.impute import SimpleImputer

# Replace missing 'Age' values with the most frequent age in this file.
# NOTE(review): the imputer is fitted on the test data itself; if training filled
# 'Age' with a value learned from the training set, the fill value used here may
# differ — confirm against the training notebook.
impute_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# The single-column frame data[['Age']] is passed because the imputer expects 2-D input.
impute_Age = impute_most_frequent.fit_transform(data[['Age']])
data['Age'] = impute_Age

In [29]:
# Re-fit the same imputer object on 'Fare' and fill its missing entries with the
# most frequent fare in this file.
# NOTE(review): as with 'Age', the fill value is learned from the test data — confirm
# this matches what was done for training.
impute_Fare = impute_most_frequent.fit_transform(data[['Fare']])
data['Fare'] = impute_Fare

In [30]:
# Work on an independent copy of `data`. With a plain alias (X = data) every
# later assignment to X also rewrites `data`, so e.g. binning X['Age'] would
# replace the numeric data['Age'] it is computed from and that cell could not
# be re-run on the original ages.
X = data.copy()

In [31]:
# Bin age into two labelled groups: (0, 12] -> 'Children', (12, inf] -> 'Adult'.
# With right=True and the default include_lowest=False, an age of exactly 0 falls
# outside every bin and becomes NaN.
X['Age'] = pd.cut(data['Age'], bins=[0., 12, np.inf], labels=['Children', 'Adult'], right=True)

In [32]:
# Cap both family-count columns at 1, turning them into 0/1 flags
# (0 = none aboard, 1 = at least one aboard).
X['SibSp'] = data['SibSp'].clip(lower=0, upper=1)
X['Parch'] = data['Parch'].clip(lower=0, upper=1)

In [33]:
# Inspect the engineered feature frame before it is encoded.
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,Adult,0,0,7.8292,Q
1,3,female,Adult,1,0,7.0000,S
2,2,male,Adult,0,0,9.6875,Q
3,3,male,Adult,0,0,8.6625,S
4,3,female,Adult,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,Adult,0,0,8.0500,S
414,1,female,Adult,0,0,108.9000,C
415,3,male,Adult,0,0,7.2500,S
416,3,male,Adult,0,0,8.0500,S


In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features; the remaining columns
# (SibSp, Parch, Fare) are passed through unchanged and appended after the
# encoded ones.
# Columns are selected by name instead of by position, so a change in column
# order cannot silently encode the wrong columns, and accidentally re-running
# this cell once X is already the transformed array raises an error instead of
# one-hot encoding the encoded matrix again.
# NOTE(review): the encoder is fitted on the test data here; confirm it yields
# the same categories and column layout the saved model was trained on.
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Pclass', 'Sex', 'Age', 'Embarked'])],
    remainder='passthrough',
)

X = column_transformer.fit_transform(X)

In [36]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range (MinMaxScaler's default).
# NOTE(review): the min/max are learned from this test matrix. If the model was
# trained on features scaled by a scaler fitted on the training set, the same
# raw value can map to a different scaled value here — confirm, and prefer
# reusing the training-fitted scaler if one was saved.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Import Model

In [37]:
import joblib

# Load the previously saved estimator from the working directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model = joblib.load('Titanic_model.pkl')

In [38]:
# Predict one label per row of the scaled test matrix.
predictions = model.predict(X_scaled)

In [39]:
# Display the raw prediction array.
predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [49]:
# Pair each PassengerId with its prediction and summarise both columns.
# The frame is built from tuples without column names, so the columns are
# labelled 0 (PassengerId) and 1 (prediction).
pd.DataFrame(zip(test_data['PassengerId'], predictions)).describe()

Unnamed: 0,0,1
count,418.0,418.0
mean,1100.5,0.315789
std,120.810458,0.465387
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [51]:
import csv

# Create (or overwrite) Predictions.csv: a header row followed by one
# PassengerId/prediction pair per line.
with open('Predictions.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['PassengerId', 'Survived'])
    writer.writerows(zip(test_data['PassengerId'], predictions))