In [1]:
import pandas as pd
import numpy as np

# Importing the data

In [2]:
df_train_1 = pd.read_csv('./dataset/train.csv')
df_train_1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# # Task 1: Cleaning

### Drop unnecessary columns and NAN rows

In [3]:
df_train_2 = df_train_1.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df_train_2.head()
df_train_2['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [4]:
mean_age = df_train_2['Age'].mean()

df_train_3 = df_train_2.copy()

print(df_train_3.count()['Age'])
df_train_3['Age'] = df_train_3['Age'].replace(np.nan, mean_age)
df_train_3.count()['Age']

714


891

In [5]:
df_train_3[df_train_3['Embarked'].isnull()]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,female,38.0,0,0,80.0,
829,1,1,female,62.0,0,0,80.0,


In [6]:
df_train_2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
np.count_nonzero(df_train_3.isnull())

2

#### We have just 2 NAN rows, we can drop it safely

In [8]:
df_train_3.dropna(inplace=True)
np.count_nonzero(df_train_3.isnull())

0

### Convert Strings

#### Male and Female to 0 and 1

In [9]:
df_train_4 = df_train_3.copy()
df_train_4['Sex'] = df_train_4['Sex'].replace({'male': 0, 'female': 1}).to_numpy()

df_train_4.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


#### Pclass and Embarked aren't numeric values, we can't convert to 0, 1 and 2 because it would be treated as numeric by our model. For example, the age is a numeric value, it's important for our model know that a 70 y.o person is older than a 17 y.o person, or that a 10k fare is greater than a 5k fare. Therefore, if we treat embarked as 0, 1 and 2 the model will consider that 2 embark is greater than the 0, but its just classes just like the Pclass. To solve it, we can use arrays, for instance in the class we can use [1, 0, 0] for class 1, [0, 1, 0] for class 2 and [0, 0, 1] for class 3

In [10]:
def pclass_to_array(Pclass):
    cls_arr = np.zeros(3)
    
    try:
        cls_arr[Pclass - 1] = 1
        return cls_arr
    except:
        return None

def embarked_to_array(embarked):
    cls_arr = np.zeros(3).astype(np.int64)
    
    if str(embarked) in 'CQS':
        cls_arr = np.zeros(3)
        if embarked == 'C':
            cls_arr[0] = 1  
        elif embarked == 'Q':
            cls_arr[1] = 1
        elif embarked == 'S':
            cls_arr[2] = 1
        return cls_arr
    else:
        return None

In [11]:
df_train_5 = df_train_4.copy()

df_train_5['Pclass'] = df_train_5['Pclass'].apply(lambda pclass : pclass_to_array(pclass))
df_train_5['Embarked'] = df_train_5['Embarked'].apply(lambda embarked : embarked_to_array(embarked))

df_train_5.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,"[0.0, 0.0, 1.0]",0,22.0,1,0,7.25,"[0.0, 0.0, 1.0]"
1,1,"[1.0, 0.0, 0.0]",1,38.0,1,0,71.2833,"[1.0, 0.0, 0.0]"
2,1,"[0.0, 0.0, 1.0]",1,26.0,0,0,7.925,"[0.0, 0.0, 1.0]"
3,1,"[1.0, 0.0, 0.0]",1,35.0,1,0,53.1,"[0.0, 0.0, 1.0]"
4,0,"[0.0, 0.0, 1.0]",0,35.0,0,0,8.05,"[0.0, 0.0, 1.0]"


In [12]:
df_train_5.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

# # Task 2: Building Classification Models

### Get the train data

In [13]:
from sklearn.model_selection import train_test_split

pclass = np.array(list(df_train_5['Pclass']))
embarked = np.array(list(df_train_5['Embarked']))
sex = np.array(list(df_train_5['Sex'])).reshape(-1,1)
age = np.array(list(df_train_5['Age'])).reshape(-1,1)
sibsp = np.array(list(df_train_5['SibSp'])).reshape(-1,1)
parch = np.array(list(df_train_5['Parch'])).reshape(-1,1)
fare = np.array(list(df_train_5['Fare'])).reshape(-1,1)


X = np.concatenate((pclass, sex, age, sibsp, parch, fare, embarked), axis=1)
y = df_train_5['Survived'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Building models

#### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(max_iter=300)

clf_log.fit(X_train, y_train)

clf_log.score(X_test, y_test)

print(clf_log.coef_)
df_train_5.columns

[[ 9.88715668e-01  1.39141220e-01 -1.12636017e+00  2.61249059e+00
  -4.24386897e-02 -3.75657339e-01 -6.56381493e-02  2.49559347e-03
   2.06835024e-01  1.30476080e-01 -3.35814390e-01]]


Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

# # Task 3: Evaluation

### Prepare the test data

In [61]:
df_test = pd.read_csv('./dataset/test.csv')
print(df_test.describe())

       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000    1.000000   21.000000    0.000000    0.000000    7.895800
50%    1100.500000    3.000000   27.000000    0.000000    0.000000   14.454200
75%    1204.750000    3.000000   39.000000    1.000000    0.000000   31.500000
max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200


In [54]:
df_test_1 = df_test.drop(columns=['Name', 'Ticket', 'Cabin'])
mean_age = df_test['Age'].mean()
mean_fare = df_test['Fare'].mean()

df_test_1['Fare'] = df_test_1['Fare'].replace(np.nan, mean_fare)
df_test_1['Age'] = df_test_1['Age'].replace(np.nan, mean_age)
df_test_1['Age'].count()

418

In [55]:
def pclass_to_array(Pclass):
    cls_arr = np.zeros(3)
    
    try:
        cls_arr[Pclass - 1] = 1
        return cls_arr
    except:
        return None

def embarked_to_array(embarked):
    cls_arr = np.zeros(3).astype(np.int64)
    
    if str(embarked) in 'CQS':
        cls_arr = np.zeros(3)
        if embarked == 'C':
            cls_arr[0] = 1  
        elif embarked == 'Q':
            cls_arr[1] = 1
        elif embarked == 'S':
            cls_arr[2] = 1
        return cls_arr
    else:
        return None

In [56]:
df_test_2 = df_test_1.copy()
df_test_2['Sex'] = df_test_2['Sex'].replace({'male': 0, 'female': 1}).to_numpy()
df_test_2['Pclass'] = df_test_2['Pclass'].apply(lambda pclass: pclass_to_array(pclass))
df_test_2['Embarked'] = df_test_2['Embarked'].apply(lambda embarked: embarked_to_array(embarked))

df_test_2.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,"[0.0, 0.0, 1.0]",0,34.5,0,0,7.8292,"[0.0, 1.0, 0.0]"
1,893,"[0.0, 0.0, 1.0]",1,47.0,1,0,7.0,"[0.0, 0.0, 1.0]"
2,894,"[0.0, 1.0, 0.0]",0,62.0,0,0,9.6875,"[0.0, 1.0, 0.0]"
3,895,"[0.0, 0.0, 1.0]",0,27.0,0,0,8.6625,"[0.0, 0.0, 1.0]"
4,896,"[0.0, 0.0, 1.0]",1,22.0,1,1,12.2875,"[0.0, 0.0, 1.0]"


In [57]:
df_test_2.count()

PassengerId    418
Pclass         418
Sex            418
Age            418
SibSp          418
Parch          418
Fare           418
Embarked       418
dtype: int64

In [58]:
passenger_id = np.array(list(df_test_2['PassengerId']))

pclass = np.array(list(df_test_2['Pclass']))
embarked = np.array(list(df_test_2['Embarked']))
sex = np.array(list(df_test_2['Sex'])).reshape(-1,1)
age = np.array(list(df_test_2['Age'])).reshape(-1,1)
sibsp = np.array(list(df_test_2['SibSp'])).reshape(-1,1)
parch = np.array(list(df_test_2['Parch'])).reshape(-1,1)
fare = np.array(list(df_test_2['Fare'])).reshape(-1,1)

test_X = np.concatenate((pclass, sex, age, sibsp, parch, fare, embarked), axis=1)

survived = clf_log.predict(test_X)

prediction = pd.DataFrame(data={'PassengerId': passenger_id, 'Survived': survived})

prediction.to_csv('prediction.csv', index=False)