In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the raw train and test splits from the Colab working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

In [3]:
# Build a single frame so every row can be processed in one pass; the 'set'
# column remembers which split each row came from.
train['set'], test['set'] = 'train', 'test'
# ignore_index=True gives every row a unique label. Without it the train and
# test labels overlap (both start at 0), and any label-based operation on
# `combined` (e.g. DataFrame.drop(label)) would hit two rows at once.
combined = pd.concat([train, test], ignore_index=True)

**MISSING VALUE**

In [4]:
#mengisi missing value dan menghilangkan kolom yang tidak digunakan

In [5]:
# Number of missing values per column.
combined.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
set               0
dtype: int64

In [6]:
# Fill the missing Fare with the median Fare of the same passenger class.
# The class is taken from the first row whose Fare is missing.
fare_missing = combined['Fare'].isnull()
pclass = combined.loc[fare_missing, 'Pclass'].values[0]
same_class = combined['Pclass'] == pclass
median_fare = combined.loc[same_class, 'Fare'].median()
combined.loc[fare_missing, 'Fare'] = median_fare

In [7]:
# Missing Age, step 1: pull the title out of each name, i.e. the run of
# letters immediately followed by a '.'.
# The pattern is a raw string so that '\.' reaches the regex engine untouched;
# in a plain string literal '\.' is an invalid escape sequence.
# expand=False returns a Series for the single capture group, which is what a
# column assignment expects.
combined['Title'] = combined['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
combined['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [8]:
# Collapse the raw titles into a handful of groups. Each reduced title lists
# the raw titles that are folded into it.
_reduced_to_raw = {
    'Mr': ('Mr', 'Don', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer'),
    'Mrs': ('Mrs', 'Lady', 'Countess', 'Dona'),
    'Miss': ('Miss', 'Mme', 'Ms', 'Mlle'),
    'Master': ('Master',),
    'Rev': ('Rev',),
    'Dr': ('Dr',),
}
# Invert into the raw -> reduced lookup used by Series.map.
title_reduction = {
    raw: reduced
    for reduced, raws in _reduced_to_raw.items()
    for raw in raws
}
combined['Title'] = combined['Title'].map(title_reduction)
combined['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rev', 'Dr'], dtype=object)

In [9]:
# Missing Age, step 2: fill each missing age with the median age of the
# passengers that share the same (reduced) title.
# Series.items() replaces Series.iteritems(), which no longer exists in
# pandas 2.x.
for title, age in combined.groupby('Title')['Age'].median().items():
    print(title, age)
    combined.loc[(combined['Title'] == title) & (combined['Age'].isnull()), 'Age'] = age

Dr 49.0
Master 4.0
Miss 22.0
Mr 30.0
Mrs 36.0
Rev 41.5


In [10]:
# Re-check the number of missing values per column after imputation.
combined.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Cabin          1014
Embarked          2
set               0
Title             0
dtype: int64

In [11]:
def other_family_members_survived(dataset, label='family_survival'):
    """
    Flag, for every row of a group, whether the *other* rows of the group survived.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One group of passengers; must contain a 'Survived' column
        (missing values are treated as "unknown").
    label : str
        Name of the column that receives the flag.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataset` with the extra column `label`:

          -> 0    the other members have known outcomes and their mean is 0
                  (nobody survived)
          -> 1    at least one other member survived
          -> 0.5  unknown: no other member has a known outcome, or the
                  group consists of a single row
    """
    ds = dataset.copy()
    if len(dataset) == 1:
        ds[label] = 0.5
        return ds

    survived = dataset['Survived'].to_numpy(dtype=float)
    result = []
    for position in range(len(survived)):
        # Exclude the current row by *position*, not by index label: with
        # duplicate labels a label-based drop would remove several rows.
        others = np.delete(survived, position)
        known = others[~np.isnan(others)]
        if known.size == 0:
            result.append(0.5)
        elif known.mean() == 0:
            result.append(0)
        else:
            result.append(1)
    ds[label] = result
    return ds

In [12]:
# Approximate a family as the passengers sharing a surname (the part of Name
# before the first comma) and the same Fare.
combined['surname'] = combined['Name'].str.split(',').str[0]
# Iterate over the groups explicitly instead of using groupby(...).apply(...):
# every group frame then keeps its grouping columns ('surname', 'Fare') and
# the resulting row order (grouped by key) no longer depends on the pandas
# version in use.
combined = pd.concat(
    [other_family_members_survived(group)
     for _, group in combined.groupby(['surname', 'Fare'])]
).reset_index(drop=True)

In [13]:
# Missing family information can also be recovered from the ticket: passengers
# booked together share the same ticket number.
# Groups are iterated explicitly (rather than via groupby(...).apply(...)) so
# that the 'Ticket' column is kept in every group frame regardless of the
# pandas version.
combined = pd.concat(
    [other_family_members_survived(group, label='family_survival_ticket')
     for _, group in combined.groupby('Ticket')]
).reset_index(drop=True)
# Where the surname-based flag is still "unknown" (0.5), fall back to the
# ticket-based flag.
unknown_family = combined['family_survival'] == 0.5
combined.loc[unknown_family, 'family_survival'] = combined.loc[unknown_family, 'family_survival_ticket']

In [14]:
# Derive the family size as the sum of the Parch and SibSp columns.
combined['family_size'] = combined['Parch'].add(combined['SibSp'])

In [15]:
# Replace the Sex labels by integer codes.
sex_encoder = LabelEncoder()
combined['Sex'] = sex_encoder.fit_transform(combined['Sex'])

In [16]:
# Replace Age and Fare by the index of their quantile bin
# (4 bins for Age, 5 bins for Fare).
for column, n_bins in (('Age', 4), ('Fare', 5)):
    combined.loc[:, column] = pd.qcut(combined[column], n_bins, labels=False)

In [17]:
# Choose the columns used as model features and standardise them in place.
selected = ['Pclass', 'Sex', 'Age', 'Fare', 'family_size', 'family_survival']
scaler = StandardScaler()
combined[selected] = scaler.fit_transform(combined[selected])

In [18]:
# Persist the processed frame (train and test rows together) as Parquet,
# without writing the index.
combined.to_parquet('titanic_family_survivabillity.parquet', index=False)

In [19]:
# Split the processed frame back into its two parts using the 'set' marker.
# The test part also loses 'Survived', which is what gets predicted later.
is_train = combined['set'].eq('train')
is_test = combined['set'].eq('test')
train = combined[is_train].drop(columns='set').reset_index(drop=True)
test = combined[is_test].drop(columns=['set', 'Survived']).reset_index(drop=True)

**MODEL KNN**

In [20]:
def euclidean_distance(vector1, vector2):
    """Return the square root of the summed squared element-wise differences
    between `vector1` and `vector2` (their Euclidean distance)."""
    offset = vector1 - vector2
    squared_offset = offset ** 2
    return np.sqrt(np.sum(squared_offset))

# Sanity check on a 3:4:5 right triangle: the distance between the two points
# below has to come out as 5.
vec1, vec2 = np.array([3, 0]), np.array([0, 4])
euclidean_distance(vec1, vec2)

5.0

In [21]:
# A first implementation
def get_nearest_neighbor(vector, dataset, number_of_neighbors=1, ignore_cols=['Survived']):
    """Return the `number_of_neighbors` rows of `dataset` closest to `vector`.

    Distances are computed with `euclidean_distance` over all columns that are
    not listed in `ignore_cols`; the returned rows keep every column.

    Note: a second function with the same name is defined right after this one
    in the same cell, so this version is replaced as soon as the cell has run.
    """
    feature_rows = dataset.loc[:, ~dataset.columns.isin(ignore_cols)]
    scored = [
        (euclidean_distance(features, vector), row_label)
        for row_label, features in feature_rows.iterrows()
    ]
    # Order by distance only; rows at equal distance keep their original order.
    scored.sort(key=lambda pair: pair[0])
    nearest_labels = [row_label for _, row_label in scored[:number_of_neighbors]]
    return dataset.loc[nearest_labels]

# Another implementation using Pandas
def get_nearest_neighbor(vector, dataset, number_of_vectors=1, ignore_cols=['Survived'], not_count_duplicates=False):
    """Return the rows of `dataset` that lie closest to `vector`.

    Parameters
    ----------
    vector : pandas.Series or array-like
        Query point. A Series is aligned with the feature columns by label.
    dataset : pandas.DataFrame
        Candidate rows; it is not modified.
    number_of_vectors : int
        How many neighbours (or, with `not_count_duplicates`, how many
        distinct distance values) to keep.
    ignore_cols : list of str
        Columns excluded from the distance computation (only read, never
        mutated, so the shared default list is safe).
    not_count_duplicates : bool
        If True, rows at the same distance count as one: every row whose
        distance is within the `number_of_vectors` smallest distinct distances
        is returned, in the original row order.

    Returns
    -------
    pandas.DataFrame
        The selected rows with all original columns. Without
        `not_count_duplicates` they are ordered by increasing distance, ties
        keeping their original order. An empty frame is returned when nothing
        can be selected.
    """
    features = dataset.loc[:, ~dataset.columns.isin(ignore_cols)]
    # Vectorised Euclidean distance of every row to `vector`.
    distance = np.sqrt(((features - vector) ** 2).sum(axis=1))

    if not_count_duplicates:
        cutoffs = np.sort(distance.unique())[:number_of_vectors]
        if len(cutoffs) == 0:
            # Empty dataset or number_of_vectors == 0: nothing to select.
            return dataset.iloc[0:0].copy()
        return dataset.loc[(distance <= cutoffs.max()).to_numpy()].copy()

    # A stable sort makes the choice among equidistant rows deterministic.
    nearest = np.argsort(distance.to_numpy(), kind='stable')[:number_of_vectors]
    return dataset.iloc[nearest].copy()
        
# Small hand-made data set to try the neighbour search on.
dataset = pd.DataFrame({
    'a': [1, 2, 3, 4, 5],
    'b': [1, 2, 3, 4, 5],
    'Survived': [1, 1, 0, 0, 0],
})
vector = pd.Series({'a': 2.5, 'b': 2.5})

# should be (2,2) and (3,3) (if keeping track of duplicates)
get_nearest_neighbor(vector, dataset)

Unnamed: 0,a,b,Survived
1,2,2,1


In [22]:
def predict(vector, dataset, number_of_neighbors=1, y='Survived'):
    """Predict the label of `vector` from its nearest neighbours in `dataset`.

    Parameters
    ----------
    vector : pandas.Series
        Feature values of the row to classify.
    dataset : pandas.DataFrame
        Labelled rows; must contain the target column `y`.
    number_of_neighbors : int
        Number of neighbours whose labels are averaged.
    y : str
        Name of the target column.

    Returns
    -------
    The mean of the neighbours' `y` values, rounded with the built-in
    `round` (so a mean of exactly 0.5 rounds to 0).
    """
    # The target column must never take part in the distance computation, so
    # it is passed explicitly instead of relying on the callee's default.
    neighbors = get_nearest_neighbor(vector, dataset, number_of_neighbors, ignore_cols=[y])
    return round(neighbors[y].mean())

# Try the classifier on two query points of the toy data set.
for query in (vector, pd.Series({'a': 4.5, 'b': 4.5})):
    print(predict(query, dataset))

1
0


In [24]:
# Make predictions
def predict_testset(test_dataset, train_dataset, number_of_neighbors=1):
    """Return a copy of `test_dataset` with a predicted 'Survived' column.

    Every test row is classified with `predict` against the training rows,
    using the module-level `selected` feature list plus 'Survived' as the
    training view. The row label is printed whenever it is a multiple of 100
    as a simple progress indicator.
    """
    needed_columns = selected + ['Survived']

    def _classify(row):
        row_label = row.name
        if row_label % 100 == 0:
            print(row_label)
        outcome = predict(row, train_dataset[needed_columns], number_of_neighbors)
        return int(outcome)

    predictions = test_dataset.copy()
    feature_frame = predictions.loc[:, predictions.columns.isin(selected)]
    predictions['Survived'] = feature_frame.apply(_classify, axis=1)
    return predictions

In [27]:
# Classify the test rows with 10 neighbours and keep only the id and the
# predicted label.
final_test = predict_testset(test, train, number_of_neighbors=10)
result = final_test.loc[:, ['PassengerId', 'Survived']].copy()
result

0
100
200
300
400


Unnamed: 0,PassengerId,Survived
0,1227,0
1,1050,0
2,1128,0
3,1083,0
4,1158,0
...,...,...
413,1114,1
414,925,0
415,1136,0
416,1059,0
