In [6]:
import pandas
from IPython.display import display
import torch
import torch.nn as nn
import numpy


In [7]:
#원 핫 인코딩 함수
def one_hot_function(df : pandas.DataFrame, cols : list) :
    for c in cols :        
        onehot = pandas.get_dummies(df[c], c)
        df = df.join(onehot)
        df = df.drop(c, axis = 1)
    return df;

#이상치 제거 함수
def del_anomaly_func(df : pandas.DataFrame, cols : list, factor : float = 2) :
    for c in cols :
        min_lim = df[c].mean() - df[c].std() * factor
        max_lim = df[c].mean() + df[c].std() * factor
        df = df[(df[c] > min_lim) & (df[c] < max_lim)]
    return df;

#정규화 함수
def normalize_func(df : pandas.DataFrame, cols : list, origin_df : pandas.DataFrame = None) :
    if origin_df is None :
        for c in cols :
            df[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min())
    else :
        for c in cols :
            df[c] = (df[c] - origin_df[c].min()) / (origin_df[c].max() - origin_df[c].min())
    return df;
        
origin_df = pandas.read_csv("train.csv", encoding = "UTF-8", index_col= 0)
df = origin_df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]

#원 핫 인코딩
df = one_hot_function(df, ["Sex", "Pclass"])

#결측치 치환
df = df.fillna(df.mean())

#이상치 제거
df = del_anomaly_func(df, ["Age","Fare"], 2)

#정규화
df = normalize_func(df, ["Age","Fare"])

display(origin_df)
display(df)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0.349515,1,0,0.060417,False,True,False,False,True
2,1,0.660194,1,0,0.594027,True,False,True,False,False
3,1,0.427184,0,0,0.066042,True,False,False,False,True
4,1,0.601942,1,0,0.442500,True,False,True,False,False
5,0,0.601942,0,0,0.067083,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...
887,0,0.446602,0,0,0.108333,False,True,False,True,False
888,1,0.291262,0,0,0.250000,True,False,True,False,False
889,0,0.499012,1,2,0.195417,True,False,False,False,True
890,1,0.427184,0,0,0.250000,False,True,True,False,False


In [9]:
x = df.iloc[:,1:]
t = df["Survived"]

tensor_x = torch.tensor(x.values.tolist(), dtype = torch.float)
tensor_t = torch.tensor(t.values.tolist(), dtype = torch.float)

input_len = len(tensor_x[0])
hidden_len = [8, 4]
output_len = 2

F = nn.Sequential(
    nn.Linear(input_len, hidden_len[0]),
    nn.Sigmoid(),
    nn.Linear(hidden_len[0], hidden_len[1]),
    nn.Sigmoid(),
    nn.Linear(hidden_len[1], output_len),
    nn.Sigmoid(),
    nn.LogSoftmax(dim = 0)
)
loss_func = nn.CrossEntropyLoss();
optimizer = torch.optim.SGD(F.parameters(), lr = 0.5)
epoch = 200

for i in range(epoch) :
    tot_loss = 0
    for j in range(len(tensor_x)) :
        tensor_y = F(tensor_x[j])
        optimizer.zero_grad()
        loss = loss_func(tensor_y, tensor_t[j].type(torch.long))
        loss.backward()
        optimizer.step()

        tot_loss += loss

    print("epoch {} - loss {}".format(i, tot_loss/len(tensor_x)))

epoch 0 - loss 0.6567715406417847
epoch 1 - loss 0.6284425854682922
epoch 2 - loss 0.5206999182701111
epoch 3 - loss 0.510297954082489
epoch 4 - loss 0.5097793340682983
epoch 5 - loss 0.5074911117553711
epoch 6 - loss 0.5033590793609619
epoch 7 - loss 0.5008605122566223
epoch 8 - loss 0.4996086359024048
epoch 9 - loss 0.4985460638999939
epoch 10 - loss 0.4975040853023529
epoch 11 - loss 0.4964461028575897
epoch 12 - loss 0.495372474193573
epoch 13 - loss 0.49432095885276794
epoch 14 - loss 0.4933411478996277
epoch 15 - loss 0.49245908856391907
epoch 16 - loss 0.49166053533554077
epoch 17 - loss 0.49093109369277954
epoch 18 - loss 0.4902629554271698
epoch 19 - loss 0.4896547198295593
epoch 20 - loss 0.4891040325164795
epoch 21 - loss 0.4886089861392975
epoch 22 - loss 0.48816147446632385
epoch 23 - loss 0.48775383830070496
epoch 24 - loss 0.4873802363872528
epoch 25 - loss 0.4870390295982361
epoch 26 - loss 0.4867287874221802
epoch 27 - loss 0.48644644021987915
epoch 28 - loss 0.4861877

In [3]:
F = torch.load("titanic.pt", weights_only= False)

In [10]:
validation1 = pandas.read_csv("test.csv", encoding = "UTF-8", index_col= 0)
validation2 = pandas.read_csv("gender_submission.csv", encoding = "UTF-8", index_col = 0)

validation = validation2.join(validation1)
validation = validation[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]

#원 핫 인코딩
validation = one_hot_function(validation, ["Sex", "Pclass"])

#결측지 치환
validation = validation.fillna(validation.mean())

#정규화
validation = normalize_func(validation, ["Age", "Fare"])

display(validation)

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,0,0.452723,0,0,0.015282,False,True,False,False,True
893,1,0.617566,1,0,0.013663,True,False,False,False,True
894,0,0.815377,0,0,0.018909,False,True,False,True,False
895,0,0.353818,0,0,0.016908,False,True,False,False,True
896,1,0.287881,1,1,0.023984,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
1305,0,0.396975,0,0,0.015713,False,True,False,False,True
1306,1,0.512066,0,0,0.212559,True,False,True,False,False
1307,0,0.505473,0,0,0.014151,False,True,False,False,True
1308,0,0.396975,0,0,0.015713,False,True,False,False,True


In [12]:
x = validation.iloc[:,1:]
t = validation["Survived"]

tensor_x = torch.tensor(x.values.tolist(), dtype = torch.float)
tensor_t = torch.tensor(t.values.tolist(), dtype = torch.float)

cnt_correct = 0
errors = []
for i in range(len(tensor_x)) :
    tensor_y = F(tensor_x[i])
    if (torch.eq(torch.argmax(tensor_y), tensor_t[i].type(torch.float))) :
        cnt_correct += 1
    else :
        errors.append(i+892)

print(cnt_correct / len(tensor_x))
print(errors)

0.937799043062201
[893, 915, 924, 925, 956, 981, 1019, 1024, 1032, 1057, 1073, 1080, 1086, 1088, 1106, 1109, 1134, 1141, 1165, 1185, 1198, 1200, 1201, 1251, 1268, 1299]


In [13]:
torch.save(F,"titanic.pt")