In [1]:
import pandas
from IPython.display import display
import torch
import torch.nn as nn
import numpy


In [2]:
#원 핫 인코딩 함수
def one_hot_function(df : pandas.DataFrame, cols : list) :
    for c in cols :        
        onehot = pandas.get_dummies(df[c], c)
        df = df.join(onehot)
        df = df.drop(c, axis = 1)
    return df;

#이상치 제거 함수
def del_anomaly_func(df : pandas.DataFrame, cols : list, factor : float = 2) :
    for c in cols :
        min_lim = df[c].mean() - df[c].std() * factor
        max_lim = df[c].mean() + df[c].std() * factor
        df = df[(df[c] > min_lim) & (df[c] < max_lim)]
    return df;

#정규화 함수
def normalize_func(df : pandas.DataFrame, cols : list, origin_df : pandas.DataFrame = None) :
    if origin_df is None :
        for c in cols :
            df[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min())
    else :
        for c in cols :
            df[c] = (df[c] - origin_df[c].min()) / (origin_df[c].max() - origin_df[c].min())
    return df;
        
origin_df = pandas.read_csv("train.csv", encoding = "UTF-8", index_col= 0)
df = origin_df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]

#원 핫 인코딩
df = one_hot_function(df, ["Sex", "Pclass"])

#결측치 치환
df = df.fillna(df.mean())

#이상치 제거
df = del_anomaly_func(df, ["Age","Fare"], 2)

#정규화
df = normalize_func(df, ["Age","Fare"])

display(origin_df)
display(df)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0.349515,1,0,0.060417,False,True,False,False,True
2,1,0.660194,1,0,0.594027,True,False,True,False,False
3,1,0.427184,0,0,0.066042,True,False,False,False,True
4,1,0.601942,1,0,0.442500,True,False,True,False,False
5,0,0.601942,0,0,0.067083,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...
887,0,0.446602,0,0,0.108333,False,True,False,True,False
888,1,0.291262,0,0,0.250000,True,False,True,False,False
889,0,0.499012,1,2,0.195417,True,False,False,False,True
890,1,0.427184,0,0,0.250000,False,True,True,False,False


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

x = df.iloc[:,1:]
t = df["Survived"]

tensor_x = torch.tensor(x.values.tolist(), dtype = torch.float, device = device)
tensor_t = torch.tensor(t.values.tolist(), dtype = torch.long, device = device)

input_len = tensor_x.shape[-1]
hidden_len = [8, 4]
output_len = 2

F = nn.Sequential(
    nn.Linear(input_len, hidden_len[0], device = device),
    nn.Sigmoid(),
    nn.Linear(hidden_len[0], hidden_len[1], device = device),
    nn.Sigmoid(),
    nn.Linear(hidden_len[1], output_len, device = device),
    nn.Sigmoid(),
)
loss_func = nn.CrossEntropyLoss();
optimizer = torch.optim.SGD(F.parameters(), lr = 0.05)
epoch = 200

for i in range(epoch) :
    loss_sum = 0
    for b in range(tensor_x.shape[0]) :
        tensor_y = F(tensor_x[b])

        loss = loss_func(tensor_y, tensor_t[b])
        loss_sum += loss
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    if (i+1) % 10 == 0 :
        print("epoch {} - loss {}".format(i+1, loss_sum / tensor_x.shape[0]))

epoch 10 - loss 0.6511396169662476
epoch 20 - loss 0.5369836091995239
epoch 30 - loss 0.49883174896240234
epoch 40 - loss 0.4935159981250763
epoch 50 - loss 0.4913647174835205
epoch 60 - loss 0.49026113748550415
epoch 70 - loss 0.48947691917419434
epoch 80 - loss 0.488834023475647
epoch 90 - loss 0.48829150199890137
epoch 100 - loss 0.4877815246582031
epoch 110 - loss 0.48724713921546936
epoch 120 - loss 0.48668891191482544
epoch 130 - loss 0.48615825176239014
epoch 140 - loss 0.4856939911842346
epoch 150 - loss 0.4852961301803589
epoch 160 - loss 0.48494860529899597
epoch 170 - loss 0.48463305830955505
epoch 180 - loss 0.48433271050453186
epoch 190 - loss 0.4840329587459564
epoch 200 - loss 0.4837302267551422


In [4]:
validation1 = pandas.read_csv("test.csv", encoding = "UTF-8", index_col= 0)
validation2 = pandas.read_csv("gender_submission.csv", encoding = "UTF-8", index_col = 0)

validation = validation2.join(validation1)
validation = validation[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]

#원 핫 인코딩
validation = one_hot_function(validation, ["Sex", "Pclass"])

#결측지 치환
validation = validation.fillna(validation.mean())

#정규화
validation = normalize_func(validation, ["Age", "Fare"], origin_df)

display(validation)

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,0,0.428248,0,0,0.015282,False,True,False,False,True
893,1,0.585323,1,0,0.013663,True,False,False,False,True
894,0,0.773813,0,0,0.018909,False,True,False,True,False
895,0,0.334004,0,0,0.016908,False,True,False,False,True
896,1,0.271174,1,1,0.023984,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
1305,0,0.375127,0,0,0.015713,False,True,False,False,True
1306,1,0.484795,0,0,0.212559,True,False,True,False,False
1307,0,0.478512,0,0,0.014151,False,True,False,False,True
1308,0,0.375127,0,0,0.015713,False,True,False,False,True


In [5]:
x = validation.iloc[:,1:]
t = validation["Survived"]

tensor_x = torch.tensor(x.values.tolist(), dtype = torch.float, device = device)
tensor_t = torch.tensor(t.values.tolist(), dtype = torch.long, device = device)


cnt_correct = 0
errors = []
for i in range(len(tensor_x)) :
    tensor_y = F(tensor_x[i])
    output = torch.argmax(tensor_y)
    answer = tensor_t[i]
    if output == answer :
        cnt_correct += 1
    else :
        errors.append(i+892)

print(cnt_correct / len(tensor_x))
print(errors)

0.9569377990430622
[893, 915, 924, 925, 956, 1019, 1024, 1032, 1034, 1057, 1080, 1088, 1106, 1198, 1201, 1257, 1268, 1295]


In [6]:
torch.save(F,"titanic.pt")