In [1]:
# data analysis and wrangling
import numpy as np
import pandas as pd
import json

# data standardization
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# deep learning
from keras import models
from keras import layers
from tensorflow.keras.utils import to_categorical

In [2]:
train_df = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train_df

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.107050,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.288240,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,1,2018,13.938057,28127,5.940442,0,70,0,11,...,246,186,206,235,88,33,81,58,61,72
9996,9997,0,1105,16.437104,18163,5.766962,0,11,0,3,...,199,57,134,123,20,25,28,25,41,13
9997,9998,0,4,58.500000,234,3.811827,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9998,9999,1,3312,24.939312,82599,5.834730,0,39,0,8,...,438,985,806,851,113,123,181,100,75,86


In [4]:
train_df = train_df.drop(columns=['분석데이터'], axis=1)

In [5]:
train_images = train_df[:9000]
train_labels = train_images['label']

test_images = train_df[9000:]
test_labels = test_images['label']


train_images = train_images.drop(columns=['label'], axis=1)
test_images = test_images.drop(columns=['label'], axis=1)

In [6]:
train_images.shape

(9000, 616)

In [7]:
test_images.shape

(1000, 616)

In [8]:
train_labels.shape

(9000,)

In [9]:
test_labels.shape

(1000,)

## 데이터 정형화

In [10]:
train_images.max()

numstrings    4.199740e+05
avlength      5.447628e+04
printables    2.179051e+07
entropy       6.584918e+00
paths         9.897000e+03
                  ...     
dist_91       8.483400e+04
dist_92       3.760200e+04
dist_93       5.353400e+04
dist_94       2.446300e+04
dist_95       2.450800e+04
Length: 616, dtype: float64

In [11]:
train_images.columns

Index(['numstrings', 'avlength', 'printables', 'entropy', 'paths', 'urls',
       'registry', 'MZ', 'a_0', 'a_1',
       ...
       'dist_86', 'dist_87', 'dist_88', 'dist_89', 'dist_90', 'dist_91',
       'dist_92', 'dist_93', 'dist_94', 'dist_95'],
      dtype='object', length=616)

In [12]:
# MinMax는 이상치에 취약하다

In [13]:
train_images = train_images.to_numpy()

In [14]:
scaler = RobustScaler()

In [15]:
scaler.fit(train_images)
train_images = scaler.transform(train_images)

In [16]:
scaler.fit(test_images)
test_images = scaler.transform(test_images)

In [17]:
train_images

array([[-3.21389709e-01, -2.12231511e-02, -2.68828645e-01, ...,
        -2.38841978e-01, -2.37623762e-01, -2.16814159e-01],
       [-2.18793720e-01, -2.81995880e-01, -1.96900113e-01, ...,
        -2.17129071e-02, -1.03960396e-01, -5.75221239e-02],
       [-1.01041505e-03,  2.07500408e-02,  5.02178045e-02, ...,
        -6.03136309e-02,  0.00000000e+00, -5.75221239e-02],
       ...,
       [ 7.46541272e-01,  1.44205242e-02,  7.87267001e-01, ...,
         9.91556092e-01,  1.89108911e+00,  7.25663717e-01],
       [-2.73511581e-01,  1.40170444e+00, -1.41619908e-01, ...,
        -2.09891435e-01, -1.63366337e-01, -1.99115044e-01],
       [-2.93719882e-01,  1.80467142e-01, -2.34076930e-01, ...,
        -2.14716526e-01, -1.98019802e-01, -1.90265487e-01]])

In [18]:
test_images

array([[-0.29223936, -0.13605312, -0.26185653, ..., -0.19379845,
        -0.16783217, -0.18114603],
       [-0.26230196,  0.03702695, -0.22914139, ..., -0.19379845,
        -0.17182817, -0.16266174],
       [ 0.04896303, -0.37241977, -0.03668537, ..., -0.13565891,
        -0.04795205, -0.07393715],
       ...,
       [-0.30888679,  4.27248369, -0.27422762, ..., -0.20930233,
        -0.1958042 , -0.18853974],
       [ 0.15388382,  1.14536208,  0.61801824, ...,  0.17829457,
         0.1038961 ,  0.12939002],
       [-0.21781555, -0.61211214, -0.23363701, ..., -0.12015504,
        -0.09190809, -0.0702403 ]])

### resize array for image

In [19]:
x = train_images

x1 = np.zeros((9000,22,28))

for i in range(len(train_images)):
    x1[i] = x[i].reshape((22,28))
            
train_images_pre = x1

In [20]:
train_images_pre.shape

(9000, 22, 28)

In [21]:
x = test_images

x1 = np.zeros((1000,22,28))

for i in range(len(test_images)):
    x1[i] = x[i].reshape((22,28))
            
test_images_scaled_pre = x1

In [22]:
test_images_scaled_pre.shape

(1000, 22, 28)

## deep learning layer

In [23]:
network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(22 * 28,)))
network.add(layers.Dense(2, activation='softmax'))

In [24]:
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [25]:
train_images = train_images_pre.reshape((9000, 22 * 28))
train_images = train_images.astype('float32') / 255

test_images = test_images_scaled_pre.reshape((1000, 22 * 28))
test_images = test_images.astype('float32') / 255

In [26]:
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

In [27]:
network.fit(train_images, train_labels, epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x14811b0a8e0>

In [28]:
test_loss, test_acc = network.evaluate(test_images, test_labels)



In [29]:
print('test_acc:', test_acc)

test_acc: 0.7400000095367432
