In [1]:
# data analysis and wrangling
import numpy as np
import pandas as pd
import json

# data standardization
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# deep learning
from keras import models
from keras import layers
from tensorflow.keras.utils import to_categorical

In [2]:
train_df = pd.read_csv("preprocessed_train.csv", encoding='euc-kr')

In [3]:
train_df

Unnamed: 0,분석데이터,label,numstrings,avlength,printables,entropy,paths,urls,registry,MZ,...,dist_86,dist_87,dist_88,dist_89,dist_90,dist_91,dist_92,dist_93,dist_94,dist_95
0,1,1,144,12.298611,1771,5.356616,0,0,0,1,...,10,4,10,9,4,0,1,0,0,0
1,2,1,804,9.580846,7703,6.063542,0,0,0,6,...,43,121,84,78,47,36,40,45,27,36
2,3,0,2205,12.736054,28083,6.107050,9,0,0,6,...,326,268,239,286,199,148,154,37,48,36
3,4,0,2602,10.288240,26770,5.373013,8,0,0,1,...,336,230,206,245,76,0,26,702,1,5
4,5,1,8980,23.252339,208806,5.775223,0,28,16,3,...,731,882,1171,1010,322,64,327,84,75,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,1,2018,13.938057,28127,5.940442,0,70,0,11,...,246,186,206,235,88,33,81,58,61,72
9996,9997,0,1105,16.437104,18163,5.766962,0,11,0,3,...,199,57,134,123,20,25,28,25,41,13
9997,9998,0,4,58.500000,234,3.811827,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9998,9999,1,3312,24.939312,82599,5.834730,0,39,0,8,...,438,985,806,851,113,123,181,100,75,86


In [4]:
train_df = train_df.drop(columns=['분석데이터'], axis=1)

In [5]:
train_images = train_df[:8000]
train_labels = train_images['label']

test_images = train_df[8000:]
test_labels = test_images['label']


train_images = train_images.drop(columns=['label'], axis=1)
test_images = test_images.drop(columns=['label'], axis=1)

In [6]:
train_images.shape

(8000, 616)

In [7]:
test_images.shape

(2000, 616)

In [8]:
train_labels.shape

(8000,)

In [9]:
test_labels.shape

(2000,)

## 데이터 정형화

In [10]:
train_images.max()

numstrings    3.786600e+05
avlength      5.447628e+04
printables    2.179051e+07
entropy       6.584870e+00
paths         9.897000e+03
                  ...     
dist_91       8.483400e+04
dist_92       3.760200e+04
dist_93       5.148000e+04
dist_94       1.326000e+04
dist_95       1.492800e+04
Length: 616, dtype: float64

In [11]:
train_images.columns

Index(['numstrings', 'avlength', 'printables', 'entropy', 'paths', 'urls',
       'registry', 'MZ', 'a_0', 'a_1',
       ...
       'dist_86', 'dist_87', 'dist_88', 'dist_89', 'dist_90', 'dist_91',
       'dist_92', 'dist_93', 'dist_94', 'dist_95'],
      dtype='object', length=616)

In [12]:
scaler = MinMaxScaler()
scaler.fit(train_images)

MinMaxScaler()

In [13]:
train_images_scaled = scaler.transform(train_images)

In [14]:
train_images_scaled

array([[3.75008583e-04, 1.30087248e-04, 7.92089531e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.11800622e-03, 8.01935048e-05, 3.51438101e-04, ...,
        8.74125874e-04, 2.03619910e-03, 2.41157556e-03],
       [5.81791485e-03, 1.38117995e-04, 1.28670952e-03, ...,
        7.18725719e-04, 3.61990950e-03, 2.41157556e-03],
       ...,
       [1.38726767e-02, 2.55992929e-05, 1.59129227e-03, ...,
        3.49650350e-03, 1.36500754e-02, 1.29287245e-02],
       [3.06767584e-02, 1.23059848e-05, 3.13453599e-03, ...,
        1.16938617e-02, 4.71342383e-02, 4.00589496e-02],
       [1.62151599e-03, 1.80783041e-04, 4.23671527e-04, ...,
        2.52525253e-04, 1.28205128e-03, 1.27277599e-03]])

In [15]:
scaler = MinMaxScaler()
scaler.fit(train_images)

MinMaxScaler()

In [16]:
train_images_scaled.shape

(8000, 616)

In [17]:
scaler = MinMaxScaler()
scaler.fit(test_images)

MinMaxScaler()

In [18]:
test_images_scaled = scaler.transform(test_images)

In [19]:
test_images_scaled

array([[8.38151115e-03, 3.36092560e-03, 2.21938781e-02, ...,
        7.09829267e-04, 1.75775661e-03, 1.19960829e-02],
       [2.00965779e-03, 3.30227028e-03, 5.25745958e-03, ...,
        1.68117458e-04, 4.49658668e-04, 6.52848050e-04],
       [3.97978913e-02, 1.25338195e-04, 4.09693693e-02, ...,
        1.21044570e-02, 3.87932796e-02, 2.87661172e-02],
       ...,
       [4.76222224e-06, 1.88741866e-02, 8.00886484e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.88147781e-03, 6.93749339e-03, 3.49822131e-02, ...,
        1.86797176e-03, 3.06585456e-03, 3.50905827e-03],
       [1.55486556e-03, 2.28950120e-04, 1.66787788e-03, ...,
        4.29633504e-04, 1.06282958e-03, 1.30569610e-03]])

In [20]:
test_images_scaled.shape

(2000, 616)

### resize array for image

In [21]:
x = train_images_scaled

x1 = np.zeros((8000,22,28))

for i in range(len(train_images_scaled)):
    x1[i] = x[i].reshape((22,28))
            
train_images_pre = x1

In [22]:
train_images_pre.shape

(8000, 22, 28)

In [23]:
x = test_images_scaled

x1 = np.zeros((2000,22,28))

for i in range(len(test_images_scaled)):
    x1[i] = x[i].reshape((22,28))
            
test_images_scaled_pre = x1

In [24]:
test_images_scaled_pre.shape

(2000, 22, 28)

## deep learning layer

In [26]:
network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(22 * 28,)))
network.add(layers.Dense(2, activation='softmax'))

In [27]:
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [28]:
train_images = train_images_pre.reshape((8000, 22 * 28))
train_images = train_images.astype('float32') / 255

test_images = test_images_scaled_pre.reshape((2000, 22 * 28))
test_images = test_images.astype('float32') / 255

In [29]:
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)

In [31]:
network.fit(train_images, train_labels, epochs=5, batch_size=128)

Epoch 1/5


ValueError: in user code:

    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 809, in train_step
        loss = self.compiled_loss(
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\losses.py", line 1664, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\jesung\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\backend.py", line 4994, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 2) and (None, 10) are incompatible
