In [4]:
# import sys
# !{sys.executable} -m pip install pandas
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Activation

In [64]:
def normalize(matrix, multiplier):
    """
    Normalize every column of the input matrix to a range between 0 and `multiplier`.

    Each column is min-max scaled independently. The input is never modified:
    the result is a new floating-point array, so integer (or boolean) inputs
    are not truncated back to whole numbers.

    Args:
    - matrix: a NumPy array (or array-like), expected to be 2-D with one
      feature per column
    - multiplier: the maximum value of each normalized column

    Returns:
    - The normalized matrix as a new NumPy array. A column whose values are
      all equal maps to 0. A zero-size input is returned as an (empty) copy.
    """
    matrix = np.asarray(matrix)

    # Keep an existing float dtype; promote everything else (ints, bools,
    # object arrays holding numbers) to float64 so scaled values keep their
    # fractional part.
    if np.issubdtype(matrix.dtype, np.floating):
        float_dtype = matrix.dtype
    else:
        float_dtype = np.float64

    # astype() copies by default, so the caller's array is left untouched.
    normalized = matrix.astype(float_dtype)

    # min()/max() raise on zero-size input; there is nothing to scale anyway.
    if normalized.size == 0:
        return normalized

    column_min = normalized.min(axis=0)
    column_range = normalized.max(axis=0) - column_min

    # A constant column has zero range; divide by 1 instead so it maps to 0
    # rather than producing a division by zero.
    column_range = np.where(column_range == 0, 1, column_range)

    return (normalized - column_min) / column_range * multiplier

In [65]:
# Parse crx.data into one list of values per attribute, fill in missing
# fields, one-hot encode the categorical attributes, scale every column to
# [0, 1] and save the features and labels as CSV files.
# NOTE(review): the file name and the A1..A15 / '+','-' layout look like the
# UCI Credit Approval data set -- confirm the data source.

# One list per attribute column, in file order. Despite its name this dict
# collects the continuous columns as well as the categorical ones.
categorical_set = {'A1': [], 'A2': [], 'A3': [], 'A4': [], 'A5': [], 'A6': [], 'A7': [], 'A8': [], 'A9': [], 'A10': [], 'A11': [], 'A12': [], 'A13': [], 'A14': [], 'A15': []}
continuous = {'A2', 'A3', 'A8', 'A11', 'A14', 'A15'}
categoricals = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

# The context manager closes the file on exit; no explicit close() is needed.
with open('crx.data', 'r') as data_file:
    lines = data_file.readlines()

labels = []
attributes = list(categorical_set.keys())
# Source of replacement values for '?' fields: the most recent record that had
# no missing fields. This hard-coded record is used until one has been read.
previous_line = ['a', '64.08', '0.165', 'u', 'g', 'ff', 'ff', '0', 't', 't', '01', 'f', 'g', '00232', '100']
for line in lines:
    # Strip only the newline, so a final line without a trailing newline
    # keeps its last character (the class label).
    stripped_line = line.rstrip('\n')
    if not stripped_line:
        # Tolerate blank lines (e.g. at the end of the file).
        continue
    record = stripped_line.split(',')

    # Number of '?' (missing) fields found in this record.
    no_question_marks = 0
    for i, key in enumerate(attributes):
        value = record[i]

        if value == '?':
            value = previous_line[i]
            no_question_marks += 1

        # NOTE(review): continuous values are stored negated. After the
        # per-column min-max scaling below this only flips the direction of
        # each feature -- confirm that the negation is intentional.
        value = -np.float32(value) if key in continuous else value

        categorical_set[key].append(value)

    # The last field of a record is the class label: '+' -> 1, anything else -> 0.
    labels.append(1 if record[-1] == '+' else 0)
    # Only complete records may serve as the source of replacement values.
    previous_line = record if no_question_marks == 0 else previous_line

data = pd.DataFrame(categorical_set)
# `columns` must be passed by keyword: the second positional parameter of
# get_dummies is `prefix`. A numeric dtype is requested because newer pandas
# versions default to bool indicator columns, which would turn the combined
# to_numpy() result into an object array.
encoded_data = pd.get_dummies(data, columns=categoricals, dtype=np.float32)

encoded_columns = encoded_data.columns

# normalize() works on a plain array, so the column labels are re-attached afterwards.
encoded_data = pd.DataFrame(normalize(encoded_data.to_numpy(), 1))
encoded_data.columns = encoded_columns


labels = pd.DataFrame(labels)


encoded_data.to_csv('samples.csv', index=False)
labels.to_csv('labels.csv', index=False)

In [66]:
# Reload the preprocessed features and labels written by the preprocessing cell.
encoded_data = pd.read_csv('samples.csv')
labels = pd.read_csv('labels.csv')

# Seed every random number generator that can influence training. Seeding
# Python's `random` alone leaves the NumPy and TensorFlow generators
# unseeded; TensorFlow's documented recipe for repeatable Keras runs seeds
# all three.
SEED = 150
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [67]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    encoded_data,
    labels,
    test_size=0.2,
    random_state=2164,
)

In [74]:
# Fully connected binary classifier: four ReLU hidden layers of decreasing
# width followed by a single sigmoid output unit.
hidden_units = (1024, 256, 128, 32)
model = Sequential(
    [Dense(units, activation='relu') for units in hidden_units]
    + [Dense(1, activation='sigmoid')]
)

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 7 passes over the training split in mini-batches of 60 rows.
model.fit(
    x_train,
    y_train,
    epochs=7,
    batch_size=60,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x21c12fb1ae0>

In [75]:
# Score the trained model on the held-out split. With the loss and the
# single 'accuracy' metric configured at compile time, the result is
# [loss, accuracy].
results = model.evaluate(
    x=x_test,
    y=y_test,
    verbose=0,
)
print('test loss, test acc:', results)

test loss, test acc: [0.42948025465011597, 0.8550724387168884]
