# Reading in data

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed


In [2]:
# Build the paths with os.path.join so the notebook also runs on
# macOS/Linux; hard-coded "\\" separators only resolve on Windows.
CURRENT_DIR = os.getcwd()
DATA_DIR = os.path.join(CURRENT_DIR, "data")
X_TRAIN_DIR = os.path.join(DATA_DIR, "data_X.csv")
Y_TRAIN_DIR = os.path.join(DATA_DIR, "data_Y.csv")
X_TEST_DIR = os.path.join(DATA_DIR, "X_for_grading.csv")

X_train = pd.read_csv(X_TRAIN_DIR)
# Collapse the target frame to a 1-D NumPy array. Series.ravel() is
# deprecated in recent pandas, so convert explicitly instead.
# Assumes data_Y.csv holds a single target column (the printed shape is (4000,)).
y_train = pd.read_csv(Y_TRAIN_DIR).squeeze("columns").to_numpy()
X_test = pd.read_csv(X_TEST_DIR)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(4000, 12)
(4000,)
(2000, 12)


In [3]:
def process_X(X):

    # Define preprocessing steps for numerical and categorical features
    numeric_features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X10', 'X11']
    numeric_transformer = StandardScaler()

    categorical_features = ['X7', 'X8', 'X9', 'X12']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Apply transformations to numerical features
    X_numeric = numeric_transformer.fit_transform(X[numeric_features])

    # Apply transformations to categorical features
    X_categorical = categorical_transformer.fit_transform(X[categorical_features])

    # Concatenate transformed numerical and categorical features
    X_processed = pd.concat([pd.DataFrame(X_numeric, columns=numeric_features),
                            pd.DataFrame(X_categorical.toarray(), columns=categorical_transformer.get_feature_names_out(categorical_features))],
                        axis=1)
    
    return X_processed

In [4]:
X_train_processed = process_X(X_train)
X_test_processed = process_X(X_test)

In [5]:
# Show the generated feature names (scaled numerics first, then the one-hot columns).
X_train_processed.columns

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X10', 'X11', 'X7_0', 'X7_1',
       'X8_0', 'X8_1', 'X9_0', 'X9_1', 'X12_A', 'X12_B', 'X12_C'],
      dtype='object')

In [7]:
# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable across runs.
SEED = 42
set_random_seed(SEED)

# Take the input width from the processed frame instead of hard-coding it,
# so the network stays in sync if the feature set changes.
n_features = X_train_processed.shape[1]

# Two hidden ReLU layers with light dropout, and a single linear output unit
# trained with mean squared error.
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=n_features))
model.add(Dropout(0.1))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation='linear'))
model.compile(optimizer="Adamax", loss='mean_squared_error')

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1152      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,377
Trainable params: 5,377
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train on the full processed training set; verbose=0 suppresses the
# per-epoch progress output.
training_options = dict(epochs=300, batch_size=32, verbose=0)
model.fit(X_train_processed, y_train, **training_options)

<keras.callbacks.History at 0x26032957010>

In [9]:
y_pred = model.predict(X_test_processed)

# A bare `y_pred.shape` in the middle of a cell is never displayed, so check
# the shape explicitly instead: one prediction per grading row, one output unit.
assert y_pred.shape == (len(X_test_processed), 1), y_pred.shape

# Flatten the (n, 1) prediction matrix into a single column for export.
y_pred_df = pd.DataFrame({'Y_predict': y_pred.flatten()})

y_pred_df.to_csv('y_for_grading.csv', index=False)

