In [3]:
import pandas as pd
import numpy as np
#import gc

import mlcrate as mlc
import pickle as pkl
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, Dense, Flatten, Concatenate, 
                                     Dot, Reshape, Add, Subtract, BatchNormalization)
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import plot_model

2024-03-03 19:44:37.933563: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
data_path = "/Users/yu.zadorozhnaya/Machine-Learning-and-Predictive-Analytics-MSU-/machine_learning_08_data/" 

In [5]:
train = pd.read_csv(data_path + 'train_sample.csv')
test = pd.read_csv(data_path + 'test_sample.csv')

In [6]:
features = ['feature_0', 'feature_1', 'feature_2']
f_size  = [int(train[f].max()) + 1 for f in features]
X_train = [train[f].values for f in features]
X_test = [test[f].values for f in features]
Y_train = [train['target'].values]

In [7]:
n_epochs = 300
batch_size = 128

In [8]:
k_latent = 2
embedding_reg = 0.0002 
kernel_reg = 0.1 

In [9]:
out_dim=2
def get_embed(x_input, x_size, out_dim,test_weights=False):
    # x_input is index of input (either user or item)
    # x_size is length of vocabulary (e.g. total number of users or items)
    # test_weights is a demo flag to show results with predefined weights
    # out_dim is size of embedding vectors
    if x_size > 0: #category
        if test_weights & (out_dim<=2):
            embed = Embedding(x_size, out_dim, input_length=1,
                          weights=[weight_matrix[:x_size,:out_dim]], 
                          embeddings_regularizer=l2(embedding_reg))(x_input)
        else:
            embed = Embedding(x_size, out_dim, input_length=1,
                              embeddings_regularizer=l2(embedding_reg))(x_input)
        embed = Flatten()(embed)
    else:
        embed = Dense(out_dim, kernel_regularizer=l2(embedding_reg))(x_input)
    return embed

In [10]:
def build_model(f_size):
    dim_input = len(f_size)
    
    input_x = [Input(shape=(1,)) for i in range(dim_input)] 
    
    lin_terms = [get_embed(x, size, 1) for (x, size) in zip(input_x, f_size)]

    factors = [get_embed(x, size, k_latent) for (x, size) in zip(input_x, f_size)]
     
    s = Add()(factors)
    
    diffs = [Subtract()([s, x]) for x in factors]
    
    dots = [Dot(axes=1)([d, x]) for d,x in zip(diffs, factors)]
    
    x = Concatenate()(lin_terms + dots)
    x = BatchNormalization()(x)
    output = Dense(1, activation='relu', kernel_regularizer=l2(kernel_reg))(x)
    model = Model(inputs=input_x, outputs=[output])
    model.compile(optimizer=Adam(clipnorm=0.25,learning_rate=0.001), 
                  loss='mean_squared_error')
    return model

In [11]:
model = build_model(f_size)

In [12]:
earlystopper = EarlyStopping(patience=2, verbose=1)
t = mlc.time.Timer()

In [13]:
model.fit(X_train,  Y_train, 
          epochs=n_epochs, batch_size=batch_size, verbose=0, shuffle=True,
          callbacks=[earlystopper])



<keras.callbacks.History at 0x149f8bee0>

In [14]:
predictions = model.predict(X_train)
#RMSE = sklearn.metrics.mean_squared_error(Y_train, predictions)
predictions_ = [[i[0] for i in predictions]]
RMSE = sklearn.metrics.mean_squared_error(Y_train, predictions_)
RMSE



0.053834440300800125

In [15]:
predictions = model.predict(X_test)
predictions



array([[1.58290970e+00],
       [0.00000000e+00],
       [2.57630682e+00],
       [1.47040212e+00],
       [1.27052021e+00],
       [2.31077567e-01],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [9.72022593e-01],
       [1.39191067e+00],
       [1.05047715e+00],
       [8.76273453e-01],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [4.31310272e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [4.72260904e+00],
       [1.45628154e+00],
       [0.00000000e+00],
       [3.54911065e+00],
       [7.07393467e-01],
       [1.69432151e+00],
       [5.40356970e+00],
       [0.00000000e+00],
       [7.18307048e-02],
       [0.00000000e+00],
       [2.37903810e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [2.07286096e+00],
       [1.36240256e+00],
       [4.01384771e-01],
       [0.00000000e+00],
       [1.96696126e+00],


In [16]:
sub = pd.read_csv(data_path + 'test_sample.csv',usecols=['ID'])
sub['target'] = predictions
sub.to_csv('submission.csv',index=False)