In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

class dataloader:
    """Build train/test feature matrices from ``ratings.csv`` and ``movies.csv``.

    Both files are read from ``datapath``. Ratings of movies that have fewer
    than ``threshold`` ratings are discarded, the remaining rows are shuffled,
    and each rating becomes one feature row made of:

    * one-hot user id columns (prefix ``user_``),
    * one-hot movie id columns (prefix ``movie_``),
    * multi-hot genre columns (``genres`` split on ``|``),
    * ``year_level``: index of the 20-year bucket (1900-2019) of the release
      year parsed from the title; titles without ``(YYYY)`` are treated as 1980.

    The target is 1 when ``rating >= 4`` and 0 otherwise.

    Side effect: the shuffled, filtered ratings are written to
    ``shuffled_df.csv`` in the current working directory.

    Args:
        datapath: Directory containing ``ratings.csv`` and ``movies.csv``.
        threshold: Minimum number of ratings a movie needs to be kept.
        test_size: Passed to ``train_test_split``.
        random_state: Optional seed for both the shuffle and the split. When
            ``None`` the global NumPy random state is used for the shuffle
            and the split is unseeded.

    Attributes:
        X_train, X_test: Feature DataFrames.
        y_train, y_test: Binary target Series aligned with the features.

    Raises:
        KeyError: If a rated ``movieId`` has no usable row in ``movies.csv``.
    """

    def __init__(self, datapath, threshold=10, test_size=0.1, random_state=None):
        self.datapath = datapath

        ratings_df = pd.read_csv(os.path.join(self.datapath, "ratings.csv"), encoding='utf-8')
        ratings_df = ratings_df.drop('timestamp', axis=1)

        movies_df = pd.read_csv(os.path.join(self.datapath, "movies.csv"), encoding='utf-8')
        movies_df = movies_df.set_index("movieId")
        dummy_genre_df = movies_df['genres'].str.get_dummies(sep='|')

        # Raw string so that "\(" and "\d" reach the regex engine untouched
        # (they are invalid escape sequences in a plain string literal).
        year = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
        year = year.fillna('1980').astype('int32')
        # Movies whose parsed year is 0 are excluded from the lookup table.
        year = year[year != 0]
        # Half-open 20-year buckets [1900, 1920), ..., [2000, 2020); years
        # outside that range get NaN.
        bins = list(range(1900, 2021, 20))
        year_level = pd.cut(year, bins, right=False, labels=False)

        # Keep only ratings of movies with at least `threshold` ratings.
        over_threshold = ratings_df.groupby('movieId').size() >= threshold
        keep_mask = ratings_df['movieId'].map(over_threshold).astype(bool)
        ratings_df = ratings_df[keep_mask]

        if random_state is None:
            random_idx = np.random.permutation(len(ratings_df))
        else:
            random_idx = np.random.RandomState(random_state).permutation(len(ratings_df))
        shuffled_df = ratings_df.iloc[random_idx]
        shuffled_df.to_csv('shuffled_df.csv', index=False)

        # One vectorised lookup per feature table instead of a Python-level
        # .loc call per rating; the results are re-labelled with the rating
        # index so that pd.concat aligns them row by row.
        movie_ids = shuffled_df['movieId'].to_numpy()
        genre_features = dummy_genre_df.loc[movie_ids]
        genre_features.index = shuffled_df.index
        year_feature = year_level.loc[movie_ids].rename('year_level')
        year_feature.index = shuffled_df.index

        concat_df = pd.concat([
            pd.get_dummies(shuffled_df['userId'], prefix="user"),
            pd.get_dummies(shuffled_df['movieId'], prefix="movie"),
            genre_features,
            year_feature,
        ], axis=1)

        # Binary target: ratings of 4 and above count as positive.
        target_df = (shuffled_df['rating'] >= 4).astype('int64')

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            concat_df, target_df, test_size=test_size, random_state=random_state)

if __name__ == "__main__":
    # Print the banner once; wrapping print() in another print() would also
    # emit the inner call's return value ("None") on a second line.
    print("---dataloader---")

In [None]:
import tensorflow as tf
import argparse
from tensorflow import keras


def parse_args(argv=None):
    """Parse the training options.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            ``sys.argv[1:]`` is parsed, exactly as before. Passing an explicit
            list (e.g. ``[]``) lets the function be used where ``sys.argv``
            does not hold these options.

    Returns:
        argparse.Namespace with ``path``, ``dataset``, ``num_factors``,
        ``epochs``, ``batch_size``, ``lr`` and ``learner``.
    """
    parser = argparse.ArgumentParser(description="NeuralMF.")
    parser.add_argument('--path', nargs='?', default='/dataset/',
                        help='Input data path.')
    parser.add_argument('--dataset', nargs='?', default='ratings.csv',
                        help='Choose a dataset.')
    parser.add_argument('--num_factors', type=int, default=8,
                        help='latent feature of FM model.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of epochs.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size.')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='Learning rate.')
    parser.add_argument('--learner', nargs='?', default='adam',
                        help='Specify an optimizer: adagrad, adam, rmsprop, sgd')
    return parser.parse_args(argv)


class FM(keras.Model):
    """Second-order factorization machine with a sigmoid output.

    Trainable parameters:
        w_0: bias, shape ``[1]``.
        w:   linear weights, shape ``[n_features]``.
        v:   factor matrix, shape ``[n_features, n_factor]``.

    Args:
        n_factor: Number of latent factors per feature.
        n_features: Number of input features (width of the input matrix).
            When omitted, the module-level variable ``p`` is used, which is
            the behaviour this class had before the parameter existed; a
            ``NameError`` is raised if ``p`` is not defined either.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, n_factor=8, n_features=None, **kwargs):
        super().__init__(**kwargs)

        if n_features is None:
            # Legacy fallback: read the feature count from the global `p`
            # set by the training script.
            n_features = p

        self.w_0 = tf.Variable([0.0])
        self.w = tf.Variable(tf.zeros(shape=[n_features]))
        self.v = tf.Variable(tf.random.normal(shape=(n_features, n_factor)))

    def call(self, inputs):
        """Return sigmoid(w_0 + linear term + pairwise term) per input row."""
        # Linear term: sum_i w_i * x_i for every row.
        degree_1 = tf.reduce_sum(tf.multiply(self.w, inputs), axis=1)

        # Pairwise term: 0.5 * sum_f [ (x . v_f)^2 - (x^2 . v_f^2) ],
        # summed over the factor axis.
        degree_2 = 0.5 * tf.reduce_sum(
            tf.math.pow(tf.matmul(inputs, self.v), 2)
            - tf.matmul(tf.math.pow(inputs, 2), tf.math.pow(self.v, 2)),
            1,
            keepdims=False,
        )

        predict = tf.math.sigmoid(self.w_0 + degree_1 + degree_2)

        return predict

def print_status_bar(iteration, total, loss, metrics = None):
    """Rewrite the current console line with progress and metric values.

    Args:
        iteration: Progress counter shown before the slash.
        total: Value shown after the slash; once ``iteration`` is no longer
            below it, the line is terminated with a newline.
        loss: Object exposing ``name`` and ``result()``; reported first.
        metrics: Optional list of further objects with the same interface.
    """
    tracked = [loss] + (metrics if metrics else [])

    parts = []
    for tracker in tracked:
        parts.append("{}: {}".format(tracker.name, format(tracker.result(), ".4f")))
    summary = " - ".join(parts)

    # Stay on the same line until the final update.
    if iteration < total:
        terminator = ""
    else:
        terminator = "\n"

    print(f"\r{iteration}/{total}  " + summary, end=terminator)

if __name__ == "__main__":
    # Training script: load the data, build the FM model and run a manual
    # training loop with per-epoch evaluation on the held-out split.

    # Command-line parsing is disabled; the hyper-parameters are hard-coded
    # below instead.
    # args = parse_args()
    # print(args)
    # num_factors = args.num_factors
    # learner = args.learner
    # learning_rate = args.lr
    # epochs = args.epochs
    # batch_size = args.batch_size
    num_factors = 8
    learner = 'adam'
    learning_rate = 0.01
    epochs = 10
    batch_size = 32

    loader = dataloader('dataset')
    X_train = loader.X_train
    y_train = loader.y_train
    X_test = loader.X_test
    y_test = loader.y_test

    n = X_train.shape[0]
    # `p` (number of feature columns) is read by FM.__init__ as a global.
    p = X_train.shape[1]

    X_train = X_train.astype(np.float32)
    y_train = y_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_test = y_test.astype(np.float32)

    n_steps = len(X_train) // batch_size

    # `learning_rate` is the supported keyword for Keras optimizers; the old
    # `lr` alias is deprecated.
    if learner.lower() == "adagrad":
        optimizer = keras.optimizers.Adagrad(learning_rate=learning_rate)
    elif learner.lower() == "rmsprop":
        optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif learner.lower() == "adam":
        optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)
    else:
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

    loss_fn = keras.losses.binary_crossentropy
    mean_loss = keras.metrics.Mean()
    metrics = [keras.metrics.BinaryAccuracy()]
    test_acc = keras.metrics.BinaryAccuracy()

    model = FM(n_factor=num_factors)

    train_data = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_train, tf.float32), tf.cast(y_train, tf.float32))).shuffle(500).batch(batch_size)
    test_data = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_test, tf.float32), tf.cast(y_test, tf.float32))).shuffle(200).batch(batch_size)

    for epoch in range(epochs):
        print(f"에포크 : {epoch}/{epochs}")

        for step, (X_batch, y_batch) in enumerate(train_data):
            # Forward pass and loss are recorded on the tape so the gradients
            # of the model variables can be taken afterwards.
            with tf.GradientTape() as tape:
                predict = model(X_batch)
                loss = loss_fn(y_batch, predict)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            mean_loss(loss)

            for metric in metrics:
                metric(y_batch, predict)

            print_status_bar(step * batch_size, len(y_train), mean_loss, metrics=metrics)

        # Evaluate on the held-out split. The label batch gets its own name
        # so the module-level `y_test` is not overwritten; `x_test` is left
        # bound to the last batch after the loop, which later cells read.
        for x_test, y_test_batch in test_data:
            prediction = model(x_test)
            test_acc.update_state(y_test_batch, prediction)

        print_status_bar(n_steps * batch_size, n_steps * batch_size, mean_loss, metrics=metrics)
        print("검증 정확도: ", test_acc.result().numpy())
        # Start every epoch with fresh running statistics.
        for metric in [mean_loss] + [test_acc] + metrics:
            metric.reset_state()

In [None]:
# Score one batch with the trained model and show the raw output tensor.
# NOTE: `x_test` is the loop variable left over from the evaluation loop in
# the training cell, so this is only the last test batch that loop yielded,
# not the whole test set. The training cell must have been run first.
prediction = model(x_test)
print(prediction)

In [None]:
# Print every value of `prediction` rounded to one decimal place.
for score in prediction.numpy():
    print(round(score, 1))