# **Collaborative Flitering**



Import Library

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

LOAD DATASET

In [None]:
# URL tempat dataset CSV disimpan
url = 'https://raw.githubusercontent.com/kucingku-capstone/machine-learning/main/dataset/cats_dataset_updated.csv'

# Membaca dataset dari URL dan menyimpannya dalam DataFrame df
df = pd.read_csv(url)

# Menampilkan beberapa baris pertama dari DataFrame untuk memahami struktur datanya
print(df.head())

   Unnamed: 0    cat_id                                                url  \
0           0  46744842  https://www.petfinder.com/cat/arianna-46744842...   
1           1  46717321  https://www.petfinder.com/cat/ozzy-46717321/oh...   
2           2  46626338  https://www.petfinder.com/cat/zena-46626338/ca...   
3           3  46620170  https://www.petfinder.com/cat/mika-46620170/ca...   
4           4  46582751  https://www.petfinder.com/cat/ruby-46582751/fl...   

  type     age  gender         size   coat       breed  \
0  Cat   Adult  Female       Medium  Short  Abyssinian   
1  Cat   Adult    Male  Extra Large  Short  Abyssinian   
2  Cat  Senior  Female       Medium  Short  Abyssinian   
3  Cat    Baby  Female        Small  Short  Abyssinian   
4  Cat   Young  Female       Medium  Short  Abyssinian   

                                              photos  \
0  [{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...   
1  [{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...   
2  [{'small': 

Data preprocessing

In [None]:
# Menyimpan nama kolom yang akan dihapus dalam sebuah list
columns_to_drop = ['Unnamed: 0', 'type', 'url', 'photos', 'med_photos']

# Menghapus kolom-kolom yang ada dalam list dari DataFrame
df_cleaned = df.drop(columns=columns_to_drop, axis=1)

# Menampilkan beberapa baris pertama dari DataFrame setelah penghapusan kolom
print(df_cleaned.head())

     cat_id     age  gender         size   coat       breed  user_id  rating
0  46744842   Adult  Female       Medium  Short  Abyssinian    11355    1.74
1  46717321   Adult    Male  Extra Large  Short  Abyssinian     2127    1.88
2  46626338  Senior  Female       Medium  Short  Abyssinian    14219    2.12
3  46620170    Baby  Female        Small  Short  Abyssinian     8356    3.47
4  46582751   Young  Female       Medium  Short  Abyssinian     8029    3.34


In [None]:
# Menjalankan train-test split
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Menampilkan jumlah baris dalam data latih
jumlah_data_latih = train.shape[0]
print(f"Jumlah baris dalam data latih: {jumlah_data_latih}")

# Menampilkan jumlah baris dalam data uji
jumlah_data_uji = test.shape[0]
print(f"Jumlah baris dalam data uji: {jumlah_data_uji}")


Jumlah baris dalam data latih: 56116
Jumlah baris dalam data uji: 14029


Data preparation

In [None]:
from sklearn.preprocessing import OneHotEncoder
import scipy.sparse as sp

# Encoding categorical data
categorical_columns = ['age', 'gender', 'size', 'coat', 'breed']
encoder = OneHotEncoder(sparse=True)  # Using sparse matrix
encoded_cats = encoder.fit_transform(df_cleaned[categorical_columns])

# Dropping the categorical columns from the original DataFrame
df_reduced = df_cleaned.drop(categorical_columns, axis=1)

# Creating a mapping for user_id and cat_id to a continuous range starting from 0
user_id_mapping = {id: i for i, id in enumerate(np.unique(df_reduced['user_id']))}
cat_id_mapping = {id: i for i, id in enumerate(np.unique(df_reduced['cat_id']))}

# Transforming user_id and cat_id to the new indices
user_ids = df_reduced['user_id'].map(user_id_mapping).values
cat_ids = df_reduced['cat_id'].map(cat_id_mapping).values
ratings = df_reduced['rating'].values

# Creating the interaction matrix (this will be a sparse matrix)
interaction_matrix = sp.coo_matrix((ratings, (user_ids, cat_ids)),
                                   shape=(len(user_id_mapping), len(cat_id_mapping)))

# Displaying the shape of the interaction matrix
print(interaction_matrix.shape)




(15000, 70096)


In [None]:
def build_model():
    user_id_input = Input(shape=(1,), name='user_id')
    cat_id_input = Input(shape=(1,), name='cat_id')
    age_input = Input(shape=(1,), name='age')
    gender_input = Input(shape=(1,), name='gender')
    size_input = Input(shape=(1,), name='size')
    coat_input = Input(shape=(1,), name='coat')

    embedding_size = 50
    user_embedding = Embedding(input_dim=df['user_id'].nunique(), output_dim=embedding_size)(user_id_input)
    cat_embedding = Embedding(input_dim=df['cat_id'].nunique(), output_dim=embedding_size)(cat_id_input)
    age_embedding = Embedding(input_dim=df['age'].nunique(), output_dim=embedding_size)(age_input)
    gender_embedding = Embedding(input_dim=df['gender'].nunique(), output_dim=embedding_size)(gender_input)
    size_embedding = Embedding(input_dim=df['size'].nunique(), output_dim=embedding_size)(size_input)
    coat_embedding = Embedding(input_dim=df['coat'].nunique(), output_dim=embedding_size)(coat_input)

    user_flat = Flatten()(user_embedding)
    cat_flat = Flatten()(cat_embedding)
    age_flat = Flatten()(age_embedding)
    gender_flat = Flatten()(gender_embedding)
    size_flat = Flatten()(size_embedding)
    coat_flat = Flatten()(coat_embedding)

    concatenated = Concatenate()([user_flat, cat_flat, age_flat, gender_flat, size_flat, coat_flat])

    dense_1 = Dense(128, activation='relu')(concatenated)
    dense_2 = Dense(64, activation='relu')(dense_1)
    output = Dense(1)(dense_2)

    model = Model(inputs=[user_id_input, cat_id_input, age_input, gender_input, size_input, coat_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

# Membuat dan melatih model
model = build_model()
model.fit(
    [train['user_id'], train['cat_id'], train['age'], train['gender'], train['size'], train['coat']],
    train['cat_id'],
    epochs=10,
    batch_size=32,
    validation_data=(
        [test['user_id'], test['cat_id'], test['age'], test['gender'], test['size'], test['coat']],
        test['cat_id']
    )
)

# Membuat prediksi
predictions = model.predict(
    [test['user_id'], test['cat_id'], test['age'], test['gender'], test['size'], test['coat']]
)

Epoch 1/10


UnimplementedError: ignored

Punya taufik

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

# Step 1: Load dataset
df = pd.read_csv('https://raw.githubusercontent.com/kucingku-capstone/machine-learning/main/dataset/cats_dataset_updated.csv')

# Step 2: Data preprocessing
user_encoder = LabelEncoder()
cat_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'].astype(str))
df['cat_id'] = cat_encoder.fit_transform(df['cat_id'].astype(str))

# Step 3: Create user-item matrix
ratings_matrix = df.pivot_table(index='user_id', columns='cat_id', values='rating')

# Step 4: Fill NaN values with 0
ratings_matrix_filled = ratings_matrix.fillna(0)

# Step 5: Calculate cosine similarity using a sparse matrix
ratings_matrix_sparse = csr_matrix(ratings_matrix_filled.values)
user_similarity = cosine_similarity(ratings_matrix_sparse, dense_output=False)

# Step 6: Make recommendations based on similarity
def get_user_recommendations(user_index, similarity_matrix, matrix, k=5):
    user_sim_scores = similarity_matrix.getrow(user_index)

    # Sum all similarity scores and ratings, then normalize
    sum_sim_scores = user_sim_scores.sum()
    weighted_ratings = matrix.T.dot(user_sim_scores.T) / sum_sim_scores

    # Sort by rating and take the top k
    recommendations = weighted_ratings.T.sort_values(by=0, ascending=False).head(k)
    top_cat_indices = recommendations.index
    top_cat_ids = cat_encoder.inverse_transform(top_cat_indices)

    return top_cat_ids

# Example of using the recommendation function
user_index_example = 0
recommended_cats = get_user_recommendations(user_index_example, user_similarity, ratings_matrix_sparse, k=5)

print(recommended_cats)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Concatenate, Dense
from tensorflow.keras.optimizers import Adam

# Step 1: Load dataset
df = pd.read_csv('https://raw.githubusercontent.com/kucingku-capstone/machine-learning/main/dataset/cats_dataset_updated.csv')

# Step 2: Data preprocessing
user_encoder = LabelEncoder()
cat_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'].astype(str))
df['cat_id'] = cat_encoder.fit_transform(df['cat_id'].astype(str))

# Step 3: Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 4: Create the collaborative filtering model using TensorFlow
num_users = len(user_encoder.classes_)
num_cats = len(cat_encoder.classes_)
embedding_size = 50

# User embedding layer
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, input_length=1)(user_input)
user_embedding = Flatten()(user_embedding)

# Cat embedding layer
cat_input = Input(shape=(1,), name='cat_input')
cat_embedding = Embedding(input_dim=num_cats, output_dim=embedding_size, input_length=1)(cat_input)
cat_embedding = Flatten()(cat_embedding)

# Concatenate user and cat embeddings
concatenated = Concatenate()([user_embedding, cat_embedding])

# Add additional dense layers
dense_layer_1 = Dense(128, activation='relu')(concatenated)
dense_layer_2 = Dense(64, activation='relu')(dense_layer_1)

# Dot product of user and cat embeddings
dot_product = Dense(1, activation='linear')(dense_layer_2)

# Combine all layers into a model
model = Model(inputs=[user_input, cat_input], outputs=dot_product)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(
    x=[train_df['user_id'], train_df['cat_id']],
    y=train_df['rating'],
    epochs=15,
    batch_size=64,
    validation_split=0.2
)

# Step 5: Evaluate the model on the test set
test_loss = model.evaluate(x=[test_df['user_id'], test_df['cat_id']], y=test_df['rating'])
print(f'Test Loss: {test_loss}')

# Rest of the code for making predictions...
# Step 6: Make predictions for a specific user
user_index_example = 0
user_input_example = np.array([user_index_example])

# Provide a single cat index for each prediction
cat_indices = np.arange(num_cats)

# Reshape user input to have the same shape as cat_indices
user_input_example = np.repeat(user_input_example, num_cats)

predictions = model.predict([user_input_example, cat_indices])

# Get top recommendations
top_cat_indices = np.argsort(predictions.flatten())[::-1][:5]
top_cat_ids = cat_encoder.inverse_transform(top_cat_indices)

print(top_cat_ids)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.8953477740287781
['33554966' '50746347' '36109833' '40071478' '37364258']


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.optimizers import Adam

# Step 1: Load dataset
df = pd.read_csv('https://raw.githubusercontent.com/kucingku-capstone/machine-learning/main/dataset/cats_dataset_updated.csv')

# Step 2: Data preprocessing
user_encoder = LabelEncoder()
cat_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
coat_encoder = LabelEncoder()
size_encoder = LabelEncoder()
age_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'].astype(str))
df['cat_id'] = cat_encoder.fit_transform(df['cat_id'].astype(str))
df['gender'] = gender_encoder.fit_transform(df['gender'].astype(str))
df['coat'] = coat_encoder.fit_transform(df['coat'].astype(str))
df['size'] = size_encoder.fit_transform(df['size'].astype(str))
df['age'] = age_encoder.fit_transform(df['age'].astype(str))

# Step 3: Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 4: Create the collaborative filtering model using TensorFlow
num_users = len(user_encoder.classes_)
num_cats = len(cat_encoder.classes_)
num_genders = len(gender_encoder.classes_)
num_coats = len(coat_encoder.classes_)
num_sizes = len(size_encoder.classes_)
num_ages = len(age_encoder.classes_)

embedding_size = 50  # You can adjust this based on your preference

# Define input layers
user_input = Input(shape=(1,), name='user_input')
cat_input = Input(shape=(1,), name='cat_input')
gender_input = Input(shape=(1,), name='gender_input')
coat_input = Input(shape=(1,), name='coat_input')
size_input = Input(shape=(1,), name='size_input')
age_input = Input(shape=(1,), name='age_input')

# Define embedding layers
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, input_length=1)(user_input)
cat_embedding = Embedding(input_dim=num_cats, output_dim=embedding_size, input_length=1)(cat_input)
gender_embedding = Embedding(input_dim=num_genders, output_dim=embedding_size, input_length=1)(gender_input)
coat_embedding = Embedding(input_dim=num_coats, output_dim=embedding_size, input_length=1)(coat_input)
size_embedding = Embedding(input_dim=num_sizes, output_dim=embedding_size, input_length=1)(size_input)
age_embedding = Embedding(input_dim=num_ages, output_dim=embedding_size, input_length=1)(age_input)

# Flatten the embeddings
user_embedding = Flatten()(user_embedding)
cat_embedding = Flatten()(cat_embedding)
gender_embedding = Flatten()(gender_embedding)
coat_embedding = Flatten()(coat_embedding)
size_embedding = Flatten()(size_embedding)
age_embedding = Flatten()(age_embedding)

# Concatenate all embeddings
concatenated = Concatenate()([user_embedding, cat_embedding, gender_embedding, coat_embedding, size_embedding, age_embedding])

# Add additional dense layers
dense_layer_1 = Dense(128, activation='relu')(concatenated)
dense_layer_2 = Dense(64, activation='relu')(dense_layer_1)
output = Dense(1)(dense_layer_2)

# Combine all layers into a model
model = Model(inputs=[user_input, cat_input, gender_input, coat_input, size_input, age_input], outputs=output)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(
    x=[train_df['user_id'], train_df['cat_id'], train_df['gender'], train_df['coat'], train_df['size'], train_df['age']],
    y=train_df['rating'],
    epochs=10,
    batch_size=64,
    validation_split=0.2
)

# Evaluate the model on the test set
test_loss = model.evaluate(
    x=[test_df['user_id'], test_df['cat_id'], test_df['gender'], test_df['coat'], test_df['size'], test_df['age']],
    y=test_df['rating']
)

print(f'Test Loss: {test_loss}')

# Make predictions for a specific user
user_index_example = 0
user_input_example = np.array([user_index_example] * num_cats)

# Reshape input arrays for prediction
user_input_example = np.reshape(user_input_example, (num_cats, 1))
# The rest of the arrays remain the same as they were

predictions = model.predict([user_input_example, cat_indices, gender_indices, coat_indices, size_indices, age_indices])

# Continue with the rest of your code...

# Get top recommendations
top_cat_indices = np.argsort(predictions.flatten())[::-1][:5]
top_cat_ids = cat_encoder.inverse_transform(top_cat_indices)

print(top_cat_ids)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.8949368000030518
['46075255' '46526701' '46289782' '45673395' '46405585']


Save model H5

In [10]:
if __name__ == '__main__':
    # DO NOT CHANGE THIS CODE
    model = solution_B2()
    model.save("model_collaborativefiltering_cat.h5")

NameError: ignored

#**Referensi TFRS**

In [None]:
!pip install tensorflow_recommenders

Collecting tensorflow_recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl (96 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_recommenders
Successfully installed tensorflow_recommenders-0.7.3


In [None]:
from typing import Dict, Text

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Ratings data.
ratings = tfds.load('movielens/100k-ratings', split="train")
# Features of all the available movies.
movies = tfds.load('movielens/100k-movies', split="train")

# Select the basic features.
ratings = ratings.map(lambda x: {
    "movie_id": tf.strings.to_number(x["movie_id"]),
    "user_id": tf.strings.to_number(x["user_id"])
})
movies = movies.map(lambda x: tf.strings.to_number(x["movie_id"]))

# Build a model.
class Model(tfrs.Model):

  def __init__(self):
    super().__init__()

    # Set up user representation.
    self.user_model = tf.keras.layers.Embedding(
        input_dim=2000, output_dim=64)
    # Set up movie representation.
    self.item_model = tf.keras.layers.Embedding(
        input_dim=2000, output_dim=64)
    # Set up a retrieval task and evaluation metrics over the
    # entire dataset of candidates.
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.item_model)
        )
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

    user_embeddings = self.user_model(features["user_id"])
    movie_embeddings = self.item_model(features["movie_id"])

    return self.task(user_embeddings, movie_embeddings)


model = Model()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Train.
model.fit(train.batch(4096), epochs=5)

# Evaluate.
model.evaluate(test.batch(4096), return_dict=True)

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/0.1.1.incompleteZ7TR30/movielens-train.tfrecord*...…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.
Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-movies/0.1.1.incompleteNVAB21/movielens-train.tfrecord*...:…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.1. Subsequent calls will reuse this data.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0003000000142492354,
 'factorized_top_k/top_10_categorical_accuracy': 0.0015999999595806003,
 'factorized_top_k/top_50_categorical_accuracy': 0.05635000020265579,
 'factorized_top_k/top_100_categorical_accuracy': 0.1514499932527542,
 'loss': 30158.78125,
 'regularization_loss': 0,
 'total_loss': 30158.78125}