In [1]:
##########################################################################
# Program Name : train_model_pandas.ipynb
# Purpose : train a model
# Kaggle Dataset Source : obertvici/indonesia-top-ecommerce-unicorn-tweets
# Location of Dataset Loaded : Local File System
# Data Processsing Tools: pandas
###########################################################################

import subprocess
import os
import json
import pandas as pd
import tensorflow as tf
import numpy as np
from keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

def run_command(command):
    """Utility function to run shell commands"""
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    if process.returncode == 0:
        print(f"Success: {command}\nOutput:\n{stdout.decode()}")
    else:
        print(f"Error: {command}\nError Message:\n{stderr.decode()}")


# Create directories
print("Creating directories...")
dirs = [
    "kaggle/datasets",
    "kaggle/splits"
]

for dir in dirs:
    print(f"Creating directory: {dirs}")
    run_command(f"mkdir -p ../{dir}")

# Download dataset from Kaggle
print("Downloading dataset from Kaggle...")
kaggle_dataset_path = "../kaggle/datasets"
dataset_name = "indonesia-top-ecommerce-unicorn-tweets"
run_command(f"kaggle datasets download -d robertvici/{dataset_name} -p {kaggle_dataset_path}")

# Unzip the downloaded dataset
print("Unziping the downloaded dataset...")
zip_file_path = f"{kaggle_dataset_path}/{dataset_name}.zip"
run_command(f"unzip -o {zip_file_path} -d {kaggle_dataset_path}")

# Remove indonesia-top-ecommerce-unicorn-tweets.zip
print("Remowing indonesia-top-ecommerce-unicorn-tweets.zip file...")
run_command(f"rm {zip_file_path}")

# Load datasets with Pandas
print("Loading datasets with Pandas...")
blibli_df = pd.read_json(f'{kaggle_dataset_path}/bliblidotcom.json', lines=True)
bukalapak_df = pd.read_json(f'{kaggle_dataset_path}/bukalapak.json', lines=True)
lazadaID_df = pd.read_json(f'{kaggle_dataset_path}/lazadaID.json', lines=True)
shopeeID_df = pd.read_json(f'{kaggle_dataset_path}/ShopeeID.json', lines=True)
tokopedia_df = pd.read_json(f'{kaggle_dataset_path}/tokopedia.json', lines=True)

# Add a new column to identify the company source
print("Adding a new column to identify the company source...")
blibli_df['source'] = 'blibli'
bukalapak_df['source'] = 'bukalapak'
lazadaID_df['source'] = 'lazadaID'
shopeeID_df['source'] = 'shopeeID'
tokopedia_df['source'] = 'tokopedia'

# Merge datasets using concat (equivalent to union in Spark)
print("Merging datasets using concat (equivalent to union in Spark)...")
merged_df = pd.concat([blibli_df, bukalapak_df, lazadaID_df, shopeeID_df, tokopedia_df], axis=0)

# Clean tweet text
print("Cleaning tweet tect")
def clean_text(text):
    return text.lower().replace("#", "").strip()

merged_df['clean_tweet'] = merged_df['tweet'].apply(clean_text)

# Create new feature for engagement
print("Creating  feature for engagement...")
merged_df['engagement'] = merged_df['replies_count'] + merged_df['retweets_count'] + merged_df['likes_count']

# Select relevant features
print("Selecting relevant features...")
selected_data = merged_df[['clean_tweet', 'replies_count', 'retweets_count', 'likes_count', 'engagement', 'hashtags', 'source']]

# Split dataset into train, validate, and test
print("Splitting dataset into train, validate, and test")
splits_dataset_path = "../kaggle/splits"
train_data = selected_data.sample(frac=0.7, random_state=42)
remaining_data = selected_data.drop(train_data.index)
validate_data = remaining_data.sample(frac=0.5, random_state=42)
test_data = remaining_data.drop(validate_data.index)

# Replace null values with 0
print("Replacing null values with 0...")
merged_df.fillna({"likes_count": 0, "replies_count": 0, "retweets_count": 0}, inplace=True)

# Replace negative values with 0
print("Replacing negative values with 0...")
for col in ["likes_count", "replies_count", "retweets_count"]:
    merged_df[col] = merged_df[col].apply(lambda x: max(0, x))

# Tokenize and vectorize text
print ("Tokenize and vectorize text...")
tokenizer = Tokenizer(num_words=5000)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_data["clean_tweet"].values)

# Convert texts to sequences
print("Converting texts to sequences...")
X_train = tokenizer.texts_to_sequences(train_data["clean_tweet"].values)

# Pad the sequences to ensure uniform length
print("Pad the sequences to ensure uniform length...")
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='post')

y_train = train_data["engagement"].values

# Define a simple Neural Network model
print("Define a simple Neural Network model...")
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=64),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(1, activation="linear")
])

# Check Mxx index in X-train
print("Max index in X_train:", X_train.max())
print("Shape of X_train:", X_train.shape)

# Filter / set index maximum to 5000
X_train[X_train >= 5000] = 0

# Compile the model
print("Compiling the model...")
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Train the model
print("Training the model...")
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Save the model
print("Saving the model...")
model.save("../models/e-commerce-engagement-model.keras")

# Save tokenizer for future use
print("Saving tokenizer for future use...")
with open('../models/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Export to save model
model.export("../saved_model/1")
print("Final Model ==> saved_model/1")

print("Keras Model, Tokenizer and Final Model are saved successfully!")

# Remove datasets on local file system
print("Removing local dataset files...")
run_command(f"rm -r ../kaggle")

print("")
print("All tasks completed successfully!")

2025-01-21 08:11:34.773616: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-21 08:11:34.775891: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-21 08:11:34.788231: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-21 08:11:34.827567: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737421894.893442    5348 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737421894.91

Creating directories...
Creating directory: ['kaggle/datasets', 'kaggle/splits']
Success: mkdir -p ../kaggle/datasets
Output:

Creating directory: ['kaggle/datasets', 'kaggle/splits']
Success: mkdir -p ../kaggle/splits
Output:

Downloading dataset from Kaggle...
Success: kaggle datasets download -d robertvici/indonesia-top-ecommerce-unicorn-tweets -p ../kaggle/datasets
Output:
Dataset URL: https://www.kaggle.com/datasets/robertvici/indonesia-top-ecommerce-unicorn-tweets
License(s): copyright-authors
Downloading indonesia-top-ecommerce-unicorn-tweets.zip to ../kaggle/datasets


Unziping the downloaded dataset...
Success: unzip -o ../kaggle/datasets/indonesia-top-ecommerce-unicorn-tweets.zip -d ../kaggle/datasets
Output:
Archive:  ../kaggle/datasets/indonesia-top-ecommerce-unicorn-tweets.zip
  inflating: ../kaggle/datasets/ShopeeID.json  
  inflating: ../kaggle/datasets/bliblidotcom.json  
  inflating: ../kaggle/datasets/bukalapak.json  
  inflating: ../kaggle/datasets/lazadaID.json  
  

2025-01-21 08:14:14.807874: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Training the model...
Epoch 1/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 16ms/step - loss: 603710.3750 - mae: 27.7081
Epoch 2/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 16ms/step - loss: 616781.5625 - mae: 32.2200
Epoch 3/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 16ms/step - loss: 695101.0625 - mae: 34.8891
Epoch 4/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 16ms/step - loss: 575732.6250 - mae: 28.6435
Epoch 5/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 16ms/step - loss: 658079.4375 - mae: 30.2743
Epoch 6/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 16ms/step - loss: 652285.6875 - mae: 30.9110
Epoch 7/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 16ms/step - loss: 537915.9375 - mae: 29.6280
Epoch 8/10
[1m11839/11839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m

INFO:tensorflow:Assets written to: ../saved_model/1/assets


Saved artifact at '../saved_model/1'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 68), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  125337784097232: TensorSpec(shape=(), dtype=tf.resource, name=None)
  125337784097424: TensorSpec(shape=(), dtype=tf.resource, name=None)
  125337784098576: TensorSpec(shape=(), dtype=tf.resource, name=None)
  125337784097616: TensorSpec(shape=(), dtype=tf.resource, name=None)
  125337784099920: TensorSpec(shape=(), dtype=tf.resource, name=None)
Final Model ==> saved_model/1
Keras Model, Tokenizer and Final Model are saved successfully!
Removing local dataset files...
Success: rm -r ../kaggle
Output:


All tasks completed successfully!
