# 🤖 Twemoji Classifier – CS3244 AY24/25 Sem 2

**Group Members:**  
- Jason Matthew Suhari  
- Bryan Castorius Halim  
- Nigel Eng Wee Kiat  
- Muhammad Salman Al Farisi  
- Ng Jia Hao Sherwin  
- Ryan Justyn

This notebook builds and evaluates baseline models for classifying tweets into emojis using the Twemoji dataset. It's the main entry point!

### 1. Imports and Setup 📩

#### 1.1 Utility Imports

In [25]:
# Imports
import os
import numpy as np
import pandas as pd
import joblib
import requests
from pathlib import Path
from tensorflow.keras.models import load_model as keras_load_model
import pandas as pd
import numpy as np
import os
from IPython.display import HTML, display
import matplotlib.pyplot as plt

#### 1.2 ML Imports

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

### 2. S3 Bucket Data Loading 🪣

In [21]:
# Every S3 object URL is listed in urls.txt, one per line; blank lines are ignored.
# Each file is downloaded into DATA_DIR (unless already present) and then loaded
# into `loaded_data`, keyed by its file name.
DATA_DIR = "data"
DOWNLOAD_TIMEOUT_S = 60  # without a timeout, requests can block forever on a stalled connection

with open("urls.txt", "r") as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

# A fresh checkout may not have the data folder yet; open(..., "wb") would fail without it.
os.makedirs(DATA_DIR, exist_ok=True)

loaded_data = {}
for url in urls:
    filename = os.path.basename(url)
    filepath = os.path.join(DATA_DIR, filename)

    if not os.path.exists(filepath):
        print(f"Downloading {filename} from the s3 bucket...")
        try:
            response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_S)
        except requests.RequestException as e:
            # Network-level failures (DNS, refused connection, timeout) should
            # skip this file, not abort the remaining downloads.
            print(f"Failed to download {filename}: {e}")
            continue

        if response.status_code != 200:
            print(f"Failed to download {filename}: {response.status_code}")
            continue

        # Write under a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated file that later runs would treat as
        # "already exists".
        partial_path = filepath + ".part"
        with open(partial_path, "wb") as out_file:
            out_file.write(response.content)
        os.replace(partial_path, filepath)
    else:
        print(f"{filename} already exists. Skipping download.")

    try:
        if filename.endswith(".npy"):
            # NOTE(review): allow_pickle=True can execute arbitrary code while
            # unpickling. Keep it only while the bucket contents are trusted;
            # plain numeric arrays load fine with allow_pickle=False.
            loaded_data[filename] = np.load(filepath, allow_pickle=True)
        elif filename.endswith(".csv"):
            loaded_data[filename] = pd.read_csv(filepath)
        else:
            print(f"Unsupported file type: {filename}")
    except Exception as e:
        # Best-effort loading: report the problem and move on to the next file.
        print(f"Failed to load {filename}: {e}")


test_bert_embeddings.npy already exists. Skipping download.
train_bert_embeddings.npy already exists. Skipping download.
valid_bert_embeddings.npy already exists. Skipping download.
train_with_bert_embeddings.csv already exists. Skipping download.
valid_with_bert_embeddings.csv already exists. Skipping download.
test_with_bert_embeddings.csv already exists. Skipping download.


### 3. Exploratory Data Analysis (EDA) 🔍
The full EDA code can be found in the `eda.ipynb` notebook. To save space in the main notebook, we have collated just the figures here.

In [None]:
import os
from IPython.display import display, HTML

# Collect every image in the plots folder, in alphabetical order.
plot_folder = "plots/"
image_files = sorted(
    entry
    for entry in os.listdir(plot_folder)
    if entry.lower().endswith((".png", ".jpg", ".jpeg"))
)

# Static opening part of the page: stylesheet plus the opening grid containers.
gallery_header = """
<style>
  body {
    margin: 0;
    padding: 0;
  }

  .grid-wrapper {
    padding: 40px 20px 60px 20px;
  }

  .grid-container {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
    gap: 16px;
  }

  .grid-item {
    text-align: center;
  }

  .grid-item img {
    width: 100%;
    height: auto;
    border-radius: 8px;
    cursor: pointer;
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    transition: transform 0.2s ease-in-out;
  }

  .grid-item img:hover {
    transform: scale(1.05);
  }

  dialog::backdrop {
    background: rgba(0, 0, 0, 0.9);
  }

  dialog {
    border: none;
    background: transparent;
    padding: 0;
    margin: auto;
    z-index: 9999;
  }

  dialog img {
    max-width: 90vw;
    max-height: 90vh;
    display: block;
    margin: auto;
    border-radius: 10px;
    box-shadow: 0 6px 20px rgba(0,0,0,0.4);
  }
</style>

<div class="grid-wrapper">
  <div class="grid-container">
"""

# One thumbnail card per image, each paired with a <dialog> that shows the
# same image enlarged when the thumbnail is clicked.
gallery_cards = []
for idx, image in enumerate(image_files):
    # Use forward slashes so the src attribute is also valid on Windows.
    img_path = os.path.join(plot_folder, image).replace("\\", "/")
    gallery_cards.append(f"""
    <div class="grid-item">
      <img src="{img_path}" onclick="document.getElementById('dialog{idx}').showModal()">
      <p style="font-size: 14px;">{image}</p>
    </div>
    <dialog id="dialog{idx}" onclick="this.close()">
      <img src="{img_path}" alt="{image}">
    </dialog>
    """)

# Static closing part: closes the grid and registers the Escape-key handler.
gallery_footer = """
  </div>
</div>

<script>
  // Optional: ESC to close the dialog
  document.addEventListener("keydown", function(event) {
    if (event.key === "Escape") {
      document.querySelectorAll("dialog[open]").forEach(d => d.close());
    }
  });
</script>
"""

html_str = "".join([gallery_header, *gallery_cards, gallery_footer])
display(HTML(html_str))

# TODO: fix the clipping that occurs when one of the images is opened in the dialog


### 4. Preprocessing ⚙️

TODO: add the preprocessing code here.

### 5. Baseline Model Training 🏋️

Before training any models, we explicitly define the train, validation, and test splits, instead of indexing into `loaded_data` every time.

In [33]:
def _gather_rows(embeddings, frame):
    """Return the entries of `embeddings` selected by `frame['embedding_index']`, in frame order."""
    return np.array([embeddings[i] for i in frame['embedding_index']])


# Per-split tables loaded from the CSV files.
train_df, valid_df, test_df = (
    loaded_data['train_with_bert_embeddings.csv'],
    loaded_data['valid_with_bert_embeddings.csv'],
    loaded_data['test_with_bert_embeddings.csv'],
)

# Per-split embedding arrays loaded from the .npy files.
X_train_full, X_valid_full, X_test_full = (
    loaded_data['train_bert_embeddings.npy'],
    loaded_data['valid_bert_embeddings.npy'],
    loaded_data['test_bert_embeddings.npy'],
)

# Features are looked up through each table's 'embedding_index' column;
# targets come from its 'label' column.
X_train, y_train = _gather_rows(X_train_full, train_df), train_df['label'].values
X_valid, y_valid = _gather_rows(X_valid_full, valid_df), valid_df['label'].values
X_test, y_test = _gather_rows(X_test_full, test_df), test_df['label'].values


##### 5.1 Random Forest

In [None]:
# TODO: train the Random Forest baseline in this cell.
# rf_model stays None until that code is added.
rf_model = None

##### 5.2 Support Vector Machine

In [None]:
# TODO: train the Support Vector Machine baseline in this cell.
# svm_model stays None until that code is added.
svm_model = None

##### 5.3 Simple Neural Network

In [None]:
# TODO: train the simple neural network baseline in this cell.
# snn_model stays None until that code is added.
snn_model = None

##### 5.4 Convolutional Neural Network

In [None]:
# TODO: train the convolutional neural network baseline in this cell.
# cnn_model stays None until that code is added.
cnn_model = None

##### 5.5 Logistic Regression

In [None]:
# Standardize the embedding features. The scaler is fit on the training split
# only and then reused for validation/test, so no statistics from the held-out
# data leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# class_weight='balanced' weights classes inversely to their frequency;
# max_iter is raised above scikit-learn's default of 100 to give the solver
# more room to converge.
logistic_regression_model = LogisticRegression(max_iter=1000, class_weight='balanced')
logistic_regression_model.fit(X_train_scaled, y_train)

# Report accuracy and per-class metrics on the held-out splits.
for split_title, X_split, y_split in (
    ("📊 Validation", X_valid_scaled, y_valid),
    ("🧪 Test", X_test_scaled, y_test),
):
    y_pred = logistic_regression_model.predict(X_split)
    print(f"{split_title} Accuracy: {accuracy_score(y_split, y_pred)}")
    print(classification_report(y_split, y_pred))


📊 Validation Accuracy: 0.4344190907507403
              precision    recall  f1-score   support

           0       0.57      0.55      0.56      4137
           1       0.38      0.32      0.34      3493
           2       0.35      0.33      0.34      2919
           3       0.45      0.50      0.47      3671
           4       0.38      0.43      0.41      3003

    accuracy                           0.43     17223
   macro avg       0.42      0.43      0.42     17223
weighted avg       0.43      0.43      0.43     17223

🧪 Test Accuracy: 0.4321045537127117
              precision    recall  f1-score   support

           0       0.55      0.55      0.55      3934
           1       0.38      0.31      0.34      3546
           2       0.35      0.35      0.35      2911
           3       0.46      0.51      0.48      3619
           4       0.38      0.41      0.40      3053

    accuracy                           0.43     17063
   macro avg       0.42      0.43      0.42     17063

##### 5.6 Image-Based Classifier

In [None]:
# TODO: train the image-based classifier in this cell.
# ib_classifier_model stays None until that code is added.
ib_classifier_model = None

### 6. Alternative Models 🤷‍♂️

### 7. Model Caching 🌚

### 8. Reloading Models 💡

In [None]:
def load_all_models(model_dir="models/"):
    """
    Load every serialized model found directly inside a directory.

    Files ending in ``.pkl`` are read with ``joblib.load``; files ending in
    ``.h5`` are read with the Keras ``load_model`` function. Any other entry
    is reported on stdout and skipped. The scan is not recursive.

    Args:
        model_dir: Directory to scan. Defaults to "models/".

    Returns:
        dict: Maps each file's stem (file name without its extension) to
        ``{"model": <loaded object>, "is_keras": <bool>}``.
    """
    loaded = {}
    for entry in Path(model_dir).glob("*"):
        extension = entry.suffix
        if extension not in (".pkl", ".h5"):
            print(f"Not a model file: {entry.name}")
            continue

        uses_keras = extension == ".h5"
        restored = keras_load_model(entry) if uses_keras else joblib.load(entry)
        # todo: make sure that all of the models are named based on what they are e.g. logistic_regression.pkl
        loaded[entry.stem] = {"model": restored, "is_keras": uses_keras}

    return loaded

In [None]:
# Restore every cached model from disk, then report which loader family each one came from.
all_models = load_all_models("models/")
for name in all_models:
    model_entry = all_models[name]
    print(f"Loaded {name}: IsKeras = {model_entry['is_keras']}")

### 9. Model Evaluations and Comparisons 🤔