In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the data and model paths used
# later in this notebook all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd
import pathlib

In [None]:
import tensorflow as tf

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Extra packages installed into the runtime; -q keeps pip's output quiet.
!pip install fastapi kaleido python-multipart uvicorn -q
# pyarrow is pinned to 14.0.0 and force-reinstalled.
# NOTE(review): pandas is already imported by an earlier cell, so the runtime
# may need a restart before the reinstalled pyarrow is picked up - confirm.
!pip install pyarrow==14.0.0 --force-reinstall -q
# Hugging Face libraries; transformers, datasets and evaluate are used by later cells.
!pip install transformers datasets evaluate accelerate -q

In [None]:
from datasets import Dataset, Features, ClassLabel, Value

## Get the Data

In [None]:
# Folder on the mounted Drive that holds the two raw text files.
DATA_DIR = pathlib.Path('/content/drive/MyDrive/data/Emoji_Prediction')
path_emoji = DATA_DIR / 'emoji.txt'
path_tweets = DATA_DIR / 'tweets.txt'


def read_lines(path):
    """Return the lines of a UTF-8 text file with newline characters removed.

    Args:
        path: Location of the file (``str`` or ``pathlib.Path``).

    Returns:
        list[str]: One entry per line, in file order.
    """
    # open() accepts Path objects directly, so no str conversion is needed.
    with open(path, 'r', encoding='utf-8') as file:
        return [line.replace('\n', '') for line in file]


# Both frames deliberately share the column name 'Tweets': the index merge in
# the next cell relies on the resulting 'Tweets_x' / 'Tweets_y' suffixes.
emoji_df = pd.DataFrame(read_lines(path_emoji), columns=['Tweets'])
tweets_df = pd.DataFrame(read_lines(path_tweets), columns=['Tweets'])

In [None]:
# Pair every tweet with the emoji found at the same row position, then give
# the two suffixed 'Tweets' columns their final names.
df = (
    tweets_df
    .merge(emoji_df, left_index=True, right_index=True)
    .rename(columns={'Tweets_x': 'text', 'Tweets_y': 'label'})
)
df.head()

In [None]:
# Replace each distinct label value with an integer class id.
# After fitting, encoder.categories_[0] lists the original value behind each id.
encoder = OrdinalEncoder()
# fit_transform returns an (n_rows, 1) float64 array; flatten it and cast to
# int64 so class ids are stored as integers rather than floats.
df['label'] = encoder.fit_transform(df[['label']]).ravel().astype('int64')
df.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
train_sentences, test_sentences, train_labels, test_labels = split_parts

# Re-assemble text/label frames for each side of the split, matching rows on
# the original index that both Series still carry.
df_train = train_sentences.to_frame().join(train_labels, how='inner')
df_test = test_sentences.to_frame().join(test_labels, how='inner')

In [None]:
# Preview only the first rows, consistent with the other inspection cells.
df_train.head()

In [None]:
# Schema for the Hugging Face datasets: raw tweet text plus a 10-way class label.
features = Features({
    'text': Value('string'),
    'label': ClassLabel(num_classes=10, names=[f'class_{i}' for i in range(10)])
})


def _to_hf_dataset(frame, split):
    """Convert a text/label DataFrame into a ``Dataset`` that follows ``features``.

    Args:
        frame: DataFrame containing at least the 'text' and 'label' columns.
        split: Name of the split recorded on the resulting dataset.

    Returns:
        Dataset: Rows of ``frame`` typed according to ``features``.
    """
    table = (
        frame[['text', 'label']]
        # ClassLabel stores integer ids, while the encoded labels may be floats.
        .astype({'label': 'int64'})
        .reset_index(drop=True)
    )
    # Pass the schema explicitly; previously `features` was built but never used.
    return Dataset.from_pandas(table, features=features, split=split)


train_dataset = _to_hf_dataset(df_train, 'train')
test_dataset = _to_hf_dataset(df_test, 'test')

In [None]:
# Emoji names listed in class-id order; both lookup tables derive from this tuple.
# NOTE(review): presumably this order mirrors the ids assigned by the label
# encoder - confirm against encoder.categories_.
_EMOJI_NAMES = (
    'blush', 'flushed', 'grin', 'heart_eyes', 'relaxed',
    'smirk', 'sob', 'weary', 'wink', 'yum',
)
id2label = dict(enumerate(_EMOJI_NAMES))
label2id = {name: idx for idx, name in id2label.items()}

### Evaluate

In [None]:
# The `evaluate` package is installed by the pip cell but was never imported,
# so this cell raised NameError on a fresh kernel.
import evaluate

# Load the accuracy metric.
accuracy = evaluate.load("accuracy")

### Fine-tune DistilBERT

In [None]:
# Training hyperparameters shared by the tf.data pipelines, the learning-rate
# schedule and model.fit.
NUM_EPOCHS = 10  # passes over the training split
BATCH_SIZE = 32  # examples per batch

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that the classification model is loaded from.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize both splits in batched mode.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
from transformers import DataCollatorWithPadding

# Padding collator bound to the notebook's tokenizer, returning TensorFlow tensors.
collator_options = {"tokenizer": tokenizer, "return_tensors": "tf"}
data_collator = DataCollatorWithPadding(**collator_options)

In [None]:
# Training pipeline: shuffled, batched and collated by data_collator.
# No .cache() here: caching after the shuffle would record the first epoch's
# batch order and replay that same order on every later epoch.
tf_train_dataset = tokenized_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
).prefetch(tf.data.AUTOTUNE)

# Validation pipeline: fixed order, so caching the collated batches is safe.
tf_validation_dataset = tokenized_test.to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
import math

from transformers import create_optimizer

# The training pipeline is built without drop_remainder, so the final partial
# batch is kept. Round up so the schedule covers every optimizer step instead
# of ending a few steps before training does.
batches_per_epoch = math.ceil(len(tokenized_train) / BATCH_SIZE)
total_train_steps = int(batches_per_epoch * NUM_EPOCHS)
# Optimizer plus its learning-rate schedule: initial rate 1e-5, no warmup.
optimizer, schedule = create_optimizer(init_lr=1e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load DistilBERT with a 10-class classification head.
# Passing id2label / label2id stores the emoji names in the model config, so
# the saved model reports them instead of generic LABEL_i names.
# NOTE(review): the mappings are hard-coded; confirm their order matches the
# ids produced by the label encoder.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=10,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import tensorflow as tf

# from_logits=True: the loss is given unnormalised scores rather than probabilities.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=sparse_ce, metrics=['accuracy'])

In [None]:
# Train for NUM_EPOCHS epochs, evaluating on the validation pipeline as well.
history = model.fit(
    tf_train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=tf_validation_dataset,
)

In [None]:
# Write the fine-tuned model and its tokenizer into the same Drive folder.
output_dir = '/content/drive/MyDrive/data/Emoji_Prediction/my_fine_tuned_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)