In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Load your dataset
df = pd.read_csv('labelled_data.csv')

# Prepare text data
texts = df['Comment Text']
labels = df['Sentiment Label']

# Map the sentiment labels to numerical values
label_mapping = {'NEG': 0, 'NEU': 1, 'POS': 2}
labels = labels.map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = TFRobertaForSequenceClassification.from_pretrained("roberta-base")

# Encode the text data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    list(y_train)
)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    list(y_test)
)).batch(16)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=model.compute_loss, 
              metrics=['accuracy'])

# Train the model with progress bar and plot learning curves
history = model.fit(train_dataset, epochs=10, validation_data=test_dataset)

# Plot learning curves
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy over epochs')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over epochs')

plt.show()

# Predict and evaluate
y_pred = np.argmax(model.predict(test_dataset).logits, axis=1)
print(classification_report(y_test, y_pred))





tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Epoch 1/10
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported



AttributeError: in user code:

    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step  *
        outputs = model.train_step(data)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_tf_utils.py", line 1706, in train_step  *
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\engine\compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\losses.py", line 143, in __call__  *
        losses = call_fn(y_true, y_pred)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\losses.py", line 270, in call  *
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_tf_utils.py", line 1588, in compute_loss  *
        return super().compute_loss(*args, **kwargs)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\engine\training.py", line 1207, in compute_loss  *
        y, y_pred, sample_weight, regularization_losses=self.losses
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\engine\compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "c:\Users\Chummy\AppData\Local\Programs\Python\Python39\lib\site-packages\tf_keras\src\engine\compile_utils.py", line 854, in match_dtype_and_rank  *
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'
