In [None]:
pip install transformers

In [None]:
pip install datasets

In [4]:
from datasets import list_datasets, load_dataset, DatasetDict
import pandas as pd
import matplotlib.pyplot as plt
import torch
import numpy as np

In [5]:
train_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/SPOTIFY-CLASSIF/spotify_train.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
test_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/SPOTIFY-CLASSIF/spotify_test.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
validation_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/SPOTIFY-CLASSIF/spotify_validation.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
train_dataset = train_dataset["train"]
test_dataset = test_dataset["train"]
validation_dataset = validation_dataset["train"]

In [9]:
train_dataset, test_dataset, validation_dataset

(Dataset({
     features: ['Unnamed: 0', 'Review', 'Rating'],
     num_rows: 39284
 }),
 Dataset({
     features: ['Unnamed: 0', 'Review', 'Rating'],
     num_rows: 12277
 }),
 Dataset({
     features: ['Unnamed: 0', 'Review', 'Rating'],
     num_rows: 9821
 }))

In [10]:
dataset = DatasetDict({
    "train" : train_dataset,
    "test" : test_dataset,
    "validation" : validation_dataset
})

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Review', 'Rating'],
        num_rows: 39284
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Review', 'Rating'],
        num_rows: 12277
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'Review', 'Rating'],
        num_rows: 9821
    })
})

In [12]:
dataset = dataset.map(lambda ds : {
                      "Review" : ds["Review"],
                      "Rating" : ds["Rating"]},
                      remove_columns=['Unnamed: 0']
                      )

Map:   0%|          | 0/39284 [00:00<?, ? examples/s]

Map:   0%|          | 0/12277 [00:00<?, ? examples/s]

Map:   0%|          | 0/9821 [00:00<?, ? examples/s]

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Review', 'Rating'],
        num_rows: 39284
    })
    test: Dataset({
        features: ['Review', 'Rating'],
        num_rows: 12277
    })
    validation: Dataset({
        features: ['Review', 'Rating'],
        num_rows: 9821
    })
})

In [13]:
set(dataset["train"]["Rating"])

{0, 1, 2, 3, 4}

In [14]:
from transformers import AutoTokenizer

In [16]:
from transformers import TFAutoModelForSequenceClassification

In [17]:
model = "distilbert-base-uncased"

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
def tokenize_with_error_handling(batch):
    try:
        return tokenizer(batch["Review"], padding=True, truncation=True)
    except Exception as e:
        print(f"Error with batch: {batch}")
        raise e


In [20]:
dataset_encoded = dataset.map(tokenize_with_error_handling, batched=True, batch_size=None)

Map:   0%|          | 0/39284 [00:00<?, ? examples/s]

Map:   0%|          | 0/12277 [00:00<?, ? examples/s]

Map:   0%|          | 0/9821 [00:00<?, ? examples/s]

In [21]:
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['Review', 'Rating', 'input_ids', 'attention_mask'],
        num_rows: 39284
    })
    test: Dataset({
        features: ['Review', 'Rating', 'input_ids', 'attention_mask'],
        num_rows: 12277
    })
    validation: Dataset({
        features: ['Review', 'Rating', 'input_ids', 'attention_mask'],
        num_rows: 9821
    })
})

In [44]:
import tensorflow as tf

In [45]:
tokenizer_columns = tokenizer.model_input_names
tokenizer_columns

['input_ids', 'attention_mask']

In [46]:
tf_train_dataset = dataset_encoded["train"].to_tf_dataset(columns=tokenizer_columns,
                                                          label_cols="Rating",
                                                          shuffle=True,
                                                          batch_size=64)

In [47]:
tf_test_dataset = dataset_encoded["test"].to_tf_dataset(columns=tokenizer_columns,
                                                          label_cols="Rating",
                                                          shuffle=True,
                                                          batch_size=64)

In [48]:
tf_validation_dataset = dataset_encoded["validation"].to_tf_dataset(columns=tokenizer_columns,
                                                          label_cols="Rating",
                                                          shuffle=True,
                                                          batch_size=64)

In [62]:
for batch in tf_train_dataset.take(3):
    example_inputs = batch[0]
    example_labels = batch[1]
    print("Inputs:", example_inputs['input_ids'][0].numpy())
    print("MASK:", example_inputs['attention_mask'][0].numpy())
    print("Label:", example_labels[0].numpy())

Inputs: [  101  1045  5223  2129  3962  8757  6719  3248  2296  2060  2299  2084
  1996  2299  1045  2215  1998  2036  2339  2069 13558  2015  2023  2965
  1045  2031  2000  4952  2000  1996  2972  2299  1045  2123  2102  2130
  2215  2000  4952  2000  2017  2191  2009  2061 16801  1998 15703  1045
  4299  2009  2001  2066  1037  3025  2544  2073  2017  4508  2377  1998
  1996  2299  2017  2215  3310  2039   102     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0 

In [66]:
num_labels = len(set(dataset["train"]["Rating"]))
num_labels

5

In [69]:
tf_model = (TFAutoModelForSequenceClassification.from_pretrained(model, num_labels=num_labels))

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [70]:
tf_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 metrics=tf.metrics.SparseCategoricalAccuracy())

In [71]:
tf_model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,957,317
Trainable params: 66,957,317
Non-trainable params: 0
_________________________________________________________________


In [72]:
tf_model.fit(tf_train_dataset,
             validation_data=tf_validation_dataset,
             epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f7e34a3faf0>

In [75]:
y_predictions = tf_model.predict(tf_test_dataset)



In [76]:
y_logits = y_predictions.logits

In [77]:
y_logits

array([[-3.7213724 , -3.9339345 ,  0.21903199,  4.9218326 ,  0.24047054],
       [ 3.9278717 ,  2.9247496 , -3.0435607 , -3.7256243 , -2.3649197 ],
       [-1.7134072 , -3.843826  , -2.158249  , -0.3243039 ,  5.5616336 ],
       ...,
       [-3.9180384 , -4.2455378 ,  0.6913402 ,  5.671747  , -0.6879726 ],
       [-3.1156402 , -4.427724  , -1.651681  ,  1.8184496 ,  4.8120594 ],
       [-1.6694177 ,  3.8083298 ,  2.799943  , -2.284241  , -3.561364  ]],
      dtype=float32)

In [78]:
y_preds = y_logits.argmax(axis=1)

In [79]:
y_preds

array([3, 0, 4, ..., 3, 4, 1])

In [80]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [100]:
def plot_confusion_matrix(y_preds, y_true):
  cm = confusion_matrix(y_true, y_preds, normalize="true")

  fig, ax = plt.subplots(figsize=(6, 6))
  disp = ConfusionMatrixDisplay(confusion_matrix=cm)
  disp.plot(cmap='Blues', values_format=".2f", ax=ax, colorbar=False)
  ax.invert_yaxis()
  plt.title("Normalized Confusion Matrix")
  plt.show()

In [96]:
labels = [0,1,2,3,4]

In [97]:
y_true = []
for batch in tf_test_dataset:
    y_true.extend(batch[1].numpy())
y_true = np.array(y_true)
y_true

array([3, 0, 0, ..., 4, 4, 2])

In [102]:
cm = confusion_matrix(y_true, y_preds, normalize="true")

In [103]:
cm

array([[0.25070822, 0.14107649, 0.13342776, 0.13852691, 0.33626062],
       [0.26893676, 0.1362057 , 0.15010424, 0.11327311, 0.33148019],
       [0.281471  , 0.15346535, 0.13932107, 0.11527581, 0.31046676],
       [0.26251605, 0.15725289, 0.13478819, 0.12323492, 0.32220796],
       [0.25876384, 0.14483395, 0.13307196, 0.13284133, 0.33048893]])