In [27]:
#!pip install datasets
#!pip install plotly
#!pip install nbformat
#!pip install imbalanced-learn
!pip install transformers

Collecting transformers
  Downloading transformers-4.37.1-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.37.1-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m492.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading safetensors-0.4.2-cp311-cp311-macosx_11_0_arm64.whl (393 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.4/393.4 kB[0m [31m479.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tokenizers-0.15.1-cp311-cp311-macosx_11_0_arm64.whl (2.5 MB)
[2K   [90m

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
import panel as pn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
import warnings; warnings.filterwarnings('ignore')

In [3]:
validation = pd.read_csv('./validation.csv')
train = pd.read_csv('./training.csv')
test = pd.read_csv('./test.csv')
print(train.head())

                                                text  label
0                            i didnt feel humiliated      0
1  i can go from feeling so hopeless to so damned...      0
2   im grabbing a minute to post i feel greedy wrong      3
3  i am ever feeling nostalgic about the fireplac...      2
4                               i am feeling grouchy      3


In [4]:
print('Dataset information:')
print(f'Training data: {train.shape}')
print(f'Validation data: {validation.shape}')
print(f'Test data: {test.shape}')

Dataset information:
Training data: (16000, 2)
Validation data: (2000, 2)
Test data: (2000, 2)


In [14]:
from datasets import Dataset,DatasetDict,Features,Value,ClassLabel

# Don't forget the class label data
class_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
ft = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

# Combine Multiple Datasets 
emotions = DatasetDict({
    "train": Dataset.from_pandas(train,features=ft),
    "test": Dataset.from_pandas(test,features=ft),
    "validation": Dataset.from_pandas(validation,features=ft)
    })

# Convert a single DataFrame to a Dataset
# emotions = Dataset.from_pandas(train,features=ft)

emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [15]:
emotions.set_format(type="pandas")
df = emotions["train"][:]

In [16]:
def label_int2str(row):
    return emotions["train"].features["label"].int2str(row)

df["label_name"] = df["label"].apply(label_int2str)


In [18]:
df

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger
...,...,...,...
15995,i just had a very brief time in the beanbag an...,0,sadness
15996,i am now turning and i feel pathetic that i am...,0,sadness
15997,i feel strong and good overall,1,joy
15998,i feel like this was such a rude comment and i...,3,anger


In [24]:
import plotly.express as px

px.bar(df['label_name'].value_counts(ascending=True),template='plotly_white')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [32]:
#Tokenizer
from transformers import AutoTokenizer
# Load parameters of the tokeniser
#model_ckpt = "distilbert-base-uncased"
#tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def preprocess_function(examples):
    inputs = [doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, padding=True, truncation=True)
    labels = examples['label']
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = emotions.map(preprocess_function, batched=True)

Map: 100%|██████████| 16000/16000 [00:00<00:00, 27292.91 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 45295.10 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 45592.49 examples/s]


In [33]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})


In [43]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.datasets import make_classification

df = tokenized_dataset['train'].to_pandas()

# Separate features (X) and labels (y)
X = df.drop(columns='labels')  # Assuming the label column is named 'label'
y = df['labels']

# Create a RandomUnderSampler instance
undersampler = RandomUnderSampler(random_state=42)

# Resample the dataset
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Convert the resampled data back to a DataFrame (if necessary)
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)  # Replace columns with appropriate column names
y_resampled_df = pd.DataFrame(y_resampled, columns=['labels'])  # Replace 'label' with your target column name

# Combine X and y back into a single DataFrame (if necessary)
resampled_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)


# Now X_resampled and y_resampled contain the balanced dataset

In [50]:
print(X_resampled_df.info)
#print(y_resampled_df.info)
#px.bar(y_resampled_df['labels'].value_counts(ascending=True),template='plotly_white')

<bound method DataFrame.info of                                                input_ids  \
11433  [101, 10047, 3110, 2061, 10223, 6508, 2027, 26...   
9343   [101, 1045, 2514, 3811, 27322, 102, 0, 0, 0, 0...   
11833  [101, 1045, 2514, 26608, 2000, 8970, 2017, 200...   
88     [101, 1045, 2514, 2061, 22692, 2076, 2216, 233...   
4119   [101, 1045, 2145, 3335, 2032, 1998, 2514, 2738...   
...                                                  ...   
15871  [101, 1045, 2985, 1037, 2843, 1997, 2051, 3110...   
15943  [101, 1045, 2903, 1996, 2087, 8141, 2514, 7622...   
15970  [101, 1045, 2031, 2000, 6449, 10047, 3110, 349...   
15974  [101, 1045, 2572, 3110, 2172, 2066, 1996, 3124...   
15989  [101, 1045, 2342, 2000, 2079, 2023, 2008, 1998...   

                                          attention_mask  
11433  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
9343   [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
11833  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...  
88     [1, 

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_df, y_resampled_df, test_size=0.2)


In [96]:
from tensorflow.keras import layers
model = Sequential()
model.add(layers.Embedding(5000, 20)) #The embedding layer
model.add(layers.LSTM(15,dropout=0.5)) #Our LSTM layer
model.add(layers.Dense(6,activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 20)          100000    
                                                                 
 lstm_4 (LSTM)               (None, 15)                2160      
                                                                 
 dense_10 (Dense)            (None, 6)                 96        
                                                                 
Total params: 102256 (399.44 KB)
Trainable params: 102256 (399.44 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [123]:
X_train = np.array(X_train)
X_train = tf.convert_to_tensor(X_train, dtype=tf.int64)
#y_train = np.array(y_train)
#y_train = tf.convert_to_tensor(y_train, dtype=tf.int64)
#X_test = tf.convert_to_tensor(X_test, dtype=tf.int64)
#y_test = tf.convert_to_tensor(y_test, dtype=tf.int64)

ValueError: setting an array element with a sequence.

In [118]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).