In [60]:
#!pip install datasets
#!pip install plotly
#!pip install nbformat
#!pip install imbalanced-learn
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [61]:
import numpy as np
import pandas as pd
import tensorflow as tf
import panel as pn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
import warnings; warnings.filterwarnings('ignore')

In [62]:
validation = pd.read_csv('./validation.csv')
train = pd.read_csv('./training.csv')
test = pd.read_csv('./test.csv')
print(train.head())

                                                text  label
0                            i didnt feel humiliated      0
1  i can go from feeling so hopeless to so damned...      0
2   im grabbing a minute to post i feel greedy wrong      3
3  i am ever feeling nostalgic about the fireplac...      2
4                               i am feeling grouchy      3


In [63]:
print('Dataset information:')
print(f'Training data: {train.shape}')
print(f'Validation data: {validation.shape}')
print(f'Test data: {test.shape}')

Dataset information:
Training data: (16000, 2)
Validation data: (2000, 2)
Test data: (2000, 2)


In [64]:
from datasets import Dataset,DatasetDict,Features,Value,ClassLabel

# Don't forget the class label data
class_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
ft = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

# Combine Multiple Datasets 
emotions = DatasetDict({
    "train": Dataset.from_pandas(train,features=ft),
    "test": Dataset.from_pandas(test,features=ft),
    "validation": Dataset.from_pandas(validation,features=ft)
    })

# Convert a single DataFrame to a Dataset
# emotions = Dataset.from_pandas(train,features=ft)

emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [65]:
emotions.set_format(type="pandas")
df = emotions["train"][:]

In [66]:
def label_int2str(row):
    return emotions["train"].features["label"].int2str(row)

df["label_name"] = df["label"].apply(label_int2str)


In [67]:
df

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger
...,...,...,...
15995,i just had a very brief time in the beanbag an...,0,sadness
15996,i am now turning and i feel pathetic that i am...,0,sadness
15997,i feel strong and good overall,1,joy
15998,i feel like this was such a rude comment and i...,3,anger


In [68]:
import plotly.express as px

px.bar(df['label_name'].value_counts(ascending=True),template='plotly_white')

In [69]:
#Tokenizer
'''from transformers import AutoTokenizer
# Load parameters of the tokeniser
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def preprocess_function(examples):
    inputs = [doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
    labels = examples['label']
    model_inputs["text"] = examples['text']
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = emotions.map(preprocess_function, batched=True)
'''


'from transformers import AutoTokenizer\n# Load parameters of the tokeniser\nmodel_ckpt = "distilbert-base-uncased"\ntokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n\ndef preprocess_function(examples):\n    inputs = [doc for doc in examples["text"]]\n    model_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors=\'pt\')\n    labels = examples[\'label\']\n    model_inputs["text"] = examples[\'text\']\n    model_inputs["labels"] = labels\n    return model_inputs\n\ntokenized_dataset = emotions.map(preprocess_function, batched=True)\n'

In [70]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})


In [97]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
from sklearn.datasets import make_classification

df = tokenized_dataset['train'].to_pandas()

# Separate features (X) and labels (y)
X = df.drop(columns='labels')  # Assuming the label column is named 'label'
y = df['labels']

# Create a RandomUnderSampler instance
undersampler = RandomUnderSampler(sampling_strategy={class_label: 572 for class_label in range(6)}, random_state=42)

# Resample the dataset
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Convert the resampled data back to a DataFrame (if necessary)
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)  # Replace columns with appropriate column names
y_resampled_df = pd.DataFrame(y_resampled, columns=['labels'])  # Replace 'label' with your target column name

# Combine X and y back into a single DataFrame (if necessary)
resampled_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)


# Now X_resampled and y_resampled contain the balanced dataset

In [243]:
print(X_resampled_df.info())
print('--------------------')
print(y_resampled_df.info())
#print(y_resampled_df.info)
#px.bar(y_resampled_df['labels'].value_counts(ascending=True),template='plotly_white')

<class 'pandas.core.frame.DataFrame'>
Index: 3432 entries, 11433 to 559
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            3432 non-null   object
 1   input_ids       3432 non-null   object
 2   attention_mask  3432 non-null   object
dtypes: object(3)
memory usage: 107.2+ KB
None
--------------------
<class 'pandas.core.frame.DataFrame'>
Index: 3432 entries, 11433 to 559
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   labels  3432 non-null   int64
dtypes: int64(1)
memory usage: 53.6 KB
None


In [220]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_df, y_resampled_df, test_size=0.2)


In [221]:
#X_train = np.array(X_train['input_ids'])
#X_test = np.array(X_test['input_ids'])
X_train= X_train['input_ids']
X_test= X_test['input_ids']

In [245]:
#print(X_train[390].to_numpy())
print(X_train.to_numpy().dtype)

object


In [242]:
from tensorflow.keras import layers
from keras.optimizers import Adam
model = Sequential()
model.add(layers.Embedding(2000,20)) #The embedding layer
model.add(layers.LSTM(15,dropout=0.5)) #Our LSTM layer
model.add(layers.Dense(6,activation='softmax'))
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 20)          40000     
                                                                 
 lstm_3 (LSTM)               (None, 15)                2160      
                                                                 
 dense_3 (Dense)             (None, 6)                 96        
                                                                 
Total params: 42256 (165.06 KB)
Trainable params: 42256 (165.06 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [196]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).