In [1]:
from utils.data_preparation import get_emotion_dataset, naive_bayes_preprocessing, bert_preprocessing
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the emotion dataset once, then wrap each of its three splits
# (train / validation / test) in its own DataFrame.
dataset = get_emotion_dataset()
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)


### Data Fields

- text: a string feature
- label: a classification label, with possible values including sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5).

In [3]:
# Preview the first five rows of the training split.
df_train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [4]:
# Summarise the training split: row count, columns, non-null counts, dtypes
# and approximate memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [5]:
# Distinct label ids that occur in the training split
# (pandas returns them in order of first appearance, not sorted).
df_train['label'].unique()

array([0, 3, 2, 5, 4, 1])

In [6]:
# Average text length in the training split, measured as the number of
# whitespace-separated tokens per row (not characters).
df_train['text'].str.split().str.len().mean()

19.1663125

In [7]:
# Longest text in each split, counted in whitespace-separated tokens.
# Printed one value per line in the order: train, validation, test.
print(
    *(
        frame['text'].str.split().str.len().max()
        for frame in (df_train, df_val, df_test)
    ),
    sep='\n',
)

66
61
61


### Preprocessing (Naive Bayes)

In [8]:
# Run the Naive Bayes preprocessing with remove_stopwords=True and
# use_bigrams=False. The helper hands back the per-split data together with
# a `vectorizer` object.
processed_data, vectorizer = naive_bayes_preprocessing(remove_stopwords=True, use_bigrams=False)

# Each split entry unpacks into a pair: (features X, labels y).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    processed_data[split_name]
    for split_name in ('train', 'validation', 'test')
)

In [9]:
# Report the dimensions of the feature matrix for every split,
# one line per split.
print("\n".join([
    f"X_train shape: {X_train.shape}",
    f"X_val shape: {X_val.shape}",
    f"X_test shape: {X_test.shape}",
]))

X_train shape: (16000, 14894)
X_val shape: (2000, 14894)
X_test shape: (2000, 14894)


In [10]:
# Show the first row of the training feature matrix.
# NOTE: because the matrix is sparse, only the stored (non-zero) entries are
# listed, each as `(row, column)` followed by the value in that cell.
print(X_train[0])


  (0, 3550)	1
  (0, 4838)	1
  (0, 6309)	1


### Preprocessing (BERT)

In [11]:
# Run the BERT preprocessing; it hands back the per-split data together with
# a `tokenizer` object.
# NOTE: this rebinds `processed_data`, `X_*` and `y_*`, replacing the values
# assigned in the Naive Bayes section above.
processed_data, tokenizer = bert_preprocessing()

# Each split entry unpacks into three values: (X, Mask, y).
# Mask_* is presumably the attention mask matching X_* -- TODO confirm.
(
    (X_train, Mask_train, y_train),
    (X_val, Mask_val, y_val),
    (X_test, Mask_test, y_test),
) = (
    processed_data[split_name]
    for split_name in ('train', 'validation', 'test')
)

In [12]:
# Report the dimensions of the BERT input for every split,
# one line per split.
print("\n".join([
    f"X_train shape: {X_train.shape}",
    f"X_val shape: {X_val.shape}",
    f"X_test shape: {X_test.shape}",
]))

X_train shape: torch.Size([16000, 70])
X_val shape: torch.Size([2000, 70])
X_test shape: torch.Size([2000, 70])


In [13]:
# Show the encoded first training example.
# The trailing zeros look like padding up to the common row length -- TODO
# confirm against bert_preprocessing.
print(X_train[0])

tensor([  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
