In [1]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import DataReader
# Select matplotlib's Tk-based GUI backend. TkAgg is an *interactive* backend
# (figures open in a separate window on plt.show()); the non-interactive
# counterpart is 'Agg'.
# NOTE(review): inside a notebook this presumably replaces the inline backend, so
# figures would not be embedded in the cell output -- confirm this is intended.
matplotlib.use('TkAgg')

In [2]:
# Build the reader for "UIT_VFSC" and pull out the four frames it exposes
# (train / test / dev / total), in that order.
data_reader = DataReader.DataReader("UIT_VFSC")
df_train, df_test, df_dev, df_total = (
    data_reader.df_train,
    data_reader.df_test,
    data_reader.df_dev,
    data_reader.df_total,
)

In [4]:
# Row count of every split; only the overall total is printed.
len_train, len_test, len_dev, len_total = map(
    len, (df_train, df_test, df_dev, df_total)
)
print(len_total)

16175


In [16]:
# Share of each split relative to the whole dataset, as a percentage
# rounded to two decimals.
pct_train = round(len_train / len_total * 100, 2)
pct_test = round(len_test / len_total * 100, 2)
pct_dev = round(len_dev / len_total * 100, 2)
print(f'% data train: {pct_train}')
print(f'% data test: {pct_test}')
print(f'% data dev: {pct_dev}')

% data train: 70.64
% data test: 19.57
% data dev: 9.79


In [18]:
# Min, max and average number of words per sentence.
# A "word" is a whitespace-separated token. The per-sentence counts are
# collected once and the statistics are taken from them, so no sentinel start
# value (previously 1000000) can leak into the result, and the average is
# divided by the number of sentences actually counted rather than by a length
# computed in another cell.
word_counts = [len(sentence.split()) for sentence in df_total["corpus"]]
num_of_word = sum(word_counts)
# default=0 keeps an empty corpus from raising or reporting a bogus extreme.
max_now = max(word_counts, default=0)
min_now = min(word_counts, default=0)
# Guard the division: an empty corpus has an average of 0 instead of crashing.
avg_now = round(num_of_word / len(word_counts)) if word_counts else 0
print(f'max word in sentence: {max_now}')
print(f'min word in sentence: {min_now}')
print(f'average number of word in sentence: {avg_now}')

max word in sentence: 123
min word in sentence: 1
average number of word in sentence: 9


In [22]:
def find_vocab(corpus: list) -> list:
    """Collect the distinct non-numeric tokens that occur in a corpus.

    Every sentence is split on whitespace, and tokens consisting solely of
    digit characters (``str.isdigit``) are skipped. Matching is case-sensitive.

    Args:
        corpus: Sentences to scan. It is only iterated, so a list or a pandas
            Series of strings both work.

    Returns:
        The unique tokens as a list in ascending sorted order. Sorting makes
        the result identical on every run; plain set iteration order depends
        on string hashing and can change between interpreter sessions.
    """
    return sorted(
        {token for sent in corpus for token in sent.split() if not token.isdigit()}
    )

In [24]:
# Vocabulary of the whole dataset and of each split, followed by their sizes.
vocab_total, vocab_train, vocab_dev, vocab_test = [
    find_vocab(split["corpus"])
    for split in (df_total, df_train, df_dev, df_test)
]
size_total = len(vocab_total)
size_train = len(vocab_train)
size_dev = len(vocab_dev)
size_test = len(vocab_test)
print(f'vocab total size: {size_total}')
print(f'vocab train size: {size_train}')
print(f'vocab dev size: {size_dev}')
print(f'vocab test size: {size_test}')

vocab total size: 4760
vocab train size: 4023
vocab dev size: 1469
vocab test size: 2160


In [20]:
# Count how often each label occurs in every split and pair the counts with
# readable label names.
label_series = pd.Series({
    0: "negative",
    1: "neutral",
    2: "positive"
})


def _label_count_frame(counts: pd.Series) -> pd.DataFrame:
    """Join the label names with ``counts``; a label absent from ``counts`` gets 0.

    Args:
        counts: Result of ``value_counts()`` on a "label" column, indexed by label id.

    Returns:
        DataFrame indexed by label id with the columns "label" and "count".
    """
    frame = pd.concat([label_series, counts], axis=1, keys=["label", "count"])
    # concat aligns on the label id, so a class that never occurs in a split
    # ends up as NaN (and turns the column into floats). Store an integer 0
    # instead so the counts stay summable and usable as bar heights/offsets.
    frame["count"] = frame["count"].fillna(0).astype("int64")
    return frame


count_label_train = df_train["label"].value_counts()
count_label_dev = df_dev["label"].value_counts()
count_label_total = df_total["label"].value_counts()
count_label_test = df_test["label"].value_counts()
df_label_train = _label_count_frame(count_label_train)
df_label_dev = _label_count_frame(count_label_dev)
df_label_total = _label_count_frame(count_label_total)
df_label_test = _label_count_frame(count_label_test)
# Same report order as before: train, dev, test, total.
for split_name, split_frame in (
    ("train", df_label_train),
    ("dev", df_label_dev),
    ("test", df_label_test),
    ("total", df_label_total),
):
    print(f'dataframe of {split_name} dataset:')
    print(split_frame)

dataframe of train dataset:
      label  count
0  negative   5325
1   neutral    458
2  positive   5643
dataframe of dev dataset:
      label  count
0  negative    705
1   neutral     73
2  positive    805
dataframe of test dataset:
      label  count
0  negative   1409
1   neutral    167
2  positive   1590
dataframe of total dataset:
      label  count
0  negative   7439
1   neutral    698
2  positive   8038


In [21]:
# Stacked bar chart of label counts: train at the bottom, dev on top of it,
# test on top of both. `stack_base` carries the height already drawn.
stack_base = 0
for split_frame, bar_colour in (
    (df_label_train, "red"),
    (df_label_dev, "blue"),
    (df_label_test, "green"),
):
    plt.bar(split_frame["label"], split_frame["count"], bottom=stack_base, color=bar_colour)
    stack_base = stack_base + split_frame["count"]
plt.title("Quantity of each label in train dev test dataset")
plt.xlabel("Label")
plt.ylabel("Quantity")
# Legend entries follow the order in which the bars were drawn.
plt.legend(["Train", "Dev", "Test"])
plt.show()