In [1]:
import warnings
warnings.filterwarnings("ignore")

<div align="center">
    <h4> Step 0 | Importing Libraries </h4>
</div>

In [2]:
import datasets
import pandas as pd
import numpy as np
import plotly.express as px

<div align="center">
    <h4> Step 1 | Dataset Importing </h4>
</div>

In [3]:
dataset = datasets.load_dataset("glue", "sst2")
dataset = dataset["train"].to_pandas()
dataset.head(10)

Unnamed: 0,sentence,label,idx
0,hide new secretions from the parental units,0,0
1,"contains no wit , only labored gags",0,1
2,that loves its characters and communicates som...,1,2
3,remains utterly satisfied to remain the same t...,0,3
4,on the worst revenge-of-the-nerds clichés the ...,0,4
5,that 's far too tragic to merit such superfici...,0,5
6,demonstrates that the director of such hollywo...,1,6
7,of saucy,1,7
8,a depressed fifteen-year-old 's suicidal poetry,0,8
9,are more deeply thought through than in most `...,1,9


<div align="center">
    <h4> Step 2 | Summary Statistics </h4>
</div>

In [4]:
def get_label_counts(dataset):
    label_counts = dataset["label"].value_counts()
    
    print(f'-'*55)
    print(f'Total Number of Labels: {len(label_counts)}')
    print(f'-'*55)

    for label in label_counts.index:
        print(f'Total Number of Samples with Label {label}: {label_counts[label]} | {round(label_counts[label]/len(dataset)*100, 2)}%')

    print(f'-'*55)
    print(f'Total Number of Samples in the Dataset: {len(dataset)}')
    print(f'-'*55)

get_label_counts(dataset)

-------------------------------------------------------
Total Number of Labels: 2
-------------------------------------------------------
Total Number of Samples with Label 1: 37569 | 55.78%
Total Number of Samples with Label 0: 29780 | 44.22%
-------------------------------------------------------
Total Number of Samples in the Dataset: 67349
-------------------------------------------------------


In [5]:
def get_stats(dataset):
    length = [len(sentence.split()) for sentence in dataset["sentence"]]

    print(f'-'*55)
    print('Mean Sentence Length:', np.mean(length)) 
    print(f'-'*55)
    print('0th Percentile Sentence Length:', np.percentile(length, 0))
    print('10th Percentile Sentence Length:', np.percentile(length, 10))
    print('25th Percentile Sentence Length:', np.percentile(length, 25))
    print('50th Percentile Sentence Length:', np.percentile(length, 50))
    print('75th Percentile Sentence Length:', np.percentile(length, 75))
    print('90th Percentile Sentence Length:', np.percentile(length, 90))
    print('100th Percentile Sentence Length:', np.percentile(length, 100))
    print(f'-'*55)

get_stats(dataset)

-------------------------------------------------------
Mean Sentence Length: 9.409553222765
-------------------------------------------------------
0th Percentile Sentence Length: 1.0
10th Percentile Sentence Length: 2.0
25th Percentile Sentence Length: 3.0
50th Percentile Sentence Length: 7.0
75th Percentile Sentence Length: 13.0
90th Percentile Sentence Length: 21.0
100th Percentile Sentence Length: 52.0
-------------------------------------------------------


In [6]:
def get_vocab_stats(dataset):
    num_tokens = []
    total_tokens = 0

    for sentence in dataset["sentence"]:
        tokens = sentence.split()
        num_tokens.append(len(tokens))
        total_tokens += len(tokens)

    print(f'-'*55)
    print(f'Vocabulary Size of the Dataset: {len(set(dataset["sentence"]))}')
    print(f'-'*55)
    print(f'Avg. Tokens per Sentence: {round(total_tokens/len(dataset), 2)}')
    print(f'-'*55)

get_vocab_stats(dataset)

-------------------------------------------------------
Vocabulary Size of the Dataset: 66978
-------------------------------------------------------
Avg. Tokens per Sentence: 9.41
-------------------------------------------------------


<div align="center">
    <h4> Step 3 | Data Visualization </h4>
</div>

In [7]:
fig = px.histogram(
    dataset, 
    x="label", 
    color="label",
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=310,
)
fig.update_layout(
    title_text="Label Distribution in the Dataset",
    xaxis_title_text="Label",
    yaxis_title_text="Count",
    bargap=0.1,
)
fig.update_traces(showlegend=True, opacity=0.8)
fig.show()

In [8]:
length = [len(sentence.split()) for sentence in dataset["sentence"]]

fig = px.histogram(
    dataset, 
    x=length, 
    color="label",
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=1600,
)
fig.update_layout(
    title_text="Sentence Length Distribution in the Dataset",
    xaxis_title_text="Sentence Length",
    yaxis_title_text="Count",
    bargap=0.1,
)
fig.update_traces(showlegend=True, opacity=0.8)
fig.show()