In [23]:
!pip install pandas matplotlib

import ast
import pandas as pd
import matplotlib.pyplot as plt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
dtype_spec = {
    'sent': int,
    'words': int,
    'tok_b': int,
    'tok_d': int,
    'tok_x': int,
    'tok_s': int,
    'chrs': int
}

df = pd.read_csv('../../data/mulabel/raw/stats-short.csv', dtype=dtype_spec, on_bad_lines='warn')
#df['labels'] = df['labels'].apply(ast.literal_eval)

print(f'Number of samples: {df.shape[0]}')
print(f'And columns: {df.columns}')

ValueError: invalid literal for int() with base 10: 'sent'

Compute text length distributions

In [None]:
char_bins = [i for i in range(0, 20001, 1000)]
char_bins.append(float('inf'))
char_histogram_counts = pd.cut(df['chrs'], bins=char_bins).value_counts().sort_index()
char_histogram_percentages = (char_histogram_counts / df.shape[0]) * 100

word_bins = [i for i in range(0, 2001, 100)]
word_bins.append(float('inf'))
word_histogram_counts = pd.cut(df['tok_w'], bins=word_bins).value_counts().sort_index()
word_histogram_percentages = (word_histogram_counts / df.shape[0]) * 100

sent_bins = [i for i in range(0, 201, 10)]
sent_bins.append(float('inf'))
sent_histogram_counts = pd.cut(df['sent'], bins=sent_bins).value_counts().sort_index()
sent_histogram_percentages = (sent_histogram_counts / df.shape[0]) * 100

fig, axs = plt.subplots(1, 3, figsize=(20, 5))  # 1 row, 2 columns
char_histogram_percentages.plot(
    ax=axs[0], kind='bar',
    title='Histogram of Character Counts in Text Samples', 
    xlabel='Character Count Intervals',
    ylabel='Percentage of Samples'
)
axs[0].set_xticklabels(char_bins[1:])
word_histogram_percentages.plot(
    ax=axs[1], kind='bar', 
    title='Histogram of Word Token Counts in Text Samples', 
    xlabel='Word Token Count Intervals',
    ylabel='Percentage of Samples'
)
axs[1].set_xticklabels(word_bins[1:])
sent_histogram_percentages.plot(
    ax=axs[2], kind='bar', 
    title='Histogram of Sentence Counts in Text Samples', 
    xlabel='Sentence Count Intervals',
    ylabel='Percentage of Samples'
)
axs[2].set_xticklabels(sent_bins[1:])
plt.show()

Compute distribution by language

In [None]:
import matplotlib.pyplot as plt

language_counts = df['language'].value_counts()
filtered_language_counts = language_counts[language_counts > 100]
filtered_language_counts.plot(kind='bar')
plt.xlabel('Languages')
plt.ylabel('Number of samples')
plt.title('Number of samples for each language')
plt.show()

Distributions over the type of source (Media Type)

In [None]:
industries = df['type'].explode()
industries.value_counts().plot(kind='bar')
plt.xlabel('Media Types')
plt.ylabel('Number of samples')
plt.title('Number of samples for each Media Type')
plt.show()

Compute average characters, words and sentences per sub-word token for each language. (XLM-R and Deberta tokenizers)

In [None]:
df['char_xlmr_ratio'] = df['chrs'] / df['tok_x']
df['word_xlmr_ratio'] = df['words'] / df['tok_x']
df['sent_xlmr_ratio'] = df['sent'] / df['tok_x']
df['char_deb_ratio'] = df['chrs'] / df['tok_d']
df['word_deb_ratio'] = df['words'] / df['tok_d']
df['sent_deb_ratio'] = df['sent'] / df['tok_d']

language_statistics = df.groupby('language').agg(
    avg_chrs_cl100k=pd.NamedAgg(column='char_xlmr_ratio', aggfunc='mean'),
    avg_words_cl100k=pd.NamedAgg(column='word_xlmr_ratio', aggfunc='mean'),
    avg_sents_cl100k=pd.NamedAgg(column='sent_xlmr_ratio', aggfunc='mean'),
    avg_chrs_sp=pd.NamedAgg(column='char_deb_ratio', aggfunc='mean'),
    avg_words_sp=pd.NamedAgg(column='word_deb_ratio', aggfunc='mean'),
    avg_sents_sp=pd.NamedAgg(column='sent_deb_ratio', aggfunc='mean'),
    count=pd.NamedAgg(column='characters', aggfunc='count')
)

filtered_language_statistics = language_statistics[language_statistics['count'] > 1000].sort_values(by=['count'], ascending=[False])
display(filtered_language_statistics)