In [1]:
!pip install pandas matplotlib

import ast
import pandas as pd
import matplotlib.pyplot as plt



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Path of the index CSV, resolved relative to the notebook's working directory.
# The file must exist there, otherwise read_csv raises FileNotFoundError.
DATA_PATH = 'EMMA_1mio-v1.0-index.csv'


def parse_list_cell(value):
    """Convert the CSV text of a list-valued cell into a Python object.

    Args:
        value: Raw cell content as read by pandas. Expected to be the string
            form of a Python literal such as "['a', 'b']".

    Returns:
        The evaluated literal for string input. Any non-string input (pandas
        stores missing cells as float NaN) yields an empty list, which
        `Series.explode` turns into NaN so the row is ignored by
        `value_counts` instead of crashing `ast.literal_eval`.

    Raises:
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    return []


df = pd.read_csv(DATA_PATH)
# 'industries' is stored as text in the CSV; parse it so it can be exploded later.
df['industries'] = df['industries'].apply(parse_list_cell)

print(f'Number of samples: {df.shape[0]}')
print(f'And columns: {df.columns}')

FileNotFoundError: [Errno 2] No such file or directory: 'EMMA_1mio-v1.0-index.csv'

Compute text length distributions

In [None]:
def length_histogram(values, step, upper):
    """Bucket a numeric column into fixed-width bins plus one overflow bin.

    Args:
        values: pandas Series of non-negative counts (one entry per sample).
        step: Width of each regular bin.
        upper: Upper edge of the last regular bin; larger values fall into a
            final (upper, inf] bin.

    Returns:
        Tuple ``(bins, counts, percentages)``: the list of bin edges, the
        number of samples per bin (ordered by bin), and the same counts as a
        percentage of ``len(values)``.
    """
    bins = list(range(0, upper + 1, step))
    bins.append(float('inf'))
    # include_lowest=True keeps samples whose value is exactly 0; with the
    # default right-closed bins they would fall outside (0, step] and vanish
    # from the histogram while still being counted in the denominator.
    counts = pd.cut(values, bins=bins, include_lowest=True).value_counts().sort_index()
    percentages = (counts / len(values)) * 100
    return bins, counts, percentages


char_bins, char_histogram_counts, char_histogram_percentages = length_histogram(
    df['characters'], step=1000, upper=20000
)
word_bins, word_histogram_counts, word_histogram_percentages = length_histogram(
    df['word_tok'], step=100, upper=2000
)
sent_bins, sent_histogram_counts, sent_histogram_percentages = length_histogram(
    df['sentences'], step=10, upper=200
)

# One panel per length measure: (bin edges, percentages, title, x-axis label).
histogram_panels = [
    (char_bins, char_histogram_percentages,
     'Histogram of Character Counts in Text Samples', 'Character Count Intervals'),
    (word_bins, word_histogram_percentages,
     'Histogram of Word Token Counts in Text Samples', 'Word Token Count Intervals'),
    (sent_bins, sent_histogram_percentages,
     'Histogram of Sentence Counts in Text Samples', 'Sentence Count Intervals'),
]

fig, axs = plt.subplots(1, 3, figsize=(20, 5))  # 1 row, 3 columns
for ax, (bins, percentages, title, xlabel) in zip(axs, histogram_panels):
    percentages.plot(
        ax=ax, kind='bar',
        title=title,
        xlabel=xlabel,
        ylabel='Percentage of Samples'
    )
    # Label every bar with the upper edge of its bin (the last one is inf).
    ax.set_xticklabels(bins[1:])
plt.show()

Compute distribution by language

In [None]:
# Only languages with more than this many samples are plotted.
# NOTE(review): presumably chosen to keep the chart readable — confirm the cutoff.
MIN_SAMPLES_PER_LANGUAGE = 100

language_counts = df['language'].value_counts()
filtered_language_counts = language_counts[language_counts > MIN_SAMPLES_PER_LANGUAGE]

# Draw on an explicit axes object instead of relying on pyplot's implicit
# "current figure" state; matplotlib.pyplot is already imported in the first cell.
fig, ax = plt.subplots()
filtered_language_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Languages')
ax.set_ylabel('Number of samples')
ax.set_title('Number of samples for each language')
plt.show()

Distributions over the type of source (Media Type)

In [None]:
# Count samples per media type. The series gets its own name so it no longer
# masquerades as (and overwrites) the `industries` variable used for the
# industry chart.
# NOTE(review): 'media_type' is not parsed with ast.literal_eval in the loading
# cell, so explode() only splits rows if the column already holds list-like
# values — confirm against the CSV.
media_types = df['media_type'].explode()
media_types.value_counts().plot(kind='bar')
plt.xlabel('Media Types')
plt.ylabel('Number of samples')
plt.title('Number of samples for each Media Type')
plt.show()

Distribution over industries / domains

In [None]:
# Flatten the per-sample industry lists to one row per (sample, industry) pair,
# then chart how often each industry occurs.
industries = df['industries'].explode()
industry_axes = industries.value_counts().plot(kind='bar')
industry_axes.set_xlabel('Industries')
industry_axes.set_ylabel('Number of samples')
industry_axes.set_title('Number of samples for each industry')
plt.show()

Compute the average number of characters, words, and sentences per sub-word token for each language, for both the cl100k_base and SentencePiece encodings.

In [None]:
# Per-sample ratios of text units to sub-word tokens.
# Each entry: (ratio column to create, text-unit column, token-count column).
# NOTE(review): a token count of 0 would produce inf/NaN ratios and distort the
# per-language means — confirm no such rows exist.
ratio_definitions = [
    ('char_cl100k_ratio', 'characters', 'cl100k_tok'),
    ('word_cl100k_ratio', 'word_tok', 'cl100k_tok'),
    ('sent_cl100k_ratio', 'sentences', 'cl100k_tok'),
    ('char_sp_ratio', 'characters', 'sp_tok'),
    ('word_sp_ratio', 'word_tok', 'sp_tok'),
    ('sent_sp_ratio', 'sentences', 'sp_tok'),
]
for ratio_column, unit_column, token_column in ratio_definitions:
    df[ratio_column] = df[unit_column] / df[token_column]

# Output column -> ratio column whose per-language mean it reports.
mean_columns = {
    'avg_chrs_cl100k': 'char_cl100k_ratio',
    'avg_words_cl100k': 'word_cl100k_ratio',
    'avg_sents_cl100k': 'sent_cl100k_ratio',
    'avg_chrs_sp': 'char_sp_ratio',
    'avg_words_sp': 'word_sp_ratio',
    'avg_sents_sp': 'sent_sp_ratio',
}
aggregations = {
    output_name: pd.NamedAgg(column=source_column, aggfunc='mean')
    for output_name, source_column in mean_columns.items()
}
# 'count' is the number of non-null 'characters' values per language.
aggregations['count'] = pd.NamedAgg(column='characters', aggfunc='count')

language_statistics = df.groupby('language').agg(**aggregations)

# Keep languages with more than 1000 samples, largest first.
has_enough_samples = language_statistics['count'] > 1000
filtered_language_statistics = (
    language_statistics
    .loc[has_enough_samples]
    .sort_values(by=['count'], ascending=[False])
)
display(filtered_language_statistics)