# Dictionary Exploration
This notebook explores the files in `../Dataset/cv-corpus-20.0-delta-2024-12-06/en`.

> **Conclusion:** The information in the dictionaries is limited and useless. There is no way to match the dictionaries with the audios.

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
data_path = "../Dataset/cv-corpus-20.0-delta-2024-12-06/en/"

os.listdir(data_path)

['validated_sentences.tsv',
 '.DS_Store',
 'clips',
 'unvalidated_sentences.tsv',
 'validated.tsv',
 'clip_durations.tsv',
 'other.tsv',
 'reported.tsv',
 'invalidated.tsv']

## Dictionaries
- validated_sentences.tsv
- unvalidated_sentences.tsv
- validated.tsv
- clip_durations.tsv
- other.tsv
- reported.tsv
- invalidated.tsv

In [3]:
validated_sentences = pd.read_csv(data_path + "validated_sentences.tsv", sep="\t")
validated_sentences.shape

(1675168, 6)

In [4]:
validated_sentences.iloc[0]

sentence_id        00096855bf293f19ffc573be5ce24a1514c176f0bf1bee...
sentence                 Oh well, I'll just set up in another Qdoba.
sentence_domain                                                  NaN
source                                                 Self Citation
is_used                                                            1
clips_count                                                        0
Name: 0, dtype: object

In [5]:
# We will work only with validates sentences
validated_sentences_ids = validated_sentences["sentence_id"].values

In [6]:
unvalidated_sentences = pd.read_csv(data_path + "unvalidated_sentences.tsv", sep="\t")
unvalidated_sentences.shape

(33101, 4)

In [7]:
unvalidated_sentences.iloc[0]

sentence_id        00027996d48af96a15417e61bdf0b3b11fc250545715bd...
sentence           Dana Hourani - Dana Hourani is a lifestyle blo...
sentence_domain                                                  NaN
source             https://hf.co/datasets/nyuuzyou/chatgpt-in-rus...
Name: 0, dtype: object

In [8]:
# This file could be replaced with other.tsv
validated = pd.read_csv(data_path + "validated.tsv", sep="\t")
print(validated.shape) # Only 250 audios are in this dataset. The 250 audios present in validated_sentences

(250, 13)


In [9]:
validated.iloc[0]

client_id          031903093b6fa1aeb0a243843eb9ed57baf6e99d1f8f92...
path                                    common_voice_en_41383256.mp3
sentence_id        f19a785911b1a3b1338e3eb5cc785e58b8381d21ec7c33...
sentence           The outer rim has undergone some erosion due t...
sentence_domain                                                  NaN
up_votes                                                           2
down_votes                                                         0
age                                                              NaN
gender                                                           NaN
accents                                                          NaN
variant                                                          NaN
locale                                                            en
segment                                                          NaN
Name: 0, dtype: object

In [10]:
clip_durations = pd.read_csv(data_path + "clip_durations.tsv", sep="\t")
clip_durations.shape

(27408, 2)

In [11]:
clip_durations.iloc[0] # It's not clear how we can match the information between files!

clip            common_voice_en_41447677.mp3
duration[ms]                            6660
Name: 0, dtype: object

In [12]:
other = pd.read_csv(data_path + "other.tsv", sep="\t")
other.shape

(27087, 13)

In [13]:
other.iloc[0] 

client_id          0b58339ed9062b97e92c9cc6a5d46f8a517bdd8f5d6b83...
path                                    common_voice_en_41236242.mp3
sentence_id        f1662e17d5758c78c88f7ccd83a54d93c2bfbbfd0cb6ea...
sentence           She studied theatre and spent time as a direct...
sentence_domain                                                  NaN
up_votes                                                           0
down_votes                                                         0
age                                                              NaN
gender                                                           NaN
accents                                                          NaN
variant                                                          NaN
locale                                                            en
segment                                                          NaN
Name: 0, dtype: object

In [14]:
# We have age information for 75% of the data and gender information for 43%
# sentence_domain, variant and segment are useless.
other.isna().mean()

client_id          0.000000
path               0.000000
sentence_id        0.000000
sentence           0.000000
sentence_domain    0.998191
up_votes           0.000000
down_votes         0.000000
age                0.253627
gender             0.565474
accents            0.528150
variant            1.000000
locale             0.000000
segment            1.000000
dtype: float64

In [15]:
# All the files in other are in validated_sentences
print(len(set(other["sentence_id"].values) - set(validated_sentences_ids)))
# validated_sentences has 1'648.601 files that are not in other
len(set(validated_sentences_ids) - set(other["sentence_id"].values))

0


1648601

In [16]:
reported = pd.read_csv(data_path + "reported.tsv", sep="\t")
reported.shape

(124, 4)

In [17]:
reported.iloc[0]

sentence_id    1bdd4311452a2877c0debda1718696a680f73d6feeffa4...
sentence       Minu lemmiktegelane läbi aegade on Katniss Eve...
locale                                                        en
reason                                     sc-different-language
Name: 0, dtype: object

In [18]:
invalidated = pd.read_csv(data_path + "invalidated.tsv", sep="\t")
invalidated.shape

(63, 13)

In [19]:
invalidated.iloc[0]

client_id          29b8505586cd43382cd695da6b943f401104be710a5b60...
path                                    common_voice_en_41281293.mp3
sentence_id        f17037ded2368b480137e6de57109b67c2c41efb3b8ce0...
sentence           He also contributed to the leading Hebrew and ...
sentence_domain                                                  NaN
up_votes                                                           0
down_votes                                                         2
age                                                         fourties
gender                                               female_feminine
accents                                             Scottish English
variant                                                          NaN
locale                                                            en
segment                                                          NaN
Name: 0, dtype: object

In [20]:
# Cleaning validated sentences
corrupt_ids = pd.concat([invalidated["sentence_id"], reported["sentence_id"]]).reset_index(drop = True)
corrupt_ids = corrupt_ids[corrupt_ids.notna()].values

# New validated_sentences_ids
validated_sentences_ids = list(set(validated_sentences_ids) - set(corrupt_ids))