In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from environment_constants import COLUMN_NAMES
from random import shuffle
from dataloader import sentences_and_meta_df, make_conllu_files_list, make_meta_files_list, get_meta_file_path
from understanding_the_corpus import values_in_column, files_where_column_not_empty, files_where_column_has_value, count_words_in_sentence

In [2]:
conllu_list = make_conllu_files_list()

In [3]:
total_word_count_by_house = None
total_word_count_by_gender = None
total_word_count_by_party_status = None

total_sentences_by_house = None
total_sentences_by_gender = None
total_sentences_by_party_status = None

total_speakers_by_house = None
total_speakers_by_gender = None
total_speakers_by_party_status = None

for i, conllu_path in tqdm(conllu_list):
    meta_path = get_meta_file_path(conllu_path)
    meta_df = pd.read_csv(meta_path, sep = '\t')[COLUMN_NAMES]
    df = sentences_and_meta_df(conllu_path)

    # Add column containing word counts per sentence
    df['word_count'] = df['sentence_df'].apply(count_words_in_sentence)

    # group dataframe by house, gender and party status
    df_by_house = df.groupby('House')
    df_by_gender = df.groupby('Speaker_gender')
    df_by_party_status =df.groupby('Party_status')

    # Aggregate by word counts
    word_count_by_house = df_by_house['word_count'].sum()
    word_count_by_gender = df_by_gender['word_count'].sum()
    word_count_by_party_status = df_by_party_status['word_count'].sum()

    # Aggregate by #sentences
    sentences_by_house = df_by_house['sent_id'].count()
    sentences_by_gender = df_by_gender['sent_id'].count()
    sentences_by_party_status = df_by_party_status['sent_id'].count()

    # Aggregate unique speakers
    speakers_by_house = df_by_house['Speaker_name'].value_counts()
    speakers_by_gender = df_by_gender['Speaker_name'].value_counts()
    speakers_by_party_status = df_by_party_status['Speaker_name'].value_counts()

    if total_speakers_by_house is None:
        # first iteration
        total_word_count_by_house = word_count_by_house
        total_word_count_by_gender = word_count_by_gender
        total_word_count_by_party_status = word_count_by_party_status

        total_sentences_by_house = sentences_by_house
        total_sentences_by_gender = sentences_by_gender
        total_sentences_by_party_status = sentences_by_party_status

        total_speakers_by_house = speakers_by_house
        total_speakers_by_gender = speakers_by_gender
        total_speakers_by_party_status = speakers_by_party_status
    else:
        # All other iterations
        total_word_count_by_house = total_word_count_by_house.add(word_count_by_house, fill_value=0)
        total_word_count_by_gender = total_word_count_by_gender.add(word_count_by_gender, fill_value=0)
        total_word_count_by_party_status = total_word_count_by_party_status.add(word_count_by_party_status, fill_value=0)

        total_sentences_by_house = total_sentences_by_house.add(sentences_by_house, fill_value=0)
        total_sentences_by_gender = total_sentences_by_gender.add(sentences_by_gender, fill_value=0)
        total_sentences_by_party_status = total_sentences_by_party_status.add(sentences_by_party_status, fill_value=0)

        total_speakers_by_house = total_speakers_by_house.add(speakers_by_house, fill_value=0)
        total_speakers_by_gender = total_speakers_by_gender.add(speakers_by_gender, fill_value=0)
        total_speakers_by_party_status = total_speakers_by_party_status.add(speakers_by_party_status, fill_value=0)

    if i == 250:
        break

0it [00:00, ?it/s]

In [8]:
print(total_word_count_by_house)

House
Lower house    8208978.0
Upper house    5799556.0
Name: word_count, dtype: float64


In [9]:
print(total_word_count_by_gender)

Speaker_gender
-       4854.0
F    4231840.0
M    9771840.0
Name: word_count, dtype: float64


In [10]:
print(total_word_count_by_party_status)

Party_status
-                 4854.0
Coalition       360614.0
Opposition    13533466.0
Name: word_count, dtype: float64


In [11]:
print(total_sentences_by_house)

House
Lower house    346025.0
Upper house    245360.0
Name: sent_id, dtype: float64


In [12]:
print(total_sentences_by_gender)

Speaker_gender
-       574.0
F    178727.0
M    412084.0
Name: sent_id, dtype: float64


In [13]:
print(total_sentences_by_party_status)

Party_status
-                574.0
Coalition      15373.0
Opposition    570848.0
Name: sent_id, dtype: float64


In [14]:
total_speakers_by_party_status.groupby('Party_status').count()

Party_status
-                1
Coalition      305
Opposition    1657
Name: Speaker_name, dtype: int64

In [15]:
total_speakers_by_house.groupby('House').count()

House
Lower house    951
Upper house    773
Name: Speaker_name, dtype: int64

In [16]:
total_speakers_by_gender.groupby('Speaker_gender').count()

Speaker_gender
-       1
F     528
M    1165
Name: Speaker_name, dtype: int64