Imports
====

In [1]:
import numpy as np
%cd ..
# I need this to access modules from the main folder. Do not move this comment up into the same line

import pandas as pd
from tqdm.notebook import tqdm
import swifter
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as st

from generate_dataframe import generate_sentences_and_meta_df_from_multiple_files
from load_saved_dataframe import load_saved_df
from numerals import *
from concordance import *
from helper_methods import try_apply, drop_na_with_count

C:\Users\karla\PycharmProjects\gender_linguistics_on_ParlaMint


Set Parameters
===

In [2]:
save_name = "test_2023_01_30"
saving = False
from_saved = False
number_of_files = 2

Run
===

Load / Generate
---

In [3]:
if from_saved:
    df = load_saved_df(f"C:/Users/karla/Desktop/Zula_Data_all_in_one/{save_name}.csv")
else:
    df = generate_sentences_and_meta_df_from_multiple_files(number_of_files=number_of_files)
    if saving:
        df.to_csv(f"C:/Users/karla/Desktop/Zula_Data_all_in_one/{save_name}.csv")

Generating Dataframe:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
n_sentences = df.shape[0]
print(f"total # of sentences: {n_sentences}")
df, _ = drop_na_with_count(df) #, "missing data")

total # of sentences: 1015


Numerals
---

In [5]:
func_list = [(group_nums, 'sentence', 'sentence_grouped_nums'),
             (parse_num_groups, 'sentence_grouped_nums', 'sentence_parsed_num_groups'),
             (num_list, 'sentence_parsed_num_groups', 'NUMs')]
            # function, argument_column, result_column

drops_per_func = {}

for func, arg_col, res_col in func_list:
    func_name = func.__name__
    print(f"Applying {func_name}")
    df[res_col] = df[arg_col].swifter.apply((lambda c: try_apply(func, c)))
    df, drops_per_func[func_name] = drop_na_with_count(df)

print("drop_reason".rjust(16)+" | #sentences_dropped"+"\n"+("="*37))
for func, _, _ in func_list:
    func_name = func.__name__
    print(func_name.rjust(16)+f" | {drops_per_func[func_name]}")

Applying group_nums


Pandas Apply:   0%|          | 0/1015 [00:00<?, ?it/s]

Applying parse_num_groups


Pandas Apply:   0%|          | 0/1005 [00:00<?, ?it/s]

Applying num_list


Pandas Apply:   0%|          | 0/1002 [00:00<?, ?it/s]

     drop_reason | #sentences_dropped
      group_nums | 10
parse_num_groups | 3
        num_list | 0


In [6]:
df = df.explode('NUMs').reset_index(drop=True)
df['num_index'], placeholder = zip(*df['NUMs'])
(df['num_as_str'], df['num_value']) = zip(*placeholder)
n_numbers = df.shape[0]
print(f"#numbers: {n_numbers}")

#numbers: 1623


In [7]:
df['float-like'], df['proper_digits'], df['zeroes'] = zip(*df['num_as_str'].swifter.apply(lambda cell: try_apply(find_roundedness, cell)))

df, dropped_by_find_roundedness = drop_na_with_count(df)

print(f"#numbers_dropped_by_find_roundedness: {dropped_by_find_roundedness}")

Pandas Apply:   0%|          | 0/1623 [00:00<?, ?it/s]

#numbers_dropped_by_find_roundedness: 2


Statistics
---

In [8]:
df.columns

Index(['sent_id', 'sentence', 'utterance_id', 'Speaker_party', 'Party_status',
       'Speaker_name', 'mp', 'female', 'upper_house', 'chairperson',
       'sentence_grouped_nums', 'sentence_parsed_num_groups', 'NUMs',
       'num_index', 'num_as_str', 'num_value', 'float-like', 'proper_digits',
       'zeroes'],
      dtype='object')

In [9]:
binary_independent_variables = ['is_mp', 'is_female', 'is_upper_house', 'is_chairperson'] # TODO: Party_status! -> dropna??!!!!!!!!!!!!!!!!!!!!!!
dependent_variables = ['num_value', 'proper_digits', 'zeroes']

float_df = df[df['float-like'] == True]
int_df = df[df['float-like'] == False]

for df in (float_df, int_df):
    for biv in binary_independent_variables:
        for dv in dependent_variables:
            st.ttest_ind(float_df[float_df[biv] == True][])

IndentationError: expected an indented block (Temp/ipykernel_13364/480003445.py, line 4)

Concordance
---

In [None]:
df['num_ancestors'] = df.swifter.apply(concordance_ancestors_on_row, axis=1)

In [None]:
df['num_ancestor_set'] = df['num_ancestors'].swifter.apply(lambda cell: try_apply(ancestry_set, cell))

In [None]:
if saving:
    df.to_csv(f"C:/Users/karla/Desktop/Zula_Data_all_in_one/{save_name}+group+parsed+separated+exploded+roundedness+ancestors.csv")

In [None]:
df['num_descendants'] = df.swifter.apply(concordance_descendants_on_row, axis=1)

In [None]:
df['num_descendant_set'] = df['num_descendants'].swifter.apply(ancestry_set)

In [None]:
if saving:
    df.to_csv(f"C:/Users/karla/Desktop/Zula_Data_all_in_one/{save_name}+group+parsed+separated+exploded+roundedness+ancestors+descendants.csv")

In [None]:
value_count_num_descendant_set = df['num_descendant_set'].value_counts()
value_count_num_descendant_set

In [None]:
value_count_num_ancestor_set = df['num_ancestor_set'].value_counts()
value_count_num_ancestor_set

In [None]:
float_df = df[df['float-like']]

In [None]:
int_df = df[df['float-like']==False]

In [None]:
for column_name, ind_variable, value_1, value_2 in [('upper_house', 'house', 'lower_house', 'upper_house'), ('chairperson', 'speaker_role', 'regular', 'chairperson'), ('mp', 'speaker_type', 'notMP', 'MP'),  ('female', 'gender', 'male', 'female')]:
    # TODO: ('Party_status', 'party_status', '?', '??'),
    data = int_df.copy()
    data[ind_variable] = np.choose(data[column_name], [value_1, value_2])
    sns.scatterplot(x='num_value', y='zeroes', hue=ind_variable, data=data)
    plt.xscale('log')
    plt.show()

In [None]:
data = int_df.copy()
sns.scatterplot(x='num_value', y='zeroes', hue='Party_status', data=data)
plt.xscale('log')
plt.show()