# Imports

In [1]:
import numpy as np
%cd ..
# Needed so that modules in the project's main folder can be imported.
# Keep this comment on its own line: text placed after `%cd ..` on the same line would be read as part of the path.

import pandas as pd
from tqdm.notebook import tqdm
import swifter
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as st

from generate_dataframe import generate_sentences_and_meta_df_from_multiple_files
from load_saved_dataframe import load_saved_df
from numerals import *
from concordance import *
from helper_methods import try_apply, drop_na_with_count

C:\Users\karla\PycharmProjects\gender_linguistics_on_ParlaMint


# Set Parameters

In [2]:
# --- Data generation ---
number_of_files = 100   # passed to the dataframe generator as `number_of_files`
random_seed = 1341995   # passed to the dataframe generator as `random_seed`

# --- Persistence ---
# NOTE(review): the name says "200_files" while number_of_files is 100 — confirm which one is intended.
save_name = "200_files_2023-02-27"   # base name (without ".csv") of the saved dataframe
from_saved = False                   # True: load the saved CSV instead of generating the dataframe
saving = False                       # True: write CSV snapshots of the dataframe

# Run

## Load / Generate

In [None]:
# Either reload a previously saved dataframe or generate it (and optionally save it).
# Both branches use the same CSV location.
snapshot_path = f"C:/Users/karla/Desktop/Zula_Data_all_in_one/{save_name}.csv"

if not from_saved:
    df = generate_sentences_and_meta_df_from_multiple_files(number_of_files=number_of_files, random_seed=random_seed)
    if saving:
        df.to_csv(snapshot_path)
else:
    df = load_saved_df(snapshot_path)

Generating Dataframe:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Report the number of rows (one per sentence) before any cleaning.
n_sentences = len(df)
print(f"total # of sentences: {n_sentences}")

# drop_na_with_count returns the cleaned dataframe together with a drop count
# (presumably the number of rows that contained NaN — confirm in helper_methods).
df, dropped_due_to_original_nan = drop_na_with_count(df)
print(f"#numbers_dropped_due_to_original_nan: {dropped_due_to_original_nan}")

## Numerals

In [None]:
# Pipeline stages, each given as (function, argument_column, result_column).
func_arg_res = [
    (group_nums, 'sentence', 'sentence_grouped_nums'),
    (parse_num_groups, 'sentence_grouped_nums', 'sentence_parsed_num_groups'),
    (num_list, 'sentence_parsed_num_groups', 'NUMs'),
]

# stage name -> drop count reported by drop_na_with_count right after that stage
drops_per_func = {}

for stage_func, source_column, target_column in func_arg_res:
    stage_name = stage_func.__name__
    print(f"Applying {stage_name}")
    # Apply the stage cell by cell through try_apply, then clean up before the next stage.
    df[target_column] = df[source_column].swifter.apply(lambda cell: try_apply(stage_func, cell))
    df, drops_per_func[stage_name] = drop_na_with_count(df)

# Small text table: one line per stage with its drop count.
table_header = "drop_reason".rjust(16) + " | #sentences_dropped"
print(table_header + "\n" + ("=" * 37))
for stage_func, _, _ in func_arg_res:
    stage_name = stage_func.__name__
    print(f"{stage_name:>16} | {drops_per_func[stage_name]}")

In [None]:
# One row per number: explode the per-sentence list stored in 'NUMs'.
df = df.explode('NUMs')

# explode() turns an empty list into a single NaN row. Such a row carries no number
# and would make the tuple unpacking below fail, so it is removed first.
df = df.dropna(subset=['NUMs']).reset_index(drop=True)

# Each 'NUMs' entry is unpacked as (num_index, (num_as_str, num_value)).
df['num_index'], str_value_pairs = zip(*df['NUMs'])
(df['num_as_str'], df['num_value']) = zip(*str_value_pairs)

n_numbers = df.shape[0]
print(f"#numbers: {n_numbers}")

In [None]:
# find_roundedness is applied to every number string (through try_apply); each result
# is unpacked into four values that become four new columns.
roundedness_per_number = df['num_as_str'].swifter.apply(lambda cell: try_apply(find_roundedness, cell))
is_float_like, n_proper_digits, n_zeroes, n_decimals = zip(*roundedness_per_number)

df['is_float-like'] = is_float_like
df['n_proper_digits'] = n_proper_digits
df['n_zeroes'] = n_zeroes
df['n_decimals'] = n_decimals

df, dropped_by_find_roundedness = drop_na_with_count(df)

print(f"#numbers_dropped_by_find_roundedness: {dropped_by_find_roundedness}")

In [None]:
def find_relative_error(row):
    """Return the relative error of one number, based on its trailing zeroes.

    For a number that is not float-like the result is
    ``10**n_zeroes / num_value``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'is_float-like', 'n_zeroes' and 'num_value'.

    Returns
    -------
    float
        The relative error, or ``np.nan`` when the number is float-like
        (not implemented yet) or when ``num_value`` is 0 (the ratio is undefined).
    """
    if row['is_float-like']:
        # TODO: probably make num_list return the number of digits before and after the decimal point
        # as well. Then relative_error(float-like) = 10**(-n_decimal_places) / num_value
        # (I think; work through a couple of examples to prove this).
        return np.nan

    num_value = row['num_value']
    if num_value == 0:
        # Dividing by zero would raise (Python numbers) or yield inf (numpy scalars).
        return np.nan
    return (10**row['n_zeroes'])/num_value

# Row-wise: every row is handed to find_relative_error and the result stored in a new column.
df['relative_error'] = df.apply(find_relative_error, axis='columns')

In [None]:
# Display the dataframe as it stands after the 'relative_error' column has been added.
df

## Statistics

In [None]:
# Split the numbers into float-like and non-float-like subsets; the cells below
# analyse both subsets separately.
float_like_mask = df['is_float-like']
dfs = {
    'float_likes_only_df': df[float_like_mask],
    'int_likes_only_df': df[float_like_mask == False],
}

# boolean column -> (variable name, label used for False, label used for True)
binary_independent_variables = dict(
    is_upper_house=('house', 'lower_house', 'upper_house'),
    is_chairperson=('speaker_role', 'regular', 'chairperson'),
    is_mp=('speaker_type', 'notMP', 'MP'),
    is_female=('gender', 'male', 'female'),
)
# TODO: Party_status! -> dropna?

dependent_variables = ['num_value', 'n_proper_digits', 'n_zeroes', 'relative_error']
# Width used to left-align the dependent-variable names in printed results.
max_dv_name_len = max(map(len, dependent_variables))

### Binary Independent Variables

In [None]:
# For every binary independent variable, show how often each value occurs.
heavy_rule, light_rule = "=" * 20, "-" * 18
for biv in binary_independent_variables:
    print(heavy_rule, "\n", biv, "\n", light_rule)
    print(df[biv].value_counts())

In [None]:
# For each subset and each binary independent variable, test every dependent variable
# with a Mann-Whitney U test and a point-biserial correlation.
# NOTE(review): 'relative_error' is NaN for float-like numbers, so its results for
# 'float_likes_only_df' are presumably NaN as well — confirm this is acceptable.
for df_name, _df in dfs.items():
    print(f"{'='*len(df_name)}\n{df_name}\n{'='*len(df_name)}")

    for biv in binary_independent_variables.keys():
        print(f"{'-'*len(biv)}\n{biv}\n{'-'*len(biv)}")

        is_true = _df[biv] == True
        is_false = _df[biv] == False

        # mannwhitneyu can only be calculated for two samples. sometimes, only one sample exists
        # (e.g. if all speakers are MPs --> _df[_df['is_mp'] == False] is empty).
        # The check uses the masks only, so it does not depend on any dependent variable.
        if not (is_true.any() and is_false.any()):
            print("binary independent variable only takes on one value --> 'unary' independent variable")
            continue

        for dv in dependent_variables:
            mannwhitneyu_result = st.mannwhitneyu(_df.loc[is_true, dv], _df.loc[is_false, dv])
            print(f"{dv.ljust(max_dv_name_len)}: {mannwhitneyu_result}")

            pointbiserial_result = st.pointbiserialr(_df[biv], _df[dv])
            print(f"{dv.ljust(max_dv_name_len)}: {pointbiserial_result}")

            print("")

### Semi-Binary Independent Variable 'Party_status'

## Plots

### Histograms

In [None]:
# One histogram per dependent variable and subset, over all rows of that subset.
for df_name, _df in dfs.items():
    banner = '=' * len(df_name)
    print(f"{banner}\n{df_name}\n{banner}")

    for dv in dependent_variables:
        # 'num_value' is shown on a log axis with power-of-ten bin edges;
        # every other variable uses integer bin edges 0..14.
        use_log_axis = (dv == 'num_value')
        bins = [10**exponent for exponent in range(15)] if use_log_axis else range(15)

        plt.title("complete data")
        if use_log_axis:
            plt.xscale('log')
        plt.xlabel(dv)
        plt.hist(_df[dv], bins=bins)
        plt.show()

In [None]:
# One histogram per subset, binary independent variable, variable value and dependent variable.
for df_name, _df in dfs.items():
    banner = '=' * len(df_name)
    print(f"{banner}\n{df_name}\n{banner}")

    for biv_key, (biv_name, biv_false_value, biv_true_value) in binary_independent_variables.items():

        # Pair each boolean with the label that describes it.
        for biv_value_bool, biv_value in zip((False, True), (biv_false_value, biv_true_value)):
            rows_with_value = _df[_df[biv_key] == biv_value_bool]

            for dv in dependent_variables:
                # 'num_value' gets a log axis with power-of-five bin edges;
                # the other variables use integer bin edges 0..14.
                use_log_axis = (dv == 'num_value')
                bins = [5**exponent for exponent in range(30)] if use_log_axis else range(15)

                plt.title(f"{biv_name}: {biv_value}")
                if use_log_axis:
                    plt.xscale('log')
                plt.xlabel(dv)
                plt.hist(rows_with_value[dv], bins=bins)
                plt.show()

### Scatterplots

In [None]:
# Scatter num_value against n_zeroes for each subset, coloured by each binary independent variable.
# items() is required here: the loop needs both the subset name and the subset itself.
for df_name, _df in dfs.items():
    print(df_name)
    for biv_key, (biv_name, biv_false_value, biv_true_value) in binary_independent_variables.items():
        # TODO: ('Party_status', 'party_status', '?', '??'),
        data = _df.copy()
        # Translate the boolean column into readable labels that are used as the hue.
        data[biv_name] = np.where(data[biv_key].astype(bool), biv_true_value, biv_false_value)
        sns.scatterplot(x='num_value', y='n_zeroes', hue=biv_name, data=data)
        plt.xscale('log')
        plt.show()

## Concordance

In [None]:
# Row-wise (axis=1) application of concordance_ancestors_on_row via swifter;
# the per-row result is stored in the new 'num_ancestors' column.
df['num_ancestors'] = df.swifter.apply(concordance_ancestors_on_row, axis=1)

In [None]:
# Reduce each 'num_ancestors' cell with ancestry_set, routed through try_apply.
ancestors_per_number = df['num_ancestors']
df['num_ancestor_set'] = ancestors_per_number.swifter.apply(
    lambda ancestors: try_apply(ancestry_set, ancestors)
)

In [None]:
# Optional snapshot of the dataframe including the ancestor columns.
if saving:
    ancestors_snapshot_path = (
        f"C:/Users/karla/Desktop/Zula_Data_all_in_one/{save_name}"
        "+group+parsed+separated+exploded+roundedness+ancestors.csv"
    )
    df.to_csv(ancestors_snapshot_path)

In [None]:
# Row-wise (axis=1) application of concordance_descendants_on_row via swifter;
# the per-row result is stored in the new 'num_descendants' column.
df['num_descendants'] = df.swifter.apply(concordance_descendants_on_row, axis=1)

In [None]:
# Reduce each 'num_descendants' cell with ancestry_set. The call goes through try_apply,
# exactly like the 'num_ancestor_set' computation, so both columns treat failing cells the same way.
df['num_descendant_set'] = df['num_descendants'].swifter.apply(lambda cell: try_apply(ancestry_set, cell))

In [None]:
# Optional snapshot of the dataframe including ancestor and descendant columns.
if saving:
    descendants_snapshot_path = (
        f"C:/Users/karla/Desktop/Zula_Data_all_in_one/{save_name}"
        "+group+parsed+separated+exploded+roundedness+ancestors+descendants.csv"
    )
    df.to_csv(descendants_snapshot_path)

In [None]:
# Frequency of every distinct value in 'num_descendant_set'; shown as the cell output.
value_count_num_descendant_set = df.loc[:, 'num_descendant_set'].value_counts()
value_count_num_descendant_set

In [None]:
# Frequency of every distinct value in 'num_ancestor_set'; shown as the cell output.
value_count_num_ancestor_set = df.loc[:, 'num_ancestor_set'].value_counts()
value_count_num_ancestor_set

In [None]:
# Rows whose number is float-like. The column is named 'is_float-like'; there is no 'float-like' column.
float_df = df[df['is_float-like']]

In [None]:
# Rows whose number is not float-like. The column is named 'is_float-like'; there is no 'float-like' column.
int_df = df[df['is_float-like'] == False]

In [None]:
# Scatter num_value against n_zeroes for the non-float-like numbers, coloured by 'Party_status'.
data = int_df.copy()
scatter_settings = dict(x='num_value', y='n_zeroes', hue='Party_status')
sns.scatterplot(data=data, **scatter_settings)
plt.xscale('log')
plt.show()