In [24]:
import os
import copy
import json
# import re
import regex as re

import pandas as pd
import numpy as np

import PyPDF2
import gender_guesser.detector as gender

pd.set_option('display.max_columns', 70)

In [2]:
PWD = os.getcwd()
DATA_PATH = PWD + "/data/"

FY_DATA_PATH = DATA_PATH + "financial-statements/"
FY_BASE_NAME = "UBC-FY"

GENDER_DATA = DATA_PATH + "name-gender/data.csv"

## Manually Set page ranges



In [18]:
page_limits = {
    '2020':{'min':46, 'max':148},
    '2021':{'min':42, 'max':141},
    '2022':{'min':45, 'max':165},
    '2023':{'min':44, 'max':172},
    '2024':{'min':43, 'max':188},}

# page_limits = {
#     '2024':{'min':43, 'max':188}}

In [25]:

def null_str_to_nan(test_number):

    if test_number.isnumeric():
        return test_number
    else:
        return np.nan
    

def is_numeric(test_string):

    return test_string.lstrip('-').replace(',','').replace("(", "").replace(")", "").isnumeric()


def standardise_name(text):
    # "john heh h heh h,  h  doe her y" -> "JOHN-HEH-HEH,DOE-HER"
    # Remove single letters, but exclude cases where a single letter is followed by a comma or apostrophe
    text = re.sub(r'\b([a-zA-Z])(?![,\'])\b', '', text)

    # Replace all whitespace (and compress spaces) with a single '-'
    text = re.sub(r'\s+', '-', text.strip())
    # Ensure no hyphen comes immediately before or after a comma
    text = re.sub(r'-,', ',', text)  # Remove hyphen before a comma
    text = re.sub(r',-', ',', text)  # Remove hyphen after a comma

    text = re.sub(r'-+', '-', text)

    return text.upper()


def parse_one_line_information(one_line):
    # Regex pattern to match name, salary, and expense
    pattern = r'([A-Za-z\p{L}.]+[\-,.A-Za-z\p{L},\s]*)\s*([\d,()-]+)\s*([\d,()-]+)'

    # Find all matches
    matches = re.findall(pattern, one_line)

    # Process matches to clean up data
    parsed_data = {
        "name": [],
        "salary": [],
        "expense": []
    }

    for match in matches:
        name, salary, expense = match
        
        # Clean salary and expense
        salary = salary.replace(',', '').replace('(', '-').replace(')', '').strip()
        salary = float(salary) if salary.lstrip('-').isdigit() else np.nan
        
        expense = expense.replace(',', '').replace('(', '-').replace(')', '').strip()
        expense = float(expense) if expense.lstrip('-').isdigit() else np.nan
        
        # Append to parsed_data
        name = standardise_name(name)
        parsed_data["name"].append(name.strip().upper())
        parsed_data["salary"].append(salary)
        parsed_data["expense"].append(expense)

    parse_df = pd.DataFrame(parsed_data)

    return parse_df


def pdf_to_df(pdf_file, year, page_start, page_end):

    pdfFileObj = open(pdf_file, 'rb')
    
    # creating a pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    
    column_header_pattern = r"Name\s*Remuneration\s*Expenses"
    column_header_pattern2 = r"\s*Remuneration\s*Expenses"

    data = pd.DataFrame({'name':[], 'year':[], 'salary':[], 'expense':[]})

    for page in np.arange(page_start, page_end, 1):

        page = int(page)
        pageObj = pdfReader.getPage(page)
        pagetxt = pageObj.extractText()


        begin = 0
        finished_prof = 0
        prof = []
        old_line = ''
        use_old_line = False

        for index, line in enumerate(pagetxt.split('\n')):

            if begin == 1:

                if use_old_line:
                    line = old_line + line
                    use_old_line = False
                    # print(line)


                contains_number = bool(re.search(r'\d', line))
                if not contains_number and 'Remuneration' not in line and line.strip():
                    old_line = line
                    use_old_line = True
                    continue



                if year in [2021, 2022, 2023]:
                    if finished_prof == 1:
                        name = ''
                        pay = null_str_to_nan(prof[-2].replace(',','').replace("(", "").replace(")", ""))
                        exp = null_str_to_nan(prof[-1].replace(',','').replace("(", "").replace(")", ""))
                        for n in prof[0:-2]:
                            name = name + n.upper() + ' '
                            
                        name = standardise_name(name)
                        new_row = pd.DataFrame({'name':[name], 'year':[int(year)], 'salary':[pay], 'expense':[exp]})
                        data = pd.concat([data, new_row], ignore_index=True)
                        
                        prof = []
                        finished_prof = 0
                        
                    if len(line.split()) != 0:                       

                        cleaned_line = re.sub(column_header_pattern, "", line).replace("*", "").strip()

                        if finished_prof == 0 and (cleaned_line.split()[-1] == '-' or is_numeric(cleaned_line.split()[-1])) and "$" not in cleaned_line: 
                            for k in cleaned_line.split():
                                prof.append(k)
                            finished_prof = 1
                            
            
                        else:
                            for k in line.split():
                                prof.append(k)               
                            finished_prof = 0
                
                elif year in [2020, 2024]:
                    
                    if year == 2024:
                        cleaned_line = re.sub(column_header_pattern, "", line).replace("*", "").replace("(", "").replace(")", "").strip()
                    elif year == 2020:
                        cleaned_line = re.sub(column_header_pattern2, "", line).replace("*", "").replace("(", "").replace(")", "").strip()

                    new_row = parse_one_line_information(cleaned_line)
                    new_row['year'] = int(year)

                    data = pd.concat([data, new_row], ignore_index=True)

                old_line = line

            
            if re.search(column_header_pattern, line, re.DOTALL) or re.search(column_header_pattern2, line, re.DOTALL):

                begin = 1
        



    # closing the pdf file object
    pdfFileObj.close()


    return data


In [26]:
df = pd.DataFrame({'name':[], 'year':[], 'salary':[], 'expense':[]})

for i, (year, value) in enumerate(page_limits.items()):

    if year  != '2020':
        print(f'Parsing Year {year}')

        pdf_file = FY_DATA_PATH + FY_BASE_NAME + year + '.pdf'
        page_start = value['min']
        page_end = value['max']

        new_df = pdf_to_df(pdf_file, int(year), page_start, page_end)

        df['year'] = df['year'].astype(int)
        df = pd.concat([df, new_df], ignore_index=True)


df['year'] = df['year'].astype(int)

Parsing Year 2021
Parsing Year 2022
Parsing Year 2023
Parsing Year 2024


### Sorting by length of name
An interesting way to look at longest and shortest names.
But also a check to make sure paarsing is not cutting off names or adding different names together. 

In [27]:
df_sorted = df.sort_values(by='name', key=lambda col: col.str.len(), ascending=True)
df_sorted

Unnamed: 0,name,year,salary,expense
17940,"LI,RI",2023,151182,54163
21067,"VU,LY",2023,169466,13338
1904,"FU,HU",2021,123374,3846
6806,"YUE,.",2021,76515,484
26260,"LI,YI",2024,106805.0,
...,...,...,...,...
21265,"WIJENDRA-ACHARIGE,LASANTHA-PREMARATHNA",2023,114048,
19441,"POTYGUARA-COUTINHO-MARQUES,PAULO-EDUARDO",2023,86307,
4901,"POTYGUARA-COUTINHO-MARQUES,PAULO-EDUARDO",2021,95421,
28066,"POTYGUARA-COUTINHO-MARQUES,PAULO-EDUARDO",2024,115035.0,


### Group people with similar names

There is an inconsistency between years in how names are recorded. We can be somewhat confident they are the same person as the salary year after year is the same. 
Sometimes we find that a person has added their middle name or removed it. 

The first way to 'fix' this is to look for all names which are only recorded for a single year.
Then check that name with all other names to see if there is a match, i.e. does all the names exist in another name or atleast 1 minus the total number of names exists (I assume only 1 single name is added/removed between years).

In [28]:
unique_names = df.groupby('name')['year'].nunique()
list_of_single_names = unique_names[unique_names == 1].index.tolist()


for single_name in list_of_single_names:

    single_components = set(single_name.replace('-', ',').replace("'", ',').split(','))
    single_components1 = set(single_name.split(',')[0].replace('-', ',').replace("'", ',').split(','))
    single_components2 = set(single_name.split(',')[-1].replace('-', ',').replace("'", ',').split(','))


    for unique_name in df['name'].unique().tolist():

        if single_name != unique_name:

            unique_components = set(unique_name.replace('-', ',').replace("'", ',').split(','))
            unique_components1 = set(unique_name.split(',')[0].replace('-', ',').replace("'", ',').split(','))
            unique_components2 = set(unique_name.split(',')[-1].replace('-', ',').replace("'", ',').split(','))

            # Check for a match (length - 1 components must overlap)
            

            if (
                ((len(single_components1) > 0) and (len(single_components1 & unique_components1) >= len(unique_components) - 1) and (len(single_components1 & unique_components1) > 0)) 
                and 
                ((len(single_components2) > 0) and (len(single_components2 & unique_components2) >= len(unique_components) - 2) and (len(single_components2 & unique_components2) > 0)) ):

                df.loc[df['name'] == single_name, 'name'] = unique_name
                break  # Stop checking once a match is found




### Checking for people with the same name

In [29]:
# Count occurrences of each name within each year
name_counts = df.groupby(["year", "name"]).size().reset_index(name="count")

# Merge counts back to the original DataFrame
df = df.merge(name_counts, on=["year", "name"], how="left")

# Assign numbers only to names that appear more than once
df["name_id"] = df.groupby(["year", "name"]).cumcount() + 1  # Start numbering from 1
df["name_id"] = df.apply(
    lambda x: f"{x['name']} {x['name_id']}" if x["count"] > 1 else x["name"],
    axis=1
)

# Drop the auxiliary count column (optional)
df.drop(columns=["count"], inplace=True)

rows_with_numbers = df[df["name_id"].str.contains(r"\d", na=False)]


### Work out if male or female

In [30]:
unique_names = df["name"].unique()


def most_common_gender(gender_list):
    # Initialize counts for 'male' and 'female'
    male_count = 0
    female_count = 0

    # Count occurrences of 'male' and 'female', ignoring 'unknown'
    for gender in gender_list:
        if gender.lower() == 'male':
            male_count += 1
        elif gender.lower() == 'female':
            female_count += 1

    # Determine which gender has the most occurrences
    if male_count > female_count:
        return 'Male'
    elif female_count > male_count:
        return 'Female'
    else:
        return 'Unknown'  # Return None if both are equal or the list has no valid entries


def guess_gender(name, names_data):
    name = name.title()
    filtered_df = names_data[names_data['Name'].str.contains(name, na=False)]

    if not filtered_df.empty:
        gender = filtered_df['Gender'].value_counts().idxmax()
        if gender == 'M':
            return 'male'
        else:
            return 'female'
    return "unknown"




# Pass the entire full name to the detector
d = gender.Detector()

# Load a CSV of names and genders
names_data = pd.read_csv(GENDER_DATA)  # Columns: Name, Gender (M/F), Count, Probability

# Function to process and estimate gender
def estimate_gender(full_name):
    # Normalize the name (capitalize each word for consistent processing)
    full_name = full_name.title()  # "CARROLL, MICHAEL" -> "Carroll, Michael"
    split_name = full_name.split(',')
    first_name = split_name[-1].strip().title()

    temp_gender1 = []
    temp_gender2 = []

    for split_first_name in first_name.split(' '):

        split_first_name = split_first_name.strip().title()
        temp_gender1.append(d.get_gender(split_first_name))
        temp_gender2.append(guess_gender(split_first_name, names_data))


    guessed_gender = most_common_gender(temp_gender1 + temp_gender2)

    # if guessed_gender == 'unknown':
        # print(full_name)
        # full_name = full_name.split(',')[-2].strip().title()
        # guessed_gender = d.get_gender(full_name)

    return guessed_gender



# Apply the function to the list of names
results = [(name, estimate_gender(name)) for name in unique_names]

name_gender_dict = dict(results)


  filtered_df = names_data[names_data['Name'].str.contains(name, na=False)]
  filtered_df = names_data[names_data['Name'].str.contains(name, na=False)]


In [31]:
# Count occurrences of each gender category
gender_counts = {
    "Male": 0,
    "Female": 0,
    "Unknown": 0
}

for _, gender_detect in results:
    if gender_detect in gender_counts:
        gender_counts[gender_detect] += 1

# Print the counts
print("Counts by Gender:")
for gender_detect, count in gender_counts.items():
    print(f"{gender_detect.capitalize()}: {count}")

Counts by Gender:
Male: 3705
Female: 5179
Unknown: 1953


### Adding sex to the dataframe

In [32]:
df['gender'] = df['name'].map(name_gender_dict)

### Ensuring salary and expenses are numeric

In [33]:
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')
df['expense'] = pd.to_numeric(df['expense'], errors='coerce')

### Save to .csv

In [34]:
csv_file = DATA_PATH + 'output/salary.csv'

df.to_csv(csv_file, index=False) 

---
---

## Creating new csv with calculations

In [35]:
def get_yearly_changes(df, name_id, years):

    number_of_years = len(years)

    year_range, salary_amount_change, salary_percent_change, yearly_check = [], [], [], []
    year_range_str = ''
    salary_amount_change_num, salary_percent_change_num = 0, 0

    if number_of_years <= 1:
        return np.nan, np.nan, np.nan, np.nan
    

    for index, year in enumerate(years):

        filtered_row = df[(df['year'] == year) & (df['name_id'] == name_id)].iloc[0]

        if index + 1 < number_of_years:

            if years[index+1] - year == 1:
                year_range_str = f"{year}-{years[index+1]}"
                
                filtered_row_new = df[(df['year'] == years[index+1]) & (df['name_id'] == name_id)].iloc[0]
                salary_amount_change_num = filtered_row_new['salary'] - filtered_row['salary']
                salary_percent_change_num = (filtered_row_new['salary'] - filtered_row['salary'])/filtered_row['salary']

                year_range.append(year_range_str)
                salary_amount_change.append(salary_amount_change_num)
                salary_percent_change.append(salary_percent_change_num)
                yearly_check.append(True)

                if number_of_years <= 2:
                    break
        else:
            year_range_str = f"{years[0]}-{year}"

            filtered_row_new = df[(df['year'] == years[0]) & (df['name_id'] == name_id)].iloc[0]
            salary_amount_change_num = filtered_row['salary'] - filtered_row_new['salary']
            salary_percent_change_num = (filtered_row['salary'] - filtered_row_new['salary'])/filtered_row_new['salary']

            year_range.append(year_range_str)
            salary_amount_change.append(salary_amount_change_num)
            salary_percent_change.append(salary_percent_change_num)
            yearly_check.append(False)
        

    return year_range, salary_amount_change, salary_percent_change, yearly_check


In [36]:
years_by_id = df.groupby('name_id')['year'].apply(list).to_dict()

calc_df = pd.DataFrame({'name_id':[], 'year_range':[], 'salary_amount_change':[], 'salary_percent_change':[], 'yearly_check':[]})


# for unique_name in unique_names:
for index, (name_id, years) in enumerate(years_by_id.items()):

    if len(years) > 1:
        
        year_range, salary_amount_change, salary_percent_change, yearly_check = get_yearly_changes(df, name_id, years)

        new_row = pd.DataFrame({'name_id':[name_id]*len(year_range), 
                                'year_range':year_range, 
                                'salary_amount_change':salary_amount_change, 
                                'salary_percent_change':salary_percent_change,
                                'yearly_check':yearly_check})
        
        calc_df = pd.concat([calc_df, new_row], ignore_index=True)



### Save to .csv

In [37]:
csv_calcfile = DATA_PATH + 'output/id-calc.csv'


calc_df.to_csv(csv_calcfile, index=False) 

### Look at largest salary increases between years

In [38]:
# calc_df.nlargest(20, 'salary_amount_change')