Gender Assessment data cleaning 

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Read the raw gender-assessment CSV into a DataFrame
raw_csv_path = "../data/raw/gender-assessment/gender_assessment.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Inspect the data: column names, dtypes and non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()
print(f"Initial number of rows: {len(df)}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 79 columns):
 #   Column                                                                                                                                                                                                                                                                                                                                                               Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                                                                                                                                                               --------------  -----  
 0   WBA ID                                                                                                              

In [4]:
# Keep only indicator scores and remove element scores.
# A column is treated as an element-score column when every non-null value in
# it is one of the ratings 'Met', 'Unmet' or 'Partially Met'.
element_ratings = ['Met', 'Unmet', 'Partially Met']

columns_to_drop = []
for col in df.columns:
    unique_vals = df[col].dropna().unique()
    # Require at least one non-null value: all() over an empty sequence is
    # True, so a column with no data at all would otherwise be misclassified
    # as an element-score column and silently dropped.
    if len(unique_vals) > 0 and all(val in element_ratings for val in unique_vals):
        columns_to_drop.append(col)

# WBA ID and ISIN are not required in the analysis, so remove them as well
columns_to_drop.extend(['WBA ID', 'ISIN'])

df = df.drop(columns=columns_to_drop)

In [5]:
# Discard every row that lacks a value in any of the critical columns.
# NOTE(review): "Company Name " is written with a trailing space, presumably to
# match the raw header (column names are only stripped in a later cell) - confirm.
critical_columns = ["Company Name ", "HQ Country", "Overall Gender Assessment Score"]
df = df.dropna(subset=critical_columns)

In [6]:
# Normalize column names in three steps: trim surrounding whitespace,
# lowercase, then turn each space into an underscore.
# Only literal space characters are replaced; any other whitespace inside a
# name (for example an embedded newline) is left unchanged.
trimmed_names = df.columns.str.strip()
lowercase_names = trimmed_names.str.lower()
df.columns = lowercase_names.str.replace(' ', '_')

In [7]:
# Map the long, normalized column names to short analysis-friendly ones.

# General company / assessment columns
general_renames = {
    "company_name": "company",
    "hq_country": "country",
    "hq_region": "region",
    "wba_industry": "industry",
    "year_assessed": "year",
    "overall_gender_assessment_score": "score",
    # This header contains an embedded newline before the parenthesis
    "percentage_of_total_possible_score_\n(out_of_52.3)": "percent_score",
}

# Indicator columns, keyed by their code prefix (a01. ... e02.)
indicator_renames = {
    # a-prefixed indicators
    "a01._strategic_action": "strategic_action",
    "a02._gender_targets": "gender_targets",
    "a04._gender-responsive_human_rights_due_diligence_process": "gender_due_diligence",
    "a05._grievance_mechanisms": "grievance_mechanisms",
    "a06._stakeholder_engagement": "stakeholder_engagement",
    "a07._corrective_action_process": "corrective_action",
    # b-prefixed indicators
    "b01._gender_equality_in_leadership": "gender_leadership",
    "b02._professional_development_and_recruitment": "development_recruitment",
    "b03._sex-disaggregated_employee_data": "employee_data_by_sex",
    "b04._gender_equality_leadership_in_the_supply_chain": "supply_chain_gender_leadership",
    "b06._enabling_environment_for_freedom_of_association_and_collective_bargaining": "enabling_environment_union_rights",
    "b07._gender-responsive_procurement": "gender_procurement",
    # c-prefixed indicators
    "c01._gender_pay_gap": "gender_pay_gap",
    "c02._paid_primary_and_secondary_carer_leave": "carer_leave_paid",
    "c03._childcare_and_other_family_support": "childcare_support",
    "c04._flexible_work": "flex_work",
    "c06._living_wage_in_the_supply_chain": "living_wage_supply_chain",
    # d-prefixed indicators
    "d01._health,_safety_and_well-being_in_the_workplace": "health_safety",
    "d02._safe_and_healthy_work_in_the_supply_chain": "health_safety_supply_chain",
    # e-prefixed indicators
    "e01._violence_and_harassment_prevention": "violence_prevention",
    "e02._violence_and_harassment_remediation": "violence_remediation",
}

# Combined mapping; general columns first, then indicators, in the order above
rename_map = {**general_renames, **indicator_renames}

# Apply renaming (names in the map that are not present in df are ignored by rename)
df = df.rename(columns=rename_map)

In [8]:
# Ensure 'score' and 'percent_score' are numeric.
# errors='coerce' turns unparseable entries into NaN, so count the values lost
# in each column and report them instead of letting them vanish silently.
for col in ['score', 'percent_score']:
    converted = pd.to_numeric(df[col], errors='coerce')
    n_coerced = int((converted.isna() & df[col].notna()).sum())
    if n_coerced:
        print(f"Warning: {n_coerced} non-numeric value(s) in '{col}' were set to NaN")
    df[col] = converted

# The overall score is a required field (rows missing it are dropped before the
# columns are renamed), so rows whose score could not be parsed are removed too.
df = df.dropna(subset=['score'])

In [9]:
# Keep the first occurrence of each fully identical row and discard the repeats
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

In [10]:
# Save cleaned file.
# The output folder is created first because to_csv does not create missing
# parent directories and would otherwise fail when ../data/clean is absent.
clean_csv_path = Path("../data/clean/gender_assessment_clean.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_csv_path, index=False)

In [11]:
# Re-read the saved CSV, report how many rows it holds, and preview the first rows
clean_csv = "../data/clean/gender_assessment_clean.csv"
clean_data = pd.read_csv(clean_csv)
print(f"Final cleaned dataset rows: {len(clean_data)}")  # Final row count
clean_data.head()

Final cleaned dataset rows: 2000


Unnamed: 0,company,country,region,industry,ownership,year,score,percent_score,strategic_action,gender_targets,...,gender_procurement,gender_pay_gap,carer_leave_paid,childcare_support,flex_work,living_wage_supply_chain,health_safety,health_safety_supply_chain,violence_prevention,violence_remediation
0,3M,United States,North America,Chemicals,Public,2023,11.3,22,1,0,...,1,0,0.0,0,2,0,1.0,2,1.0,0.0
1,Asos,United Kingdom,Europe & Central Asia,Apparel & Footwear,Public,2023,16.9,32,1,0,...,0,0,0.0,1,1,2,0.5,2,0.5,0.0
2,A.P. Moller - Maersk,Denmark,Europe & Central Asia,Freight & logistics,Public,2024,10.9,21,1,1,...,0,0,0.0,0,0,0,1.0,2,1.0,0.0
3,ABB,Switzerland,Europe & Central Asia,Capital Goods,Public,2023,12.8,25,1,1,...,0,0,1.0,0,0,0,1.0,2,1.0,0.0
4,AbbVie,United States,North America,Pharmaceuticals & Biotechnology,Public,2023,15.4,30,1,0,...,1,0,0.0,2,1,0,1.0,2,1.0,0.0
