# LGBTIQ-rights Dataset Cleaning Steps

## LGBTIQ+-Specific Datasets

**First, we will examine how many unique years and countries are included across the different datasets.**

## Python Code to Analyze Each Dataset

In [12]:
import pandas as pd

# Function to summarize unique years and countries
def summarize_years_and_countries(df, dataset_name):
    return pd.DataFrame({
        "Dataset": [dataset_name],
        "Unique Years": [df["Year"].nunique()],
        "Unique Countries": [df["Entity"].nunique()]
    })


# First Step:  Add the folder containing the new dataset

# Censorship of LGBTIQ Issues
df_censorship = pd.read_csv("../LGBTIQ-rights/censorship-of-lgbtiq-issues/censorship-of-lgbtiq-issues.csv")
censorship_summary = summarize_years_and_countries(df_censorship, "Censorship of LGBTIQ Issues")

# Employment Discrimination
df_employment_discrimination = pd.read_csv("../LGBTIQ-rights/employment-discrimination/employment-discrimination-lgbt-equaldex.csv")
employment_discrimination_summary = summarize_years_and_countries(df_employment_discrimination, "Employment Discrimination")

# Economic Inequality (Gini Index)
df_gini = pd.read_csv("../LGBTIQ-rights/economic-inequality-gini-index/economic-inequality-gini-index.csv")
gini_summary = summarize_years_and_countries(df_gini, "Economic Inequality (Gini Index)")

# GDP per Capita
df_gdp = pd.read_csv("../LGBTIQ-rights/gdp-per-capita/gdp-per-capita-worldbank.csv")
gdp_summary = summarize_years_and_countries(df_gdp, "GDP per Capita")

# Government Expenditure on Education
df_education = pd.read_csv("../LGBTIQ-rights/government-expenditure-on-education/total-government-expenditure-on-education-gdp.csv")
education_summary = summarize_years_and_countries(df_education, "Government Education Expenditure")

# Gender-Affirming Care
df_gender_care = pd.read_csv("../LGBTIQ-rights/gender-affirming-care/gender-affirming-care.csv")
gender_care_summary = summarize_years_and_countries(df_gender_care, "Gender-Affirming Care")

# Same-Sex Marriage Rights
df_marriage = pd.read_csv("../LGBTIQ-rights/marriage-same-sex-partners/marriage-same-sex-partners-equaldex.csv")
marriage_summary = summarize_years_and_countries(df_marriage, "Same-Sex Marriage")

# Legal Gender Change Rights
df_legal_gender = pd.read_csv("../LGBTIQ-rights/right-to-change-legal-gender/right-to-change-legal-gender-equaldex.csv")
legal_gender_summary = summarize_years_and_countries(df_legal_gender, "Legal Gender Change")

def summarize_countries_per_year(df, dataset_name):
    return (
        df.groupby("Year")["Entity"]
        .nunique()
        .reset_index(name="num_countries")
        .assign(Dataset=dataset_name)
    )


# Second Step: Add your dataset to this list

censorship_summary = summarize_countries_per_year(df_censorship, "Censorship of LGBTIQ Issues")
employment_summary = summarize_countries_per_year(df_employment_discrimination, "Employment Discrimination")
gini_summary = summarize_countries_per_year(df_gini, "Economic Inequality (Gini Index)")
gdp_summary = summarize_countries_per_year(df_gdp, "GDP per Capita")
education_summary = summarize_countries_per_year(df_education, "Government Education Expenditure")
gender_care_summary = summarize_countries_per_year(df_gender_care, "Gender-Affirming Care")
marriage_summary = summarize_countries_per_year(df_marriage, "Same-Sex Marriage")
legal_gender_summary = summarize_countries_per_year(df_legal_gender, "Legal Gender Change")

# Combine
summary_all = pd.concat([
    censorship_summary,
    employment_summary,
    gini_summary,
    gdp_summary,
    education_summary,
    gender_care_summary,
    marriage_summary,
    legal_gender_summary
], ignore_index=True)

# Pivot
summary_pivot = summary_all.pivot_table(
    index="Dataset",
    columns="Year",
    values="num_countries",
    fill_value=0
).reset_index()

summary_pivot


Year,Dataset,1870,1913,1937,1950,1951,1952,1953,1954,1955,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Censorship of LGBTIQ Issues,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,...,34.0,37.0,37.0,40.0,41.0,43.0,46.0,48.0,50.0,194.0
1,Economic Inequality (Gini Index),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,87.0,83.0,96.0,82.0,70.0,75.0,28.0,4.0,0.0,0.0
2,Employment Discrimination,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,63.0,66.0,73.0,78.0,81.0,84.0,86.0,87.0,88.0,183.0
3,GDP per Capita,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,213.0,213.0,213.0,213.0,213.0,213.0,212.0,206.0,0.0,0.0
4,Gender-Affirming Care,0.0,0.0,0.0,9.0,9.0,9.0,9.0,9.0,9.0,...,66.0,67.0,70.0,71.0,75.0,76.0,79.0,81.0,83.0,164.0
5,Government Education Expenditure,5.0,8.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,...,184.0,187.0,182.0,184.0,181.0,180.0,149.0,66.0,0.0,0.0
6,Legal Gender Change,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,87.0,91.0,95.0,98.0,98.0,98.0,102.0,103.0,103.0,194.0
7,Same-Sex Marriage,0.0,0.0,0.0,12.0,13.0,13.0,14.0,14.0,14.0,...,126.0,128.0,131.0,134.0,135.0,135.0,135.0,138.0,138.0,194.0


In [122]:
print("merged_wide_df columns:")
print(merged_wide_df.columns.tolist())

print("\ndf_censorship columns:")
print(df_censorship.columns.tolist())

print("\ndf_employment_discrimination columns:")
print(df_employment_discrimination.columns.tolist())

print("\ndf_gini columns:")
print(df_gini.columns.tolist())

print("\ndf_gdp columns:")
print(df_gdp.columns.tolist())

print("\ndf_education columns:")
print(df_education.columns.tolist())

print("\ndf_gender_care columns:")
print(df_gender_care.columns.tolist())

print("\ndf_marriage columns:")
print(df_marriage.columns.tolist())

print("\ndf_legal_gender columns:")
print(df_legal_gender.columns.tolist())


merged_wide_df columns:
['Entity', 'Year']

df_censorship columns:
['Entity', 'Code', 'Year', 'Censorship of LGBT+ issues (historical)']

df_employment_discrimination columns:
['Entity', 'Code', 'Year', 'LGBT+ employment discrimination (historical)']

df_gini columns:
['Entity', 'Code', 'Year', 'Gini coefficient', '990179-annotations']

df_gdp columns:
['Entity', 'Code', 'Year', 'GDP per capita, PPP (constant 2021 international $)']

df_education columns:
['Entity', 'Code', 'Year', 'Public spending on education as a share of GDP']

df_gender_care columns:
['Entity', 'Code', 'Year', 'Gender-affirming care (historical)']

df_marriage columns:
['Entity', 'Code', 'Year', 'Same-sex marriage (historical)']

df_legal_gender columns:
['Entity', 'Code', 'Year', 'Right to change legal gender (historical)']


In [149]:
columns_to_check = ["Year", "Entity", "Code"]

dataframes = {
    "merged_wide_df": merged_wide_df,
    "df_censorship": df_censorship,
    "df_employment_discrimination": df_employment_discrimination,
    "df_gini": df_gini,
    "df_gdp": df_gdp,
    "df_education": df_education,
    "df_gender_care": df_gender_care,
    "df_marriage": df_marriage,
    "df_legal_gender": df_legal_gender,
}

for name, df in dataframes.items():
    print(f"\n{name} - Nulls in Key Columns:")
    for col in columns_to_check:
        if col in df.columns:
            null_count = df[col].isnull().sum()
            print(f"  {col}: {null_count} null(s)")
        else:
            print(f"  {col}: Column not found")



merged_wide_df - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 772 null(s)

df_censorship - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_employment_discrimination - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_gini - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 133 null(s)

df_gdp - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 458 null(s)

df_education - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 428 null(s)

df_gender_care - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_marriage - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_legal_gender - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)


In [None]:
## Removing Code from the dataset


In [126]:
# Third step: Add your dataset here
# Combine all datasets into one long DataFrame with just Entity and Year
all_data = pd.concat([
    df_censorship[["Entity", "Code", "Year"]],
    df_employment_discrimination[["Entity","Code", "Year"]],
    df_gini[["Entity","Code", "Year"]],
    df_gdp[["Entity","Code", "Year"]],
    df_education[["Entity","Code", "Year"]],
    df_gender_care[["Entity","Code", "Year"]],
    df_marriage[["Entity","Code", "Year"]],
    df_legal_gender[["Entity","Code", "Year"]],
], ignore_index=True)


base_table = all_data.drop_duplicates().reset_index(drop=True)
base_table = base_table.sort_values(by=["Year", "Code", "Entity"]).reset_index(drop=True)
base_table = base_table.sort_values(by=["Entity", "Code",  "Year"]).reset_index(drop=True)

base_table 

Unnamed: 0,Entity,Code,Year
0,Afghanistan,AFG,1971
1,Afghanistan,AFG,1972
2,Afghanistan,AFG,1973
3,Afghanistan,AFG,1974
4,Afghanistan,AFG,1975
...,...,...,...
10977,Zimbabwe,ZWE,2021
10978,Zimbabwe,ZWE,2022
10979,Zimbabwe,ZWE,2023
10980,Zimbabwe,ZWE,2024


In [138]:
merged_wide_df = base_table.copy()

# Define datasets
datasets = {
    "censorship": df_censorship,
    "employment": df_employment_discrimination,
    "gini": df_gini,
    "gdp": df_gdp,
    "education": df_education,
    "gendercare": df_gender_care,
    "marriage": df_marriage,
    "legalgender": df_legal_gender,
}

# Join each dataset one by one (side by side)
for name, df in datasets.items():
    # Keep only value columns (not Entity and Year)
    value_cols = [col for col in df.columns if col not in ["Entity", "Year", "Code"]]

    # Rename value columns to add dataset prefix (to avoid conflicts)
    df_renamed = df.rename(columns={col: f"{col}_{name}" for col in value_cols})

    # Perform left join
    merged_wide_df = merged_wide_df.merge(
        df_renamed,
        on=["Entity", "Year", "Code"],
        how="left"
    )

# Final shape check
print("Final shape:", merged_wide_df.shape)

Final shape: (10982, 12)


In [139]:
merged_wide_df.head()

Unnamed: 0,Entity,Code,Year,Censorship of LGBT+ issues (historical)_censorship,LGBT+ employment discrimination (historical)_employment,Gini coefficient_gini,990179-annotations_gini,"GDP per capita, PPP (constant 2021 international $)_gdp",Public spending on education as a share of GDP_education,Gender-affirming care (historical)_gendercare,Same-sex marriage (historical)_marriage,Right to change legal gender (historical)_legalgender
0,Afghanistan,AFG,1971,,,,,,,,Banned,
1,Afghanistan,AFG,1972,,,,,,,,Banned,
2,Afghanistan,AFG,1973,,,,,,,,Banned,
3,Afghanistan,AFG,1974,,,,,,,,Banned,
4,Afghanistan,AFG,1975,,,,,,,,Banned,


In [140]:
print("Long format shape:", merged_wide_df.shape)


Long format shape: (10982, 12)


In [141]:
list(merged_wide_df.columns)

['Entity',
 'Code',
 'Year',
 'Censorship of LGBT+ issues (historical)_censorship',
 'LGBT+ employment discrimination (historical)_employment',
 'Gini coefficient_gini',
 '990179-annotations_gini',
 'GDP per capita, PPP (constant 2021 international $)_gdp',
 'Public spending on education as a share of GDP_education',
 'Gender-affirming care (historical)_gendercare',
 'Same-sex marriage (historical)_marriage',
 'Right to change legal gender (historical)_legalgender']

In [142]:
exclude_cols = ["Entity", "Year"]
value_cols = merged_wide_df.columns.difference(exclude_cols)
cleaned_long_df = merged_wide_df.dropna(subset=value_cols, how='all').reset_index(drop=True)
print("Shape after cleaning:", cleaned_long_df.shape)
cleaned_long_df


Shape after cleaning: (10982, 12)


Unnamed: 0,Entity,Code,Year,Censorship of LGBT+ issues (historical)_censorship,LGBT+ employment discrimination (historical)_employment,Gini coefficient_gini,990179-annotations_gini,"GDP per capita, PPP (constant 2021 international $)_gdp",Public spending on education as a share of GDP_education,Gender-affirming care (historical)_gendercare,Same-sex marriage (historical)_marriage,Right to change legal gender (historical)_legalgender
0,Afghanistan,AFG,1971,,,,,,,,Banned,
1,Afghanistan,AFG,1972,,,,,,,,Banned,
2,Afghanistan,AFG,1973,,,,,,,,Banned,
3,Afghanistan,AFG,1974,,,,,,,,Banned,
4,Afghanistan,AFG,1975,,,,,,,,Banned,
...,...,...,...,...,...,...,...,...,...,...,...,...
10977,Zimbabwe,ZWE,2021,Imprisonment as punishment,,,,3184.7854,,,Banned,
10978,Zimbabwe,ZWE,2022,Imprisonment as punishment,,,,3323.1218,,,Banned,
10979,Zimbabwe,ZWE,2023,Imprisonment as punishment,,,,3442.2512,,,Banned,
10980,Zimbabwe,ZWE,2024,Imprisonment as punishment,,,,,,,Banned,


In [143]:
cleaned_long_df.tail()

Unnamed: 0,Entity,Code,Year,Censorship of LGBT+ issues (historical)_censorship,LGBT+ employment discrimination (historical)_employment,Gini coefficient_gini,990179-annotations_gini,"GDP per capita, PPP (constant 2021 international $)_gdp",Public spending on education as a share of GDP_education,Gender-affirming care (historical)_gendercare,Same-sex marriage (historical)_marriage,Right to change legal gender (historical)_legalgender
10977,Zimbabwe,ZWE,2021,Imprisonment as punishment,,,,3184.7854,,,Banned,
10978,Zimbabwe,ZWE,2022,Imprisonment as punishment,,,,3323.1218,,,Banned,
10979,Zimbabwe,ZWE,2023,Imprisonment as punishment,,,,3442.2512,,,Banned,
10980,Zimbabwe,ZWE,2024,Imprisonment as punishment,,,,,,,Banned,
10981,Zimbabwe,ZWE,2025,Imprisonment as punishment,No protections,,,,,Restricted,Banned,Illegal


In [144]:
cleaned_long_df = cleaned_long_df.rename(columns={
    "Entity": "country",
    "Year": "year",
    "Code": "country-code",
    "Censorship of LGBT+ issues (historical)": "lgbtq-censorship",
    "LGBT+ employment discrimination (historical)": "employment-discrimination",
    "Gini coefficient": "gini-index",
    "990179-annotations": "inequality-annotations",
    "GDP per capita, PPP (constant 2021 international $)": "gdp-per-capita",
    "Public spending on education as a share of GDP": "education-spending-gdp",
    "Gender-affirming care (historical)": "gender-affirming-care",
    "Same-sex marriage (historical)": "same-sex-marriage",
    "Right to change legal gender": "legal-gender"
})


In [145]:
cleaned_long_df

Unnamed: 0,country,country-code,year,Censorship of LGBT+ issues (historical)_censorship,LGBT+ employment discrimination (historical)_employment,Gini coefficient_gini,990179-annotations_gini,"GDP per capita, PPP (constant 2021 international $)_gdp",Public spending on education as a share of GDP_education,Gender-affirming care (historical)_gendercare,Same-sex marriage (historical)_marriage,Right to change legal gender (historical)_legalgender
0,Afghanistan,AFG,1971,,,,,,,,Banned,
1,Afghanistan,AFG,1972,,,,,,,,Banned,
2,Afghanistan,AFG,1973,,,,,,,,Banned,
3,Afghanistan,AFG,1974,,,,,,,,Banned,
4,Afghanistan,AFG,1975,,,,,,,,Banned,
...,...,...,...,...,...,...,...,...,...,...,...,...
10977,Zimbabwe,ZWE,2021,Imprisonment as punishment,,,,3184.7854,,,Banned,
10978,Zimbabwe,ZWE,2022,Imprisonment as punishment,,,,3323.1218,,,Banned,
10979,Zimbabwe,ZWE,2023,Imprisonment as punishment,,,,3442.2512,,,Banned,
10980,Zimbabwe,ZWE,2024,Imprisonment as punishment,,,,,,,Banned,


In [146]:
# EDA: Non-null counts, percentages, and data types
eda_summary = pd.DataFrame({
    "Non-Null Count": cleaned_long_df.notnull().sum(),
    "Total Rows": len(cleaned_long_df),
    "Non-Null %": cleaned_long_df.notnull().mean() * 100,
    "Data Type": cleaned_long_df.dtypes
})


eda_summary = eda_summary.sort_values(by="Non-Null Count", ascending=False)
eda_summary["Non-Null %"] = eda_summary["Non-Null %"].round(1)
eda_summary.reset_index(inplace=True)
eda_summary.rename(columns={"index": "Column"}, inplace=True)


eda_summary


Unnamed: 0,Column,Non-Null Count,Total Rows,Non-Null %,Data Type
0,country,10982,10982,100.0,object
1,year,10982,10982,100.0,int64
2,country-code,10210,10982,93.0,object
3,"GDP per capita, PPP (constant 2021 internation...",7063,10982,64.3,float64
4,Public spending on education as a share of GDP...,5676,10982,51.7,float64
5,Same-sex marriage (historical)_marriage,4464,10982,40.6,object
6,Right to change legal gender (historical)_lega...,3069,10982,27.9,object
7,Gender-affirming care (historical)_gendercare,2880,10982,26.2,object
8,Gini coefficient_gini,2285,10982,20.8,float64
9,LGBT+ employment discrimination (historical)_e...,1578,10982,14.4,object


In [147]:
cleaned_long_df.to_csv("Clean_Data/../LGBTIQ-rights_clean.csv", index=False)
