In [None]:
import os
from enum import Enum
from pathlib import Path

import pandas as pd


class Column(Enum):
    """Uniform column names shared by every survey year after normalization.

    Each member's value is the column name used in the normalized frames.
    Member order matters: code that iterates ``for col in Column`` relies on
    it to keep selected and renamed columns aligned.
    """

    # Respondent's salary / compensation as reported by the survey
    SALARY = "salary"
    # Country name as it appears in the raw survey
    COUNTRY = "country"
    # Country code derived from COUNTRY; no raw survey column maps to it
    COUNTRY_CODE = "country_code"
    YEARS_OF_EXPERIENCE = "years_of_experience"
    AGE = "age"
    # Delimiter-separated list of languages until expanded to one per row
    PROGRAMMING_LANGUAGE = "language"


# Location of the raw survey export for each year, relative to the notebook.
paths_to_csv_by_year = {
    "2015": Path("data/raw/2015 Stack Overflow Developer Survey Responses.csv"),
    "2016": Path("data/raw/2016 Stack Overflow Survey Results/2016 Stack Overflow Survey Responses.csv"),
    "2017": Path("data/raw/stack-overflow-developer-survey-2017/survey_results_public.csv"),
    "2018": Path("data/raw/stack-overflow-developer-survey-2018/survey_results_public.csv"),
    "2019": Path("data/raw/stack-overflow-developer-survey-2019/survey_results_public.csv"),
    "2020": Path("data/raw/stack-overflow-developer-survey-2020/survey_results_public.csv"),
    "2021": Path("data/raw/stack-overflow-developer-survey-2021/survey_results_public.csv"),
    "2022": Path("data/raw/stack-overflow-developer-survey-2022/survey_results_public.csv"),
    "2023": Path("data/raw/stack-overflow-developer-survey-2023/survey_results_public.csv"),
    "2024": Path("data/raw/stack-overflow-developer-survey-2024/survey_results_public.csv"),
}

# TODO: if not found, supply URL and download them
# Until then, check every path up front so that all missing files are reported
# together instead of failing on the first unreadable CSV part-way through.
missing_paths = [str(path) for path in paths_to_csv_by_year.values() if not path.is_file()]
if missing_paths:
    raise FileNotFoundError(
        "Missing survey CSV file(s):\n  " + "\n  ".join(missing_paths)
    )

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types
# (which is what triggers pandas' DtypeWarning on wide survey exports).
dfs_by_year = {
    year: pd.read_csv(path, low_memory=False)
    for year, path in paths_to_csv_by_year.items()
}

  dfs_by_year = {year: pd.read_csv(path) for year, path in paths_to_csv_by_year.items()}
  dfs_by_year = {year: pd.read_csv(path) for year, path in paths_to_csv_by_year.items()}


## Unify the datasets

- rename columns to uniform names
- convert years/age ranges (e.g. "20 to 30 years old") to values (using interval midpoints)
- remove rows that have missing values for salary, programming languages, country or years of experience
- approximate age using years of experience if the column is missing (in the 2017 survey)

In [57]:
def _build_column_map(salary, country, years_of_experience, age, language):
    """Return the ``Column`` -> raw survey column name mapping for one year.

    A value of ``None`` means the survey has no source column for that field.
    ``Column.COUNTRY_CODE`` is always ``None`` because no survey ships it.
    Keys are inserted in ``Column`` declaration order.
    """
    return {
        Column.SALARY: salary,
        Column.COUNTRY: country,
        Column.COUNTRY_CODE: None,
        Column.YEARS_OF_EXPERIENCE: years_of_experience,
        Column.AGE: age,
        Column.PROGRAMMING_LANGUAGE: language,
    }


# The survey schema is stable from 2021 onwards, so those years share one map.
column_map_2021_and_later = _build_column_map(
    salary="ConvertedCompYearly",
    country="Country",
    years_of_experience="YearsCode",
    age="Age",
    language="LanguageHaveWorkedWith",
)

colum_map_by_year = {
    "2016": _build_column_map(
        salary="salary_midpoint",
        country="country",
        years_of_experience="experience_midpoint",
        age="age_midpoint",
        language="tech_do",
    ),
    "2017": _build_column_map(
        salary="Salary",
        country="Country",
        years_of_experience="YearsProgram",
        age=None,  # Approximated YearsProgram + 23
        language="HaveWorkedLanguage",
    ),
    "2018": _build_column_map(
        salary="ConvertedSalary",
        country="Country",
        years_of_experience="YearsCoding",
        age="Age",
        language="LanguageWorkedWith",
    ),
    "2019": _build_column_map(
        salary="ConvertedComp",
        country="Country",
        years_of_experience="YearsCode",
        age="Age",
        language="LanguageWorkedWith",
    ),
    "2020": _build_column_map(
        salary="ConvertedComp",
        country="Country",
        years_of_experience="YearsCode",
        age="Age",
        language="LanguageWorkedWith",
    ),
    "2021": column_map_2021_and_later,
    "2022": column_map_2021_and_later,
    "2023": column_map_2021_and_later,
    "2024": column_map_2021_and_later,
}

### Convert years/age ranges to values

In [58]:
import re

def convert_years_and_age(df: pd.DataFrame):
    """Convert the years-of-experience and age columns of ``df`` to numbers.

    Survey answers come as plain numbers, as ranges ("1 to 2 years",
    "25-34 years old") or as open-ended buckets ("20 or more years",
    "More than 50 years", "Less than 1 year"). Each answer is mapped to a
    single float:

    * plain number (int, float or numeric string) -> that number
    * "N year(s) ..."                             -> N
    * "A to B" / "A - B"                          -> midpoint of A and B
    * "N and more" / "N or more" / "More than N"  -> N + 1
    * "Less than N" / "Less than a ..."           -> N / 2 (midpoint of 0..N)
    * anything else, or a missing value           -> None (NaN in the frame)

    Args:
        df: Frame holding the ``Column.YEARS_OF_EXPERIENCE`` and
            ``Column.AGE`` columns. Both columns are overwritten in place.

    Returns:
        The same ``df`` object, with the two columns converted.
    """
    def convert_to_numeric(value):
        if pd.isnull(value):
            return None
        if isinstance(value, (int, float)):  # If already a number, return it
            return float(value)
        value = str(value).strip()

        # Match patterns and convert accordingly
        if re.match(r'^\d+(?:\.\d+)?$', value):
            # Numeric strings, including decimals such as "29.5"
            return float(value)
        elif match := re.match(r'^(\d+)\s*years?', value):
            return float(match.group(1))
        elif match := re.match(r'^(\d+)\s*to\s*(\d+)', value):
            return (float(match.group(1)) + float(match.group(2))) / 2
        elif match := re.match(r'^(\d+)[\s\-]+(\d+)', value):
            return (float(match.group(1)) + float(match.group(2))) / 2
        elif match := re.match(r'^(\d+)\s*(?:and|or)\s*more', value):
            return float(match.group(1)) + 1
        elif match := re.match(r'^more\s+than\s+(\d+)', value, re.IGNORECASE):
            # Open-ended upper bucket: same "+1" convention as "N or more"
            return float(match.group(1)) + 1
        elif match := re.match(r'^less\s+than\s+(?:(\d+)|an?)\b', value, re.IGNORECASE):
            # Open-ended lower bucket: midpoint between 0 and the bound;
            # "a"/"an" (as in "Less than a year") counts as 1
            upper_bound = float(match.group(1)) if match.group(1) else 1.0
            return upper_bound / 2
        else:
            return None

    # Apply conversion to specified columns
    df[Column.YEARS_OF_EXPERIENCE.value] = df[Column.YEARS_OF_EXPERIENCE.value].apply(convert_to_numeric)
    df[Column.AGE.value] = df[Column.AGE.value].apply(convert_to_numeric)

    return df

### Extract uniform country codes

In [59]:
import pycountry
import country_converter as coco
import logging

# Single shared converter instance, reused by every country lookup below.
# NOTE(review): include_obsolete=True presumably lets it match countries that
# no longer exist - confirm against the country_converter documentation.
country_converter = coco.CountryConverter(include_obsolete=True)

# Do not log warnings from coco (if match is not found)
# Raises the root logger threshold to ERROR; force=True replaces any handlers
# already installed, so this takes effect even if logging was configured before.
logging.basicConfig(level=logging.ERROR, force=True)

# Function to get ISO Alpha-2 code
def get_country_code(country_name: str):
    """Return the ISO alpha-2 code for ``country_name``, or ``None``.

    The name is first looked up with pycountry (fast, strict) and, if that
    fails, with the shared ``country_converter`` (slower, more tolerant).

    Args:
        country_name: Country name as written in the survey. Surrounding
            whitespace is ignored.

    Returns:
        The two-letter country code as a string, or ``None`` when the input
        is missing (``None``/NaN/any non-string), blank, or cannot be matched
        to exactly one country.
    """
    # Missing values reach here as None or float NaN; neither can be matched
    if not isinstance(country_name, str):
        return None

    country_name = country_name.strip()
    if not country_name:
        return None

    try:
        # Faster but less robust pycountry matcher
        return pycountry.countries.lookup(country_name).alpha_2
    except LookupError:
        # Fallback to coco which is more robust but slower
        country = country_converter.convert(names=country_name, to='ISO2', not_found='NOT-FOUND')
        # Only a single string code is usable downstream.
        # NOTE(review): coco appears to return a list for ambiguous names -
        # confirm against the country_converter documentation.
        if not isinstance(country, str) or country == 'NOT-FOUND':
            return None
        return country


### Apply everything on the datasets and save the results

In [60]:
save_path = Path("data/extracted")
os.makedirs(save_path, exist_ok=True)

# Do not log warnings from coco (if match is not found)
logging.basicConfig(level=logging.ERROR, force=True)

# A row is only useful when all of these are present
required_columns = [
    Column.SALARY.value,
    Column.PROGRAMMING_LANGUAGE.value,
    Column.COUNTRY.value,
    Column.YEARS_OF_EXPERIENCE.value,
]

dfs_normalized_by_year = {}

for year, df in dfs_by_year.items():
    # Years without a column map cannot be normalized; skip them
    if year not in colum_map_by_year:
        continue

    # Extract relevant columns (those the survey actually has) and pair each
    # raw name with its uniform name; both lists follow Column order
    column_map = colum_map_by_year[year]
    columns = [column_map[col] for col in Column if column_map[col] is not None]
    uniform_names = [col.value for col in Column if column_map[col] is not None]

    # Select, rename and remove rows that don't have salary, language, country
    # or years of experience. The explicit copy detaches the result from the
    # raw frame so the column assignments below neither write into a slice of
    # it nor raise SettingWithCopyWarning.
    df = (
        df[columns]
        .rename(columns=dict(zip(columns, uniform_names)))
        .dropna(subset=required_columns)
        .copy()
    )

    # Year 2017 does not have age column, set values to None
    if year == "2017":
        df[Column.AGE.value] = None

    # Convert y.o.e and age from range to numeric
    df = convert_years_and_age(df)

    # Answers the converter cannot parse become NaN only at this point, after
    # the first dropna, so drop rows without usable experience once more.
    # Otherwise NaN leaks into the age approximation and the saved files.
    df = df.dropna(subset=[Column.YEARS_OF_EXPERIENCE.value]).copy()
    print(f"Extracted {len(df)} rows for year {year}")

    # Approximate age with years of experience + 23 if it's nan
    df[Column.AGE.value] = df[Column.AGE.value].fillna(df[Column.YEARS_OF_EXPERIENCE.value] + 23)

    # Standardize country names e.g. [USA, United States, ...] to country codes (US).
    # Resolve each distinct name once and map the result back: the lookup is
    # slow, and a survey has far fewer distinct countries than rows.
    country_codes = {name: get_country_code(name) for name in df[Column.COUNTRY.value].unique()}
    df[Column.COUNTRY_CODE.value] = df[Column.COUNTRY.value].map(country_codes)

    # Remove rows with no country code (couldn't be matched, there's just 10-20 of them)
    df = df.dropna(subset=[Column.COUNTRY_CODE.value])

    df.to_csv(save_path / f"{year}.csv", index=False)
    dfs_normalized_by_year[year] = df


Extracted 40904 rows for year 2016
Extracted 12120 rows for year 2017


  df[Column.AGE.value] = df[Column.AGE.value].fillna(df[Column.YEARS_OF_EXPERIENCE.value] + 23)


Extracted 46467 rows for year 2018
Extracted 55537 rows for year 2019
Extracted 33333 rows for year 2020
Extracted 46329 rows for year 2021
Extracted 37891 rows for year 2022
Extracted 47820 rows for year 2023
Extracted 23309 rows for year 2024


## Extend over language column

Now the `language` column contains multiple programming languages joined by a delimiter, e.g. "Python; R; SQL". We will split each row into multiple rows, each containing a single language.

In [61]:
from typing import Optional


def parse_language(lang: str) -> Optional[str]:
    """Clean up one language token and filter out non-languages.

    Args:
        lang: A single entry from a survey's language list; surrounding
            whitespace is stripped.

    Returns:
        The stripped name, or ``None`` when the entry is one of the excluded
        names (exact, case-sensitive match) or contains "bash" or "html" in
        any letter case.
    """
    name = lang.strip()
    lowered = name.lower()

    # Let's be real, these are not real programming languages
    excluded_names = ("CSS", "SQL", "SQL Server", "MongoDB", "Node.js")

    is_excluded = name in excluded_names or "bash" in lowered or "html" in lowered
    return None if is_excluded else name

def expand_by_language(df: pd.DataFrame):
    """Expand ``df`` so that every row holds exactly one programming language.

    The language column is split on ';', each piece is cleaned with
    ``parse_language``, and pieces it rejects (returns ``None`` for) are
    dropped. All other columns are repeated unchanged for each language.

    Args:
        df: Frame whose ``Column.PROGRAMMING_LANGUAGE`` column holds
            ';'-separated language lists. It is not modified.

    Returns:
        A new frame with the same columns as ``df``, also when no row
        survives. Rows keep their original index label, so labels repeat
        once per language.
    """
    language_col = Column.PROGRAMMING_LANGUAGE.value

    # Split into lists, then let pandas fan each list out into rows. This does
    # the same as a row-by-row Python loop, without copying every row by hand.
    expanded_df = df.assign(**{language_col: df[language_col].str.split(";")}).explode(language_col)

    # Clean each single language; na_action="ignore" skips missing entries
    # instead of handing NaN to parse_language
    expanded_df[language_col] = expanded_df[language_col].map(parse_language, na_action="ignore")

    # Rejected and missing entries are None/NaN at this point
    return expanded_df.dropna(subset=[language_col])

In [62]:
save_path = Path("data/expanded")
save_path.mkdir(parents=True, exist_ok=True)

expanded_df_by_year = {}

# Expand every normalized year to one language per row, report how much it
# grew, keep the result in memory and write it out as <year>.csv
for year, df in dfs_normalized_by_year.items():
    df_expanded = expand_by_language(df)
    print(f"{year}: {len(df)} rows expanded to {len(df_expanded)} rows")

    expanded_df_by_year[year] = df_expanded
    df_expanded.to_csv(save_path / f"{year}.csv", index=False)



2016: 40883 rows expanded to 150780 rows
2017: 12107 rows expanded to 38152 rows
2018: 46428 rows expanded to 180642 rows
2019: 55477 rows expanded to 189266 rows
2020: 33319 rows expanded to 113856 rows
2021: 46311 rows expanded to 166636 rows
2022: 37875 rows expanded to 137698 rows
2023: 47803 rows expanded to 180633 rows
2024: 23305 rows expanded to 90008 rows


## Subset the data to the top `n` languages

- since there are too many unique programming languages, visualizing all of them would result in cluttered graphs
- subset only the top `n` most popular languages

In [63]:
from collections import Counter

# Initialize a Counter to accumulate language occurrences across all years
language_counter = Counter()

# Show how many per-year frames are about to be counted
print(len(expanded_df_by_year))

for year, df in expanded_df_by_year.items():
    # Count the occurrences of each language in the expanded dataframe
    language_counts = df[Column.PROGRAMMING_LANGUAGE.value].value_counts()
    # Update the Counter with the language counts as a dictionary; given a
    # mapping, Counter.update adds its values to the running totals
    language_counter.update(language_counts.to_dict())

# After processing all years, build (language, count) pairs ordered from the
# most to the least frequent language
sorted_language_counts = language_counter.most_common()

# Number of most popular languages to keep
n = 20
top_n_languages = sorted_language_counts[:n]

# Print the top n languages
for language, count in top_n_languages:
    print(f"{language}: {count}")


9
JavaScript: 229561
Python: 136597
Java: 118539
C#: 108292
TypeScript: 96040
PHP: 79343
C++: 64345
C: 52790
Go: 33922
Ruby: 29786
Kotlin: 22364
PowerShell: 20826
Swift: 20284
Rust: 19194
R: 15494
Objective-C: 15295
VBA: 13787
Scala: 12689
Assembly: 12199
Dart: 10433


Remove all irrelevant rows from the dataframes (ones that are not related to top 20 programming languages by popularity)

In [68]:

save_path = Path("data/cleaned")
save_path.mkdir(parents=True, exist_ok=True)

# Names only; the counts in top_n_languages are not needed for filtering
top_languages_set = {entry[0] for entry in top_n_languages}

df_clean_by_year = {}

for year, df in expanded_df_by_year.items():
    # Keep only the rows whose language is one of the top n languages
    in_top_languages = df[Column.PROGRAMMING_LANGUAGE.value].isin(top_languages_set)

    # A small share of rows (<0.1% per the original author's note) still has
    # NaN values at this point because of an upstream bug, so drop them too
    df_clean = df[in_top_languages].dropna()
    print(f"{year}: {len(df)} rows filtered to {len(df_clean)} rows")

    df_clean_by_year[year] = df_clean
    df_clean.to_csv(save_path / f"{year}.csv", index=False)
