In [1]:
import pandas as pd
import os
import numpy as np
from plotly import graph_objects as po

import udacity_data_science_blog.utils as utils

In [2]:
# Read one survey CSV per year (2013 through 2024) from ./data, relative to
# the current working directory, and key the resulting frames by year.
data = dict(
    (year, pd.read_csv(f"{os.getcwd()}/data/{year}.csv"))
    for year in range(2013, 2025)
)

  year: pd.read_csv(f"{os.getcwd()}/data/{year}.csv") for year in range(2013, 2025)
  year: pd.read_csv(f"{os.getcwd()}/data/{year}.csv") for year in range(2013, 2025)
  year: pd.read_csv(f"{os.getcwd()}/data/{year}.csv") for year in range(2013, 2025)


### Programming languages

In [10]:
# Per survey year, list the columns related to programming languages.
# NOTE(review): judging by the displayed result, the helper appears to match
# the keyword case-insensitively inside column names - confirm in utils.
utils.search_for_columns_with_keyword(keywords=["language"], data=data)

{2013: ['Which of the following languages or technologies have you used significantly in the past year?'],
 2014: ['Which of the following languages or technologies have you used significantly in the past year?'],
 2017: ['HaveWorkedLanguage', 'WantWorkLanguage'],
 2018: ['LanguageWorkedWith', 'LanguageDesireNextYear'],
 2019: ['LanguageWorkedWith', 'LanguageDesireNextYear'],
 2020: ['LanguageDesireNextYear', 'LanguageWorkedWith'],
 2021: ['LanguageHaveWorkedWith', 'LanguageWantToWorkWith'],
 2022: ['LanguageHaveWorkedWith', 'LanguageWantToWorkWith'],
 2023: ['LanguageHaveWorkedWith', 'LanguageWantToWorkWith'],
 2024: ['LanguageHaveWorkedWith', 'LanguageWantToWorkWith', 'LanguageAdmired']}

In [None]:
# Per survey year, list the columns related to the respondent's branch
# (the displayed result shows "MainBranch" is available from 2019 onward).
utils.search_for_columns_with_keyword(keywords=["branch"], data=data)

{2019: ['MainBranch'],
 2020: ['MainBranch'],
 2021: ['MainBranch'],
 2022: ['MainBranch', 'TBranch'],
 2023: ['MainBranch', 'TBranch'],
 2024: ['MainBranch', 'TBranch']}

In [5]:
# Name of the "languages worked with" column for each survey year used in the
# trend analysis; the column is called differently from 2021 onward.
language_column_map = {
    **dict.fromkeys((2019, 2020), "LanguageWorkedWith"),
    **dict.fromkeys((2021, 2022, 2023, 2024), "LanguageHaveWorkedWith"),
}

# Collapse the raw "MainBranch" answers into coarser respondent categories.
# Each category is listed once together with every answer text that maps to it.
branch_category_map = {
    answer: category
    for category, answers in (
        ("Professional developer", ("I am a developer by profession",)),
        (
            "Part of work",
            (
                "I am not primarily a developer, but I write code sometimes as part of my work",
                "I am not primarily a developer, but I write code sometimes as part of my work/studies",
            ),
        ),
        (
            "Learning/Hobby",
            (
                "I code primarily as a hobby",
                "I am learning to code",
                "I am a student who is learning to code",
            ),
        ),
        (
            "Former developer",
            ("I used to be a developer by profession, but no longer am",),
        ),
    )
    for answer in answers
}

# Languages whose usage trend is tracked across survey years.
top_5_languages = ["HTML/CSS", "JavaScript", "SQL", "Java", "Python"]

# Per-year summaries are collected in a list and concatenated once after the
# loop, rather than growing a DataFrame on every iteration.
yearly_language_frames = []

for year, language_column in language_column_map.items():
    # One row per (respondent, language) for the two branches being compared.
    # Built as a single chain so no column is ever assigned on a filtered
    # slice of another frame.
    language_mentions = (
        data[year][[language_column, "MainBranch"]]
        .dropna()
        .rename(columns={language_column: "Language", "MainBranch": "Branch"})
        .assign(Branch=lambda frame: frame["Branch"].replace(branch_category_map))
        .loc[
            lambda frame: frame["Branch"].isin(
                ["Professional developer", "Learning/Hobby"]
            )
        ]
        # Answers are ";"-separated lists; split them so explode() yields one
        # language per row.
        .assign(Language=lambda frame: frame["Language"].str.split(";"))
        .explode("Language")
        .reset_index(drop=True)
    )

    language_data = (
        language_mentions.groupby(by=["Branch", "Language"])
        .size()
        .reset_index(name="Count")
    )
    # "Total" is the number of language mentions per branch over ALL languages
    # (it is computed before the top-5 filter below), so "Proportion" is a
    # share of mentions within the branch, not a share of respondents.
    language_data["Total"] = language_data.groupby(by=["Branch"])["Count"].transform(
        "sum"
    )
    language_data["Proportion"] = language_data["Count"] / language_data["Total"]

    # Keep only the tracked languages and tag the rows with the survey year.
    language_data = language_data[
        language_data["Language"].isin(top_5_languages)
    ].assign(Year=year)
    yearly_language_frames.append(language_data)

# Long-format result with columns Branch, Language, Count, Total, Proportion
# and Year, on a fresh 0..n-1 index.
all_language_data = pd.concat(yearly_language_frames, ignore_index=True)

In [8]:
def build_language_trend_figure(
    language_summary: pd.DataFrame, branch: str, title: str
) -> po.Figure:
    """Build a line chart of yearly language proportions for one branch.

    Args:
        language_summary: Frame with "Branch", "Language", "Year" and
            "Proportion" columns, such as ``all_language_data``.
        branch: Value of the "Branch" column whose rows are plotted.
        title: Title shown above the figure.

    Returns:
        A plotly figure with one "lines+markers" trace per entry of
        ``top_5_languages``, with "Year" on the x axis and "Proportion"
        on the y axis.
    """
    fig = po.Figure()
    # Select the branch once instead of re-filtering it for every language.
    branch_rows = language_summary[language_summary["Branch"] == branch]

    for language in top_5_languages:
        language_rows = branch_rows[branch_rows["Language"] == language]
        fig.add_trace(
            po.Scatter(
                x=language_rows["Year"],
                y=language_rows["Proportion"],
                mode="lines+markers",
                name=language,
                text=language_rows["Language"],
            )
        )

    fig.update_layout(
        title=title,
        xaxis_title="Year",
        yaxis_title="Proportion",
        legend_title="Languages",
    )
    return fig


fig_learning = build_language_trend_figure(
    all_language_data,
    branch="Learning/Hobby",
    title="Trend of Programming Languages Used by Learning/Hobby Developers",
)

fig_pro = build_language_trend_figure(
    all_language_data,
    branch="Professional developer",
    title="Trend of Programming Languages Used by Professional Developers",
)


fig_learning.show()
fig_pro.show()

In [9]:
# Export both trend figures as PNG files. The output directory is created
# first so the export also works on a fresh checkout with no images/ folder.
os.makedirs("images", exist_ok=True)
fig_learning.write_image("images/languages_learning.png")
fig_pro.write_image("images/languages_pro.png")