In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw exports of the PyHEP 2020 pre- and post-workshop surveys.
PRE_SURVEY_CSV = "~/storage/data/physicists/pyhep2020-preworkshop-survey.csv"
POST_SURVEY_CSV = "~/storage/data/physicists/pyhep2020-postworkshop-survey.csv"

pre = pd.read_csv(PRE_SURVEY_CSV)
post = pd.read_csv(POST_SURVEY_CSV)

In [3]:
# Show every question (column header) of the pre-workshop survey.
list(pre.columns)

['Submitter',
 'Submitter Email',
 'Submission Date',
 'PyHEP feedback: Atlantic: 15:00 CET, 06:00 PDT, 18:30 IST, 22:00 JST',
 'PyHEP feedback: Pacific: 00:00 CET, 15:00 PDT, 03:30 IST, 07:00 JST',
 'PyHEP feedback: Indian Ocean: 09:00 CET, 00:00 PDT, 12:30 IST, 16:00 JST',
 'PyHEP feedback: In what country do you currently reside?',
 'PyHEP feedback: How did you hear about this workshop?',
 'PyHEP feedback: Do you want to clarify how you found out about this workshop?',
 'PyHEP feedback: What are you hoping to learn from this workshop?',
 'PyHEP feedback: Do you want to clarify what you are hoping to learn from this workshop?',
 'Personal demographics: What best describes your gender?',
 'Personal demographics: What is your country of origin?',
 'Personal demographics: If you like, you may describe your race or ethnicity here.',
 'Professional life: What best describes your occupation?',
 'Professional life: Do you want to clarify your occupation?',
 'Professional life: What best des

In [4]:
# Show every question (column header) of the post-workshop survey.
list(post.columns)

['Submitter',
 'Submitter Email',
 'Submission Date',
 'Correlation with the pre-workshop survey: Did you submit a response to the pre-workshop survey?',
 'Correlation with the pre-workshop survey: In what country or continent do you currently reside?',
 'Correlation with the pre-workshop survey: What best describes the stage of your professional career?',
 'Correlation with the pre-workshop survey: What operating system(s) run on the computer you most often use for work? Check all that apply.',
 'Correlation with the pre-workshop survey: Which text editors/IDEs do you use regularly (i.e. more than 10% of your work)?',
 'PyHEP 2020 content: How would you rate the level of the talks, relative to your expectations?',
 'PyHEP 2020 content: Which talks/topics were presented in a way that was too basic?',
 'PyHEP 2020 content: Which talks/topics were presented in a way that was too advanced?',
 'PyHEP 2020 content: How would you rate the variety of topics, relative to your expectations?',
 

In [5]:
# Only post-workshop respondents who answered exactly "Yes" to having filled
# in the pre-workshop survey are candidates for pre/post matching.
post["try_correlation"] = post[
    'Correlation with the pre-workshop survey: Did you submit a response to the pre-workshop survey?'
].eq("Yes")

In [6]:
# Free-text country answers from the pre-workshop survey, keyed exactly as the
# respondents typed them (including typos and case variants), mapped to the
# coarse region label used for matching against the post-workshop survey.
# Built once at definition time instead of on every lookup.
#
# NOTE(review): "Pakistan", "Kuwait" and "Oman" map to "Anywhere else in the
# world" although they are geographically in Asia, and "Honduras" maps to
# "South America" although it is in Central America. Left unchanged because the
# intended grouping cannot be confirmed from here -- TODO confirm against the
# post-workshop answer choices.
_COUNTRY_TO_CORRELATION_REGION = {
    "Australia": "Anywhere else in the world",
    "Austria": "Anywhere else in Europe (including Russia)",
    "Belarus": "Anywhere else in Europe (including Russia)",
    "Belgium": "Anywhere else in Europe (including Russia)",
    "Brasil": "South America",
    "Brazil": "South America",
    "CH": "Switzerland",
    "Canada": "Anywhere else in North America",
    "Canada (Montreal)": "Anywhere else in North America",
    "Canada (east)": "Anywhere else in North America",
    "China": "Anywhere else in Asia",
    "Colombia": "South America",
    "Czech Rep.": "Anywhere else in Europe (including Russia)",
    "Czech Republic": "Anywhere else in Europe (including Russia)",
    "Czech republic": "Anywhere else in Europe (including Russia)",
    "Czechia": "Anywhere else in Europe (including Russia)",
    "Denmark": "Anywhere else in Europe (including Russia)",
    "EGYPT": "Anywhere else in the world",
    "Ecuador": "South America",
    "Egypt": "Anywhere else in the world",
    "Finland": "Anywhere else in Europe (including Russia)",
    "France": "France",
    "France (CERN-based)": "France",
    "Germany": "Germany",
    "Greece": "Anywhere else in Europe (including Russia)",
    "Greeve": "Anywhere else in Europe (including Russia)",
    "Honduras": "South America",
    "Hungary": "Anywhere else in Europe (including Russia)",
    "INDIA": "India",
    "India": "India",
    "Indonesia": "Anywhere else in Asia",
    "Iran": "Anywhere else in Asia",
    "Italy": "Italy",
    "Japan": "Anywhere else in Asia",
    "Kuwait": "Anywhere else in the world",
    "MEXICO": "Anywhere else in North America",
    "Mexico": "Anywhere else in North America",
    "México": "Anywhere else in North America",
    "Netherlands": "Anywhere else in Europe (including Russia)",
    "Netherlands. Time slot also dependent on another conference. So need to be able to attend both.": "Anywhere else in Europe (including Russia)",
    "Oman": "Anywhere else in the world",
    "Osaka, Japan": "Anywhere else in Asia",
    "PHILIPPINES": "Anywhere else in Asia",
    "Pakistan": "Anywhere else in the world",
    "Peru": "South America",
    "Philippines": "Anywhere else in Asia",
    "Poland": "Anywhere else in Europe (including Russia)",
    "Portugal": "Anywhere else in Europe (including Russia)",
    "Republic of Korea": "Anywhere else in Asia",
    "Romania": "Anywhere else in Europe (including Russia)",
    "Russia": "Anywhere else in Europe (including Russia)",
    "Russia Federation": "Anywhere else in Europe (including Russia)",
    "S.KOREA": "Anywhere else in Asia",
    "Slovenia": "Anywhere else in Europe (including Russia)",
    "South Africa": "Anywhere else in the world",
    "South Korea": "Anywhere else in Asia",
    "Spain": "Anywhere else in Europe (including Russia)",
    "Stockholm": "Anywhere else in Europe (including Russia)",
    "Switzerland": "Switzerland",
    "Taiwan": "Anywhere else in Asia",
    "Taiwan (R.O.C.)": "Anywhere else in Asia",
    "Thailand": "Anywhere else in Asia",
    "The Netherlands": "Anywhere else in Europe (including Russia)",
    "U.S.": "United States",
    "U.S.A.": "United States",
    "UK": "United Kingdom",
    "US": "United States",
    "US (EST)": "United States",
    "USA": "United States",
    "USA (Chicago)": "United States",
    "USA - Michigan": "United States",
    "Ukraine": "Anywhere else in Europe (including Russia)",
    "United Kingdom": "United Kingdom",
    "United State": "United States",
    "United States": "United States",
    "United States of America": "United States",
    "United Status": "United States",
    "france": "France",
    "greece": "Anywhere else in Europe (including Russia)",
    "india": "India",
    # Fixed: previously "Anywhere else in the world", which disagreed with the
    # capitalized "Iran" entry above and put one country in two regions.
    "iran": "Anywhere else in Asia",
    "italy": "Italy",
    "morocco": "Anywhere else in the world",
    "poland": "Anywhere else in Europe (including Russia)",
    "switzerland": "Switzerland",
    "the U.S": "United States",
}


def country_mapping_for_correlation(x):
    """Map a free-text pre-workshop country answer to its coarse region label.

    Parameters
    ----------
    x : str
        The answer exactly as typed by the respondent. The lookup is an exact,
        case-sensitive match; no trimming or normalization is applied.

    Returns
    -------
    str
        The region label for a recognized answer, or a single space ``" "``
        for anything not listed (including the empty string).
    """
    return _COUNTRY_TO_CORRELATION_REGION.get(x, " ")
# Pre-workshop: normalize the free-text country answer to a region label.
# Series.map replaces the deprecated DataFrame.applymap and avoids the
# one-column-frame / .iloc[:, 0] round trip; the values are the same.
pre["correlate_country"] = (
    pre['PyHEP feedback: In what country do you currently reside?']
    .fillna("")
    .map(country_mapping_for_correlation)
)
# Post-workshop: the answer is used as-is; only missing values become "".
# NOTE(review): presumably the post-survey answers already use the region
# labels produced above -- confirm against the survey form.
post["correlate_country"] = post['Correlation with the pre-workshop survey: In what country or continent do you currently reside?'].fillna("")

In [7]:
def professional_stage_mapping_for_correlation(x):
    """Collapse a pre-workshop career-stage answer into a coarser category.

    Recognized answers are grouped under "Student", "Other", or kept verbatim
    (postdoc and professor). Anything else, including the empty string, yields
    a single space ``" "``.
    """
    grouped_answers = (
        ("Student", (
            "Early grad student (mostly taking courses)",
            "Grad student involved in research",
            "High school student",
            "Undergraduate student",
        )),
        ("Postdoc/fellow/temporary research position", (
            "Postdoc/fellow/temporary research position",
        )),
        ("Professor at a college or university", (
            "Professor at a college or university",
        )),
        ("Other", (
            "In a career outside of particle physics",
            "Other, not listed above",
            "Research or management at a laboratory/college/university",
            "Retired/emeritus",
        )),
    )
    # Invert the grouping into an answer -> category lookup table.
    category_by_answer = {
        answer: category
        for category, answers in grouped_answers
        for answer in answers
    }
    try:
        return category_by_answer[x]
    except KeyError:
        return " "
# Pre-workshop: collapse the detailed career stage into the coarse category.
# Series.map replaces the deprecated DataFrame.applymap; the values are the same.
pre["correlate_professional_stage"] = (
    pre['Professional life: What best describes the stage of your professional career?']
    .fillna("")
    .map(professional_stage_mapping_for_correlation)
)
# Post-workshop: the answer is used as-is; only missing values become "".
post["correlate_professional_stage"] = post['Correlation with the pre-workshop survey: What best describes the stage of your professional career?'].fillna("")

In [8]:
def operating_system_mapping_for_correlation(x):
    """Normalize a pre-workshop operating-system answer.

    Listed answers are reduced to a combination of "Windows", "MacOS" and
    "Linux" (extras such as "Other UNIX-like", "None" and "I don't know" are
    dropped for the listed combinations only). Any answer not listed below,
    including the empty string, yields a single space ``" "``.
    """
    raw_answers_by_result = {
        "Linux": frozenset((
            "Linux",
            "Linux; None (e.g. VT520)",
            "Linux; Other UNIX-like (e.g. FreeBSD)",
        )),
        "MacOS": frozenset(("MacOS",)),
        "MacOS; Linux": frozenset(("MacOS; Linux",)),
        "Windows": frozenset(("Windows",)),
        "Windows; Linux": frozenset((
            "Windows; Linux",
            "Windows; Linux; I don't know",
        )),
        "Windows; MacOS": frozenset(("Windows; MacOS",)),
        "Windows; MacOS; Linux": frozenset((
            "Windows; MacOS; Linux",
            "Windows; MacOS; Linux; Other UNIX-like (e.g. FreeBSD)",
        )),
    }
    for result, raw_answers in raw_answers_by_result.items():
        if x in raw_answers:
            return result
    return " "
# Pre-workshop: normalize the operating-system answer.
# Series.map replaces the deprecated DataFrame.applymap; the values are the same.
pre["correlate_operating_system"] = (
    pre['Computing and programming: What operating system(s) does it run?']
    .fillna("")
    .map(operating_system_mapping_for_correlation)
)
# Post-workshop: the answer is used as-is; only missing values become "".
post["correlate_operating_system"] = post['Correlation with the pre-workshop survey: What operating system(s) run on the computer you most often use for work? Check all that apply.'].fillna("")

In [9]:
def _rank_editors_by_popularity(responses):
    """Return editor labels ordered from most to least frequently mentioned.

    Parameters
    ----------
    responses : iterable of str
        One "; "-separated editor list per respondent ("" for no answer).

    Returns
    -------
    list of str
        Editor labels sorted by descending mention count; ties are broken by
        descending label (the same order the original count-sort-then-reverse
        produced). The list always ends with "".

    Notes
    -----
    Any name that is not one of the listed editors is counted as "Other".
    This includes the empty string produced by splitting a blank response, so
    blank responses add to the "Other" count.
    """
    listed_editors = (
        "Atom",
        "Emacs (or variant: XEmacs, Aquamacs, Carbon Emacs)",
        "Jupyter/JupyterLab",
        "Notepad or Notepad++",
        "pico or nano",
        "PyCharm",
        "Sublime Text",
        "Vi or Vim",
        "Visual Studio",
        "VSCode",
        "XCode",
    )
    mention_counts = {}
    for response in responses:
        for editor in response.split("; "):
            label = editor if editor in listed_editors else "Other"
            mention_counts[label] = mention_counts.get(label, 0) + 1
    ranked = sorted(
        mention_counts.items(),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    # "" goes last so that a blank answer can still be found with
    # editor_order.index("").
    return [label for label, _ in ranked] + [""]


# Popularity ranking of editors among pre-workshop respondents. Plain Python
# counting replaces the deprecated DataFrame.applymap + explode + np.unique
# one-liner; the resulting order is the same.
editor_order = _rank_editors_by_popularity(
    pre['Computing and programming: Which text editors/IDEs do you use regularly (i.e. more than 10% of your work)?'].fillna("")
)
editor_order

['Vi or Vim',
 'Jupyter/JupyterLab',
 'Emacs (or variant: XEmacs, Aquamacs, Carbon Emacs)',
 'VSCode',
 'Other',
 'Sublime Text',
 'Atom',
 'PyCharm',
 'Notepad or Notepad++',
 'pico or nano',
 'Visual Studio',
 'XCode',
 '']

In [10]:
def editor_mapping_for_correlation(x):
    """Normalize a pre-workshop "; "-separated editor answer.

    A blank answer yields a single space ``" "``. Otherwise each name that is
    not one of the listed editors becomes "Other" (repeats are kept), the
    names are ordered by their position in the global ``editor_order``, and
    the result is re-joined with "; ".
    """
    if x == "":
        return " "

    listed_editors = (
        "Atom",
        "Emacs (or variant: XEmacs, Aquamacs, Carbon Emacs)",
        "Jupyter/JupyterLab",
        "Notepad or Notepad++",
        "pico or nano",
        "PyCharm",
        "Sublime Text",
        "Vi or Vim",
        "Visual Studio",
        "VSCode",
        "XCode",
    )
    labels = []
    for editor in x.split("; "):
        labels.append(editor if editor in listed_editors else "Other")
    # Stable sort by rank in editor_order.
    labels.sort(key=lambda label: editor_order.index(label))
    return "; ".join(labels)
# Normalized, popularity-ordered editor list for each pre-workshop response.
# Series.map replaces the deprecated DataFrame.applymap; the values are the same.
pre["correlate_editor"] = (
    pre['Computing and programming: Which text editors/IDEs do you use regularly (i.e. more than 10% of your work)?']
    .fillna("")
    .map(editor_mapping_for_correlation)
)
def editor_mapping_for_correlation2(x):
    """Reorder a post-workshop "; "-separated editor answer by ``editor_order``.

    Names are not rewritten; each one must already appear in the global
    ``editor_order`` (otherwise ``list.index`` raises ``ValueError``). A blank
    answer stays "".
    """
    editors = x.split("; ")
    # Stable sort by rank in editor_order.
    editors.sort(key=lambda name: editor_order.index(name))
    return "; ".join(editors)
# Popularity-ordered editor list for each post-workshop response.
# Series.map replaces the deprecated DataFrame.applymap; the values are the same.
post["correlate_editor"] = (
    post['Correlation with the pre-workshop survey: Which text editors/IDEs do you use regularly (i.e. more than 10% of your work)?']
    .fillna("")
    .map(editor_mapping_for_correlation2)
)

In [52]:
# Responses are paired on country, career stage and operating system.
# "correlate_editor" is deliberately not part of the key: editor answers are
# compared by overlap in a later cell instead of by exact equality.
correlation_keys = [
    "correlate_country",
    "correlate_professional_stage",
    "correlate_operating_system",
]

pre_indexed = pre.set_index(correlation_keys)
# Only post-workshop respondents flagged in "try_correlation" are considered.
post_indexed = post.loc[post["try_correlation"]].set_index(correlation_keys)

In [53]:
# Pre-survey columns that must be present for a merged row to be kept. After
# the right join, post responses with no pre-survey counterpart have NaN here.
# The editor question is intentionally not in this list.
required_pre_columns = [
    'PyHEP feedback: In what country do you currently reside?',
    'Professional life: What best describes the stage of your professional career?',
    'Computing and programming: What operating system(s) does it run?',
]

correlated = (
    pre_indexed
    .merge(post_indexed, how="right", left_index=True, right_index=True)
    .dropna(subset=required_pre_columns)
)

In [54]:
def editor_mapping_overlap(row):
    """Return True when the pre and post editor lists share at least one editor.

    ``row`` must support ``row["correlate_editor_x"]`` and
    ``row["correlate_editor_y"]``, each a "; "-separated string. The blank
    placeholders "" and " " are ignored and never count as a shared editor.
    """
    placeholders = {"", " "}
    pre_editors = set(row["correlate_editor_x"].split("; ")) - placeholders
    post_editors = set(row["correlate_editor_y"].split("; ")) - placeholders
    return not pre_editors.isdisjoint(post_editors)

# Keep only candidate pairs whose editor answers share at least one editor.
has_shared_editor = correlated[["correlate_editor_x", "correlate_editor_y"]].apply(
    editor_mapping_overlap, axis=1
)
correlated = correlated[has_shared_editor]

In [55]:
# Row counts: all post responses, those attempting correlation, candidate pairs.
tuple(len(frame) for frame in (post, post_indexed, correlated))

(179, 151, 326)

In [57]:
# Side-by-side view of the pre (_x) and post (_y) editor answers per candidate pair.
correlated.loc[:, ["correlate_editor_x", "correlate_editor_y"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,correlate_editor_x,correlate_editor_y
correlate_country,correlate_professional_stage,correlate_operating_system,Unnamed: 3_level_1,Unnamed: 4_level_1
Anywhere else in Asia,Postdoc/fellow/temporary research position,Windows; Linux,"Jupyter/JupyterLab; Emacs (or variant: XEmacs,...",Vi or Vim; Jupyter/JupyterLab
Anywhere else in Asia,Postdoc/fellow/temporary research position,Windows; Linux,"Jupyter/JupyterLab; Emacs (or variant: XEmacs,...","Jupyter/JupyterLab; Emacs (or variant: XEmacs,..."
Anywhere else in Asia,Professor at a college or university,Linux,Vi or Vim,Vi or Vim
Anywhere else in Asia,Student,Linux,Vi or Vim,Vi or Vim
Anywhere else in Asia,Student,Linux,Vi or Vim; Atom,Vi or Vim
...,...,...,...,...
United States,Student,Windows; Linux,"Jupyter/JupyterLab; Emacs (or variant: XEmacs,...","Emacs (or variant: XEmacs, Aquamacs, Carbon Em..."
United States,Student,Windows; Linux,Vi or Vim; Notepad or Notepad++,"Emacs (or variant: XEmacs, Aquamacs, Carbon Em..."
United States,Student,Windows; Linux,"Jupyter/JupyterLab; Emacs (or variant: XEmacs,...","Emacs (or variant: XEmacs, Aquamacs, Carbon Em..."
United States,Student,Windows; Linux,Visual Studio,Visual Studio


In [None]:
# correlated[[
#     "correlate_country",
#     'PyHEP feedback: In what country do you currently reside?',
#     'Correlation with the pre-workshop survey: In what country or continent do you currently reside?',
# ]]

In [None]:
# correlated[[
#     "correlate_professional_stage",
#     'Professional life: What best describes the stage of your professional career?',
#     'Correlation with the pre-workshop survey: What best describes the stage of your professional career?',
# ]]

In [None]:
# correlated[[
#     "correlate_operating_system",
#     'Computing and programming: What operating system(s) does it run?',
#     'Correlation with the pre-workshop survey: What operating system(s) run on the computer you most often use for work? Check all that apply.'
# ]]

In [None]:
# Side-by-side view of the pre (_x) and post (_y) editor answers per candidate pair.
correlated.loc[:, ["correlate_editor_x", "correlate_editor_y"]]