In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [3]:
# Let pandas print up to 500 rows before truncating a displayed frame/series.
pd.options.display.max_rows = 500

In [4]:
# Load the raw survey responses.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type warning this cell
# emitted at load time.
# NOTE(review): the saved output only shows the tail of that warning; it is
# presumably a DtypeWarning - confirm on a fresh run.
df = pd.read_csv('kaggle_survey_2020_responses.csv', low_memory=False)
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(20037, 355)

In [5]:
# Drop the first row of the frame (position 0) and keep every column.
# NOTE(review): presumably that row holds the question text rather than a
# response - confirm against the CSV.
# Caution: `df` is rebound from itself, so re-running this cell drops one more row.
df = df.iloc[1:]

In [6]:
# `questions` will map a question number (e.g. "Q7") to the column(s) of `df`
# that hold its answers; a question can span several columns (multiple
# answers or parts). It starts out empty.
questions = dict()

# A question number is the text before the first underscore of a column name
# (e.g. "Q1_Part_1" -> "Q1"). dict.fromkeys removes duplicates while keeping
# the order in which the prefixes first appear.
column_prefixes = (column.split('_')[0] for column in df.columns)
keys = list(dict.fromkeys(column_prefixes))

In [7]:
# Question number of every column: the text before the first "_" - the same
# rule that was used to build `keys`. Computed once, outside the loop.
column_question = {col: col.split('_')[0] for col in df.columns}

for key in keys:
    if key in ['Q1', 'Q2', 'Q3']:
        # These three questions are stored as a single column (a Series).
        questions[key] = df[key]
    else:
        # Match on the exact question number rather than a string prefix:
        # a prefix test would let e.g. "Q1" also pick up "Q10_..." columns.
        questions[key] = df[[col for col in df.columns if column_question[col] == key]]

In [8]:
# Let's grab a separate dataframe for each gender:
# one entry per distinct Q2 value, holding the rows of `df` with that value.
genders = {gender: df[df.Q2 == gender] for gender in df.Q2.unique()}

In [11]:
# Share of respondents per gender (Q2). normalize=True returns relative
# frequencies directly, so the counts are computed only once.
df.Q2.value_counts(normalize=True)

Man                        0.788032
Woman                      0.193552
Prefer not to say          0.013126
Prefer to self-describe    0.002695
Nonbinary                  0.002595
Name: Q2, dtype: float64

In [None]:
# Men are by far the most common respondents in the survey (~79%).

In [13]:
# Dig deeper using only the respondents who answered "Man" or "Woman" in Q2.
df_mf = df[df.Q2.isin(['Man', 'Woman'])]

# Share of each of the two genders within that subset; normalize=True returns
# relative frequencies directly, so the counts are computed only once.
df_mf.Q2.value_counts(normalize=True)

Man      0.802817
Woman    0.197183
Name: Q2, dtype: float64

In [15]:
# Gender (Man / Woman) counts for each Q4 answer.
# NOTE(review): Q4 is taken to be the education-level question, consistent with
# the master's-degree observation made on the Q4 charts - confirm against the
# survey schema.
fig = px.histogram(
    df_mf,
    x='Q4',
    color='Q2',
    title='Respondents by education level and gender',
    labels={'Q4': 'Education level (Q4)', 'Q2': 'Gender (Q2)'},
)
fig.show()

In [17]:
# Same chart, normalized so the two genders can be compared despite their very
# different group sizes.
# histnorm='percent' reports each bar as a percentage of its own colour group
# (plotly normalizes per trace), which suits a categorical axis better than
# 'probability density'. barmode='group' places the bars side by side instead
# of stacking them, so the shares can be read off directly.
# NOTE(review): Q4 is taken to be the education-level question - confirm
# against the survey schema.
fig = px.histogram(
    df_mf,
    x='Q4',
    color='Q2',
    histnorm='percent',
    barmode='group',
    title='Education level distribution within each gender',
    labels={'Q4': 'Education level (Q4)', 'Q2': 'Gender (Q2)'},
)
fig.show()

In [None]:
# It seems that around 42% of women have a master's degree, slightly more than men (39%).

In [20]:
# Gender distribution by country (Q3), normalized within each gender.
# histnorm='percent' reports each bar as a percentage of its own colour group
# (plotly normalizes per trace), which suits a categorical axis better than
# 'probability density'. barmode='group' places the bars side by side instead
# of stacking them.
fig = px.histogram(
    df_mf,
    x='Q3',
    color='Q2',
    histnorm='percent',
    barmode='group',
    title='Country distribution within each gender',
    labels={'Q3': 'Country (Q3)', 'Q2': 'Gender (Q2)'},
)
# Order countries from the largest to the smallest total bar height.
fig.update_xaxes(categoryorder='total descending')
fig.show()