# The Gender Gap in Tech: What the Data Says

A Jupyter notebook that investigates the differences between women and men in the technology industry.

source: https://medium.com/better-programming/the-gender-gap-in-data-science-what-the-data-says-2a74892655f1

Goal: to visually understand gender inequality in the technology industry.

## Load in the data

In [None]:
 # imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
df = pd.read_csv('multiple_choice_responses.csv')

## Cleaning the data

In [1]:
# Keep only the questions this analysis uses and give them readable names.
COLUMN_NAMES = {
    'Q1': 'age',
    'Q2': 'gender',
    'Q3': 'country',
    'Q4': 'education',
    'Q5': 'job title',
    'Q10': 'salary',
}

# Skip the transformation when `df` already has the cleaned columns, so that
# re-running this cell neither fails on the missing Q* columns nor drops a
# further row. On a raw frame a missing Q* column still raises KeyError.
if list(df.columns) != list(COLUMN_NAMES.values()):
    df = (
        df[list(COLUMN_NAMES)]
        .rename(columns=COLUMN_NAMES)
        # discard the first row of the file
        # NOTE(review): presumably it holds the question text rather than a
        # response - confirm against the CSV
        .drop(df.index[0])
    )
df.head()

NameError: name 'df' is not defined

## Visualising gender distribution

In [None]:
# Share of respondents per 'gender' value, drawn as a pie chart.
gender = df['gender'].value_counts()
gender_pct = gender / gender.sum() * 100  # counts -> percentage of all answers
colors = ['mediumturquoise', 'darkorange', 'gold', 'lightgreen']

# Values and labels are both taken from `gender`, so every slice is paired
# with the category it was counted for.
trace = go.Pie(values=gender_pct.to_numpy(), labels=gender.index.to_numpy())
layout = go.Layout(title='Gender')
data = [trace]
fig = go.Figure(data=data, layout=layout)

# custom slice colours plus a thin black outline around each slice
fig.update_traces(marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.show()
# additionally hand the figure to plotly's offline exporter
py.plot(fig)

## Visualisation of age distribution

In [None]:
# Pie chart of how many respondents gave each distinct 'age' answer.
agedist = df['age'].value_counts()
colors = ['mediumturquoise', 'darkorange', 'gold', 'lightgreen']

trace = go.Pie(
    labels=agedist.index,
    values=agedist.to_numpy(),
)
layout = go.Layout(title='Age')
data = [trace]
fig = go.Figure(data=trace, layout=layout)

# apply the slice colours and a 2px black border to the pie trace
slice_border = {'color': '#000000', 'width': 2}
fig.update_traces(marker={'colors': colors, 'line': slice_border})

fig.show()
py.plot(fig)

## Age distribution of all respondents by gender

In [None]:
# Grouped bar chart: number of respondents for every (age, gender) pair.
age_gender = (
    df.groupby(["age", "gender"])
    .size()
    .reset_index(name="Count")
    .rename(columns={"age": "Age", "gender": "Gender"})
)

fig = px.bar(
    age_gender,
    x='Age',
    y='Count',
    color="Gender",
    barmode='group',
    title="Age Distribution by Gender",
    height=500,
    width=800,
)
# outline the bars and make them slightly transparent
fig.update_traces(
    marker=dict(line=dict(color='rgb(9,50,100)', width=1.5)),
    opacity=0.6,
)
fig.show()

## Education level

In [None]:
# Bar chart of respondent counts per 'education' answer, smallest count first.
education_counts = df["education"].value_counts()  # computed once, reused for both columns
edu_count = pd.DataFrame({
    'education': education_counts.index,
    'counts': education_counts.values,
}).sort_values("counts")

fig = px.bar(edu_count, x='education', y='counts')
fig.update_traces(marker_color="indianred", marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Education Level')

# Spell out the .html suffix: plotly's offline exporter otherwise warns and
# appends it itself, so the resulting file name is the same.
py.plot(fig, filename="edu level.html")
fig.show()

## Education level and gender

In [None]:
# Grouped bar chart: number of respondents for every (education, gender) pair.
# `px` (plotly.express) comes from the imports cell at the top of the notebook.
education_gender = df.loc[:, ["education", "gender"]].groupby(["education", "gender"]).size().reset_index()
education_gender.columns = ["Education", "Gender", "Count"]

fig = px.bar(education_gender, x='Education', y='Count', color="Gender",
             barmode='group', title="Education Distribution by Gender",
             height=500, width=800)
# outline the bars and make them slightly transparent
fig.update_traces(marker_line_color='rgb(9,50,100)',
                  marker_line_width=1.5, opacity=0.6)
fig.show()

## Job title distribution

In [None]:

# Bar chart of respondent counts per 'job title' answer, smallest count first.
job_title_counts = df["job title"].value_counts()  # computed once, reused for both columns
job_count = pd.DataFrame({
    'job title': job_title_counts.index,
    'counts': job_title_counts.values,
}).sort_values("counts")

fig = px.bar(job_count, x='job title', y='counts')
fig.update_traces(marker_color="indianred", marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Job Title')

# Spell out the .html suffix: plotly's offline exporter otherwise warns and
# appends it itself, so the resulting file name is the same.
py.plot(fig, filename="job.html")
fig.show()

## Salary Distribution

In [None]:

# Bar chart of respondent counts per 'salary' answer, smallest count first.
salary_counts = df["salary"].value_counts()
sal_count = pd.DataFrame(
    {'sal': salary_counts.index, 'counts': salary_counts.values}
).sort_values("counts")

fig = px.bar(sal_count, x='sal', y='counts')
# red bars with a dark-blue outline
fig.update_traces(
    marker=dict(
        color="indianred",
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
)
fig.update_layout(title_text='Salary')
py.plot(fig)
fig.show()

## Salary Gender Distribution

In [None]:
def _salary_floor(label):
    """Return the numeric lower bound read from a salary label.

    Only the text before the first '-' is considered and everything except
    its digits is ignored, so '1,000-1,999' yields 1000. A label without any
    digit in that part yields infinity and therefore sorts last.
    """
    digits = "".join(ch for ch in str(label).split("-")[0] if ch.isdigit())
    return int(digits) if digits else float("inf")


# Number of respondents for every (salary, gender) pair.
sal = df.loc[:, ["salary", "gender"]].groupby(["salary", "gender"]).size().reset_index()
sal.columns = ["Salary", "Gender", "Count"]

# groupby hands the salary labels back in sorted order; for string labels that
# is lexicographic, which would place e.g. '10,000-14,999' before '2,000-2,999'
# on the x-axis. Order the categories by their numeric lower bound instead.
# NOTE(review): assumes the labels are range strings such as '1,000-1,999' -
# confirm against the CSV.
salary_order = sorted(sal["Salary"].unique(), key=_salary_floor)

fig = px.bar(sal, x='Salary', y='Count', color="Gender",
             barmode='group', title="Salary Gender Distribution",
             height=400, width=2000,
             category_orders={"Salary": salary_order})
# outline the bars and make them slightly transparent
fig.update_traces(marker_line_color='rgb(9,50,100)',
                  marker_line_width=1.5, opacity=0.6)
fig.show()