In [61]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px

# Load the multiple-choice responses CSV (path is relative to the working directory).
# NOTE: a later cell reassigns `df` to a trimmed, renamed frame, so re-run this
# cell first whenever the cleaning cell needs to be re-executed.
df = pd.read_csv('multiple_choice_responses.csv')

In [62]:
# Cleaning Data

In [63]:
# Keep only the question columns used by the plots below and give them
# readable names. The mapping's key order is the resulting column order.
column_names = {
    'Q1': 'age',
    'Q2': 'gender',
    'Q3': 'country',
    'Q4': 'education',
    'Q5': 'job title',
    'Q10': 'salary',
}
df = df[list(column_names)].rename(columns=column_names)

# Drop the row carrying the frame's first index label
# (presumably the question-text row of the export -- TODO confirm).
df = df.drop(df.index[0])
df.head()

Unnamed: 0,age,gender,country,education,job title,salary
1,22-24,Male,France,Master’s degree,Software Engineer,"30,000-39,999"
2,40-44,Male,India,Professional degree,Software Engineer,"5,000-7,499"
3,55-59,Female,Germany,Professional degree,,
4,40-44,Male,Australia,Master’s degree,Other,"250,000-299,999"
5,22-24,Male,India,Bachelor’s degree,Other,"4,000-4,999"


In [64]:
# Visualizing gender distribution

In [65]:
# Gender distribution: one pie slice per distinct value of the `gender` column.
gender = df['gender'].value_counts()
colors = ['mediumturquoise', 'darkorange', 'gold', 'lightgreen']

# Express each count as a percentage of all counted rows.
gender_pct = np.array(gender / gender.sum() * 100)

trace = go.Pie(labels=gender.index, values=gender_pct)
layout = go.Layout(title='Gender')
data = [trace]
fig = go.Figure(data=data, layout=layout)

# Custom slice colours plus a thin black outline around every slice.
fig.update_traces(
    marker={'colors': colors, 'line': {'color': '#000000', 'width': 2}},
)
fig.show()
# Also export the chart to HTML; the returned file name becomes the cell output.
py.plot(fig, filename="gender-distribution.html")

'gender-distribution.html'

In [66]:
# Age distribution: one pie slice per distinct value of the `age` column,
# sized by the number of rows carrying that value.
agedist = df['age'].value_counts()
colors = ['mediumturquoise', 'darkorange', 'gold', 'lightgreen']

trace = go.Pie(labels=agedist.index, values=np.array(agedist))
layout = go.Layout(title='Age')
data = [trace]
fig = go.Figure(data=data, layout=layout)

# Custom slice colours plus a thin black outline around every slice.
fig.update_traces(
    marker_colors=colors,
    marker_line_color='#000000',
    marker_line_width=2,
)
fig.show()
# Also export the chart to HTML; the returned file name becomes the cell output.
py.plot(fig, filename="age-distribution.html")

'age-distribution.html'

In [67]:
# Age by gender: count the rows for every (age, gender) pair and draw the
# counts as grouped bars, one colour per gender.
age_gender = (
    df[["age", "gender"]]
    .groupby(["age", "gender"])
    .size()
    .reset_index(name="Count")
    .rename(columns={"age": "Age", "gender": "Gender"})
)

fig = px.bar(
    age_gender,
    x='Age',
    y='Count',
    color="Gender",
    barmode='group',
    title="Age Distribution by Gender",
    height=500,
    width=800,
)
# Dark outline and partial transparency on every bar.
fig.update_traces(
    marker_line_color='rgb(9,50,100)',
    marker_line_width=1.5,
    opacity=0.6,
)
py.plot(fig, filename="age-gender.html")
fig.show()

In [68]:
# Education level: number of rows per `education` value, sorted ascending by count.
edu_freq = df["education"].value_counts()
edu_count = pd.DataFrame(
    {'education': edu_freq.index, 'counts': edu_freq.values}
).sort_values("counts")

fig = px.bar(edu_count, x='education', y='counts')
# The update methods return the figure, so styling and title are chained.
fig.update_traces(
    marker_color="indianred",
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
).update_layout(title_text='Education Level')
py.plot(fig, filename="education-level.html")
fig.show()

In [69]:
# Education by gender: count the rows for every (education, gender) pair and
# draw the counts as grouped bars, one colour per gender.
# `px` (plotly.express) comes from the import cell at the top of the notebook;
# it is deliberately not re-imported here so all dependencies stay in one place.
education_gender = (
    df[["education", "gender"]]
    .groupby(["education", "gender"])
    .size()
    .reset_index(name="Count")  # name the size column directly instead of 0
    .rename(columns={"education": "Education", "gender": "Gender"})
)

fig = px.bar(
    education_gender,
    x='Education',
    y='Count',
    color="Gender",
    barmode='group',
    title="Education Distribution by Gender",
    height=500,
    width=800,
)
# Dark outline and partial transparency on every bar.
fig.update_traces(
    marker_line_color='rgb(9,50,100)',
    marker_line_width=1.5,
    opacity=0.6,
)
fig.show()
# Also export the chart to HTML; the returned file name becomes the cell output.
py.plot(fig, filename="education-gender.html")

'education-gender.html'

In [70]:
# Job title distribution: number of rows per `job title` value, sorted
# ascending by count.
job_freq = df["job title"].value_counts()
job_count = pd.DataFrame(
    {'job title': job_freq.index, 'counts': job_freq.values}
).sort_values("counts")

fig = px.bar(job_count, x='job title', y='counts')
# The update methods return the figure, so styling and title are chained.
fig.update_traces(
    marker_color="indianred",
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
).update_layout(title_text='Job Title Distribution')
fig.show()
# Also export the chart to HTML; the returned file name becomes the cell output.
py.plot(fig, filename="job-distribution.html")

'job-distribution.html'

In [71]:
# Salary distribution: number of rows per `salary` value, sorted ascending
# by count (not by salary amount -- the values are kept as loaded).
sal_freq = df["salary"].value_counts()
sal_count = pd.DataFrame(
    {'sal': sal_freq.index, 'counts': sal_freq.values}
).sort_values("counts")

fig = px.bar(sal_count, x='sal', y='counts')
# The update methods return the figure, so styling and title are chained.
fig.update_traces(
    marker_color="indianred",
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
).update_layout(title_text='Salary')
py.plot(fig, filename="salary-distribution.html")
fig.show()

In [72]:
# Salary by gender: count the rows for every (salary, gender) pair and draw
# the counts as grouped bars, one colour per gender.
sal = (
    df[["salary", "gender"]]
    .groupby(["salary", "gender"])
    .size()
    .reset_index(name="Count")
    .rename(columns={"salary": "Salary", "gender": "Gender"})
)

# Much wider than the other charts (2000 px vs 800 px).
fig = px.bar(
    sal,
    x='Salary',
    y='Count',
    color="Gender",
    barmode='group',
    title="Salary Gender Distribution",
    height=400,
    width=2000,
)
# Dark outline and partial transparency on every bar.
fig.update_traces(
    marker_line_color='rgb(9,50,100)',
    marker_line_width=1.5,
    opacity=0.6,
)
fig.show()
# Also export the chart to HTML; the returned file name becomes the cell output.
py.plot(fig, filename="salary-gender.html")

'salary-gender.html'