Import the necessary modules and load the 2019 Stack Overflow survey CSV files.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from util_functions import *
%matplotlib inline

# Load the 2019 survey results and the accompanying schema file from the local ./2019 folder.
# Both paths are relative, so the notebook must be run from the directory that contains ./2019.
df = pd.read_csv('./2019/survey_results_public.csv')
schema = pd.read_csv('./2019/survey_results_schema.csv')

Give a quick view of the survey results:

In [None]:
# Show the first five rows of the survey results.
df.head(n=5)

Give a quick view of the survey schema:

In [None]:
# Show the first five rows of the survey schema.
schema.head(n=5)

Let's take a look at the dimensions (rows, columns) of this data set to get a feel for what we are looking at:

In [None]:
# (number of rows, number of columns) of the survey results.
df.shape

Let's see if we have any columns that do NOT have missing data (NaN values).  This will be a much smaller list than the list of columns that do have missing values:

In [None]:
# Labels of the columns whose fraction of missing values is exactly zero, collected into a set.
no_nulls = set(df.columns[df.isna().mean().eq(0)])
no_nulls

In [None]:
# Labels of the columns where more than 25% of the values are missing.
# The result is a pandas Index (column order preserved), not a Python set.
most_missing_cols = df.columns[df.isna().mean().gt(0.25)]
most_missing_cols

Let's have a look at the DevType question, which we will drill into to find respondents who work in Data Science:

In [None]:
# get_description comes from util_functions; presumably it looks up the question text
# for the 'DevType' column in the schema table -- confirm against that module.
get_description('DevType', schema)

Let's take a look at a breakdown of the number of people who classify themselves as doing particular jobs.  Keep in mind that a respondent can pick multiple roles when answering this question.

In [None]:
# DevType breakdown for ALL respondents: at this point df has not been filtered yet.
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='DevType', delim=';')
# The title used to be 'Other Roles', which only fits the later DS/ML-only plot where the
# DS/ML role is removed from the list; here every role is shown.
props_df = clean_and_plot(df, list_values, col='DevType', title='Developer Roles')

Let's focus on the rows whose DevType includes "Data scientist or machine learning specialist", i.e. respondents who consider data science/ML their job or part of their job.  This will leave us with 6460 rows to work with. Rows with a null DevType are removed before this filter is applied.

In [None]:
# Rows with no DevType answer cannot be assigned to either cohort, so drop them first.
df = df[df.DevType.notnull()]

# Match the role label as a literal substring (regex=False) and compute the mask once,
# instead of evaluating str.contains twice and comparing the result with `== False`.
is_ds_ml = df['DevType'].str.contains(
    'Data scientist or machine learning specialist', regex=False
)

# df_origin: respondents who did NOT select the DS/ML role; df: respondents who did.
# NOTE: this cell rebinds df, so re-running it without reloading the CSV leaves
# df_origin empty (df would already contain only DS/ML rows).
df_origin = df[~is_ds_ml]
df = df[is_ds_ml]
print(df['DevType'].shape, df_origin.shape)

In [None]:
We want to see what the education levels are for those in the Data Science/ML field(s):

In [None]:
# EdLevel breakdown for df (respondents whose DevType includes the DS/ML role).
# Counts are divided by the total row count, so rows with no EdLevel answer still
# count toward the denominator.
chart_values = df["EdLevel"].value_counts()
chart_values.div(len(df)).plot.bar()
plt.title("DS/ML Education Levels");

In [None]:
# EdLevel breakdown for df_origin (respondents whose DevType does not include the DS/ML role).
# Counts are divided by the total row count, so rows with no EdLevel answer still
# count toward the denominator.
chart_values = df_origin["EdLevel"].value_counts()
chart_values.div(len(df_origin)).plot.bar()
plt.title("Other Education Levels");

In [None]:
Let's also have a look at what the majors were for those in the Data Science/ML field(s):

In [None]:
# UndergradMajor breakdown for df (respondents whose DevType includes the DS/ML role).
# Each count is divided by the total number of rows, including rows with no answer.
chart_values = df["UndergradMajor"].value_counts()
chart_values.div(len(df)).plot.bar()
plt.title("Undergraduate Major");

In [None]:
# UndergradMajor breakdown for df_origin (respondents whose DevType does not include
# the DS/ML role). Each count is divided by the total number of rows, including rows
# with no answer.
chart_values = df_origin["UndergradMajor"].value_counts()
chart_values.div(len(df_origin)).plot.bar()
plt.title("Undergraduate Major");

In [None]:
Out of curiosity, let's take a look at the average compensation (converted to USD) and average hours worked per week:

In [None]:
# Mean ConvertedComp and mean WorkWeekHrs for df (respondents whose DevType includes
# the DS/ML role). Series.mean() skips missing values by default.
comp = df["ConvertedComp"].mean()
hours = df["WorkWeekHrs"].mean()

# Compensation is printed with thousands separators and two decimals; hours to one decimal.
print("Compensation: ${:,.2f}\nHours: {}".format(comp, round(hours, 1)))

In [None]:
# Mean ConvertedComp and mean WorkWeekHrs for df_origin (respondents whose DevType does
# not include the DS/ML role). Series.mean() skips missing values by default.
comp = df_origin["ConvertedComp"].mean()
hours = df_origin["WorkWeekHrs"].mean()

# Compensation is printed with thousands separators and two decimals; hours to one decimal.
print("Compensation: ${:,.2f}\nHours: {}".format(comp, round(hours, 1)))

Let's take a look at how much of their time people in the DS/ML fields spend working remotely:

In [None]:
# WorkRemote breakdown for df (respondents whose DevType includes the DS/ML role).
# Each count is divided by the total number of rows, including rows with no answer.
chart_values = df["WorkRemote"].value_counts()
chart_values.div(len(df)).plot.bar()
plt.title("Remote Work Hours");

In [None]:
# WorkRemote breakdown for df_origin (respondents whose DevType does not include the
# DS/ML role). Each count is divided by the total number of rows, including rows with
# no answer.
chart_values = df_origin["WorkRemote"].value_counts()
chart_values.div(len(df_origin)).plot.bar()
plt.title("Remote Work Hours");

We are curious about where people work, to see if it lines up with how often they report working remotely.

In [None]:
# WorkLoc breakdown for df (respondents whose DevType includes the DS/ML role).
# Each count is divided by the total number of rows, including rows with no answer.
chart_values = df["WorkLoc"].value_counts()
chart_values.div(len(df)).plot.bar()
plt.title("Work location");

In [None]:
# WorkLoc breakdown for df_origin (respondents whose DevType does not include the
# DS/ML role). Each count is divided by the total number of rows, including rows with
# no answer.
chart_values = df_origin["WorkLoc"].value_counts()
chart_values.div(len(df_origin)).plot.bar()
plt.title("Work location");

In [None]:
# Age distribution for df (respondents whose DevType includes the DS/ML role), split
# into 10 equal-width bins and expressed as a share of all rows (missing ages included
# in the denominator).
# NOTE(review): value_counts orders the bins by frequency, so the bars are not in
# ascending age order.
chart_values = df["Age"].value_counts(bins=10)
chart_values.div(len(df)).plot.bar()
plt.title("Age Groups");

In [None]:
# Age distribution for df_origin (respondents whose DevType does not include the DS/ML
# role), split into 10 equal-width bins and expressed as a share of all rows (missing
# ages included in the denominator).
# NOTE(review): value_counts orders the bins by frequency, so the bars are not in
# ascending age order.
chart_values = df_origin["Age"].value_counts(bins=10)
chart_values.div(len(df_origin)).plot.bar()
plt.title("Age Groups");

In [None]:
# Average age of the respondents in the DS/ML field(s); missing ages are skipped.
df["Age"].mean()

In [None]:
df_origin.Age.mean() # What is the average age for people outside the DS/ML field(s) (df_origin)

In [None]:
# Gender breakdown for df (respondents whose DevType includes the DS/ML role).
# Each count is divided by the total number of rows, including rows with no answer.
chart_values = df["Gender"].value_counts()
chart_values.div(len(df)).plot.bar()
plt.title("Gender Groups");

In [None]:
# Gender breakdown for df_origin (respondents whose DevType does not include the
# DS/ML role). Each count is divided by the total number of rows, including rows with
# no answer.
chart_values = df_origin["Gender"].value_counts()
chart_values.div(len(df_origin)).plot.bar()
plt.title("Gender Groups");

In [None]:
# LanguageWorkedWith breakdown for df (respondents whose DevType includes the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='LanguageWorkedWith', delim=';')
props_df = clean_and_plot(df, list_values, col='LanguageWorkedWith', title='Most Common Current Languages Used')

In [None]:
# LanguageWorkedWith breakdown for df_origin (respondents whose DevType does not include
# the DS/ML role). count_lists and clean_and_plot are helpers pulled in from util_functions;
# presumably they list the distinct ';'-delimited answers and plot their proportions --
# confirm there.
list_values = count_lists(df_origin, col='LanguageWorkedWith', delim=';')
props_df = clean_and_plot(df_origin, list_values, col='LanguageWorkedWith', title='Most Common Current Languages Used')

In [None]:
# LanguageDesireNextYear breakdown for df (respondents whose DevType includes the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='LanguageDesireNextYear', delim=';')
props_df = clean_and_plot(df, list_values, col='LanguageDesireNextYear', title='Most Desired Languages to Learn in 2020')

In [None]:
# LanguageDesireNextYear breakdown for df_origin (respondents whose DevType does not include
# the DS/ML role). count_lists and clean_and_plot are helpers pulled in from util_functions;
# presumably they list the distinct ';'-delimited answers and plot their proportions --
# confirm there.
list_values = count_lists(df_origin, col='LanguageDesireNextYear', delim=';')
props_df = clean_and_plot(df_origin, list_values, col='LanguageDesireNextYear', title='Most Desired Languages to Learn in 2020')

In [None]:
# DatabaseWorkedWith breakdown for df (respondents whose DevType includes the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='DatabaseWorkedWith', delim=';')
props_df = clean_and_plot(df, list_values, col='DatabaseWorkedWith', title='Most Common Current DBs Used')

In [None]:
# DatabaseWorkedWith breakdown for df_origin (respondents whose DevType does not include
# the DS/ML role). count_lists and clean_and_plot are helpers pulled in from util_functions;
# presumably they list the distinct ';'-delimited answers and plot their proportions --
# confirm there.
list_values = count_lists(df_origin, col='DatabaseWorkedWith', delim=';')
props_df = clean_and_plot(df_origin, list_values, col='DatabaseWorkedWith', title='Most Common Current DBs Used')

In [None]:
# DatabaseDesireNextYear breakdown for df (respondents whose DevType includes the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='DatabaseDesireNextYear', delim=';')
# The title previously repeated the DatabaseWorkedWith title ('Most Common Current DBs Used');
# this plot shows the databases respondents want to work with next year.
props_df = clean_and_plot(df, list_values, col='DatabaseDesireNextYear', title='Most Desired DBs to Learn in 2020')

In [None]:
# DatabaseDesireNextYear breakdown for df_origin (respondents whose DevType does not include
# the DS/ML role). count_lists and clean_and_plot are helpers pulled in from util_functions;
# presumably they list the distinct ';'-delimited answers and plot their proportions --
# confirm there.
list_values = count_lists(df_origin, col='DatabaseDesireNextYear', delim=';')
# The title previously repeated the DatabaseWorkedWith title ('Most Common Current DBs Used');
# this plot shows the databases respondents want to work with next year.
props_df = clean_and_plot(df_origin, list_values, col='DatabaseDesireNextYear', title='Most Desired DBs to Learn in 2020')

In [None]:
# OpSys breakdown for df (respondents whose DevType includes the DS/ML role).
# Each count is divided by the total number of rows, including rows with no answer.
chart_values = df["OpSys"].value_counts()
chart_values.div(len(df)).plot.bar()
plt.title("Primary Operating System Used");

In [None]:
# OpSys breakdown for df_origin (respondents whose DevType does not include the
# DS/ML role). Each count is divided by the total number of rows, including rows with
# no answer.
chart_values = df_origin["OpSys"].value_counts()
chart_values.div(len(df_origin)).plot.bar()
plt.title("Primary Operating System Used");

In [None]:
# DevEnviron breakdown for df (respondents whose DevType includes the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='DevEnviron', delim=';')
props_df = clean_and_plot(df, list_values, col='DevEnviron', title='Dev Environments Used')

In [None]:
# DevEnviron breakdown for df_origin (respondents whose DevType does not include the
# DS/ML role). count_lists and clean_and_plot are helpers pulled in from util_functions;
# presumably they list the distinct ';'-delimited answers and plot their proportions --
# confirm there.
list_values = count_lists(df_origin, col='DevEnviron', delim=';')
props_df = clean_and_plot(df_origin, list_values, col='DevEnviron', title='Dev Environments Used')

In [None]:
# PlatformWorkedWith breakdown for df (respondents whose DevType includes the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='PlatformWorkedWith', delim=';')
# The title previously repeated the DevEnviron title ('Dev Environments Used'); this plot
# is built from the PlatformWorkedWith column.
props_df = clean_and_plot(df, list_values, col='PlatformWorkedWith', title='Most Common Current Platforms Used')

In [None]:
# PlatformWorkedWith breakdown for df_origin (respondents whose DevType does not include
# the DS/ML role). count_lists and clean_and_plot are helpers pulled in from util_functions;
# presumably they list the distinct ';'-delimited answers and plot their proportions --
# confirm there.
list_values = count_lists(df_origin, col='PlatformWorkedWith', delim=';')
# The title previously repeated the DevEnviron title ('Dev Environments Used'); this plot
# is built from the PlatformWorkedWith column.
props_df = clean_and_plot(df_origin, list_values, col='PlatformWorkedWith', title='Most Common Current Platforms Used')

In [None]:
# ImpSyn breakdown for df (respondents whose DevType includes the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct answers and plot their proportions -- confirm there.
# NOTE(review): delim=';' is passed as for the multi-select columns; verify that ImpSyn
# answers actually need splitting.
list_values = count_lists(df, col='ImpSyn', delim=';')
props_df = clean_and_plot(df, list_values, col='ImpSyn', title='Self Rated Level of Competence')

In [None]:
# ImpSyn breakdown for df_origin (respondents whose DevType does not include the DS/ML role).
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct answers and plot their proportions -- confirm there.
# NOTE(review): delim=';' is passed as for the multi-select columns; verify that ImpSyn
# answers actually need splitting.
list_values = count_lists(df_origin, col='ImpSyn', delim=';')
props_df = clean_and_plot(df_origin, list_values, col='ImpSyn', title='Self Rated Level of Competence')

In [None]:
# Roles that the DS/ML respondents (df) selected in addition to the DS/ML role.
# count_lists and clean_and_plot are helpers pulled in from util_functions; presumably they
# list the distinct ';'-delimited answers and plot their proportions -- confirm there.
list_values = count_lists(df, col='DevType', delim=';')
# Every row in df includes the DS/ML role, so drop it to show only the other roles.
# NOTE(review): .remove() assumes count_lists returns a list containing this exact label;
# a list raises ValueError if the label is absent -- confirm.
list_values.remove('Data scientist or machine learning specialist')
props_df = clean_and_plot(df, list_values, col='DevType', title='Other Roles')