In [1]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd


In [2]:
# Read survey results file
survey_df=pd.read_csv('survey-results-public.csv',low_memory=False)
#read the schema
schema_df=pd.read_csv('survey-results-schema.csv')


In [3]:
# View data
survey_df.head()

Unnamed: 0,Respondent,Professional,ProgramHobby,Country,University,EmploymentStatus,FormalEducation,MajorUndergrad,HomeRemote,CompanySize,...,StackOverflowMakeMoney,Gender,HighestEducationParents,Race,SurveyLong,QuestionsInteresting,QuestionsConfusing,InterestedAnswers,Salary,ExpectedSalary
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,
1,2,Student,"Yes, both",United Kingdom,"Yes, full-time",Employed part-time,Some college/university study without earning ...,Computer science or software engineering,"More than half, but not all, the time",20 to 99 employees,...,Strongly disagree,Male,A master's degree,White or of European descent,Somewhat agree,Somewhat agree,Disagree,Strongly agree,,37500.0
2,3,Professional developer,"Yes, both",United Kingdom,No,Employed full-time,Bachelor's degree,Computer science or software engineering,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A professional degree,White or of European descent,Somewhat agree,Agree,Disagree,Agree,113750.0,
3,4,Professional non-developer who sometimes write...,"Yes, both",United States,No,Employed full-time,Doctoral degree,A non-computer-focused engineering discipline,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A doctoral degree,White or of European descent,Agree,Agree,Somewhat agree,Strongly agree,,
4,5,Professional developer,"Yes, I program as a hobby",Switzerland,No,Employed full-time,Master's degree,Computer science or software engineering,Never,10 to 19 employees,...,,,,,,,,,,


In [4]:
# view number of records
survey_df.shape[0]

51392

In [5]:
# List the survey columns
list(survey_df.columns)

['Respondent',
 'Professional',
 'ProgramHobby',
 'Country',
 'University',
 'EmploymentStatus',
 'FormalEducation',
 'MajorUndergrad',
 'HomeRemote',
 'CompanySize',
 'CompanyType',
 'YearsProgram',
 'YearsCodedJob',
 'YearsCodedJobPast',
 'DeveloperType',
 'WebDeveloperType',
 'MobileDeveloperType',
 'NonDeveloperType',
 'CareerSatisfaction',
 'JobSatisfaction',
 'ExCoderReturn',
 'ExCoderNotForMe',
 'ExCoderBalance',
 'ExCoder10Years',
 'ExCoderBelonged',
 'ExCoderSkills',
 'ExCoderWillNotCode',
 'ExCoderActive',
 'PronounceGIF',
 'ProblemSolving',
 'BuildingThings',
 'LearningNewTech',
 'BoringDetails',
 'JobSecurity',
 'DiversityImportant',
 'AnnoyingUI',
 'FriendsDevelopers',
 'RightWrongWay',
 'UnderstandComputers',
 'SeriousWork',
 'InvestTimeTools',
 'WorkPayCare',
 'KinshipDevelopers',
 'ChallengeMyself',
 'CompetePeers',
 'ChangeWorld',
 'JobSeekingStatus',
 'HoursPerWeek',
 'LastNewJob',
 'AssessJobIndustry',
 'AssessJobRole',
 'AssessJobExp',
 'AssessJobDept',
 'AssessJobT

In [6]:
# View the schema df
schema_df

Unnamed: 0,Column,Question
0,Respondent,Respondent ID number
1,Professional,Which of the following best describes you?
2,ProgramHobby,Do you program as a hobby or contribute to ope...
3,Country,In which country do you currently live?
4,University,"Are you currently enrolled in a formal, degree..."
...,...,...
149,QuestionsInteresting,The questions were interesting
150,QuestionsConfusing,The questions were confusing
151,InterestedAnswers,I'm interested in learning how other developer...
152,Salary,"What is your current annual base salary, befor..."


In [7]:
# Deterine max Question length
schema_df.Question.map(lambda x: len(x)).max()

1065

In [8]:
# Create a function to look at survey questions by column
def srvy_question(col):
    #set max column width
    pd.options.display.max_colwidth = 1065
    # print question for Column
    print(schema_df[schema_df['Column'] == col].Question)


### I am going to interate through survey questions and answers until I find three interesting questions.

In [9]:
# view question
srvy_question('Professional')

1    Which of the following best describes you?
Name: Question, dtype: object


In [10]:
# View responses
survey_df['Professional'].value_counts()

Professional developer                                  36131
Student                                                  8224
Professional non-developer who sometimes writes code     5140
Used to be a professional developer                       983
None of these                                             914
Name: Professional, dtype: int64

### What are the most common languages for non-developer types and how does that compare to languages used by professional developers?

In [12]:
# See what language data looks like
survey_df.HaveWorkedLanguage.value_counts()

C#; JavaScript; SQL                                                                                 1276
JavaScript; PHP; SQL                                                                                1143
Java                                                                                                 913
JavaScript                                                                                           807
JavaScript; PHP                                                                                      662
                                                                                                    ... 
JavaScript; Matlab; Python; Ruby; SQL                                                                  1
C++; Erlang; Python; SQL                                                                               1
Assembly; C; C++; C#; R                                                                                1
Java; JavaScript; Python; R; SQL; VBA                  

In [13]:
# create function to flatten list of lists
def flat_map(x):
    flat_list = []
    for sublist in x:
        for item in sublist:
            flat_list.append(item)
    return(flat_list)


In [15]:
# Create df of languages for professional developers
prof_lang_df=pd.DataFrame(flat_map(survey_df[pd.notnull(survey_df['HaveWorkedLanguage'])].query\
                                  ('Professional in ("Professional developer", "Used to be a professional \
                                  developer")')['HaveWorkedLanguage'].str.split(';')), columns=['Language'])
# View top5 programming languages
prof_lang_df.Language.str.lstrip().value_counts().head(5)

JavaScript    18421
SQL           14836
Java          10586
C#            10129
Python         7613
Name: Language, dtype: int64

In [16]:
# Create df of languages for non-professional developers
non_prof_lang_df=pd.DataFrame(flat_map(survey_df[pd.notnull(survey_df['HaveWorkedLanguage'])].query\
                                  ('Professional not in ("Professional developer", "Used to be a professional \
                                  developer")')['HaveWorkedLanguage'].str.split(';')), columns=['Language'])
# View top5 programming languages
non_prof_lang_df.Language.str.lstrip().value_counts().head(5)

JavaScript    4454
Python        4091
Java          3938
SQL           3918
C++           2829
Name: Language, dtype: int64

*    Javascript is most commonly used among professional and non-professional developers.  SQL is number two among professional developers, but number four for non-professional developers.

In [17]:
# View question
srvy_question('HomeRemote')

8    How often do you work from home or remotely?
Name: Question, dtype: object


In [18]:
# view responses
survey_df['HomeRemote'].value_counts()

A few days each month                                      15454
Never                                                      13975
All or almost all the time (I'm full-time remote)           4905
Less than half the time, but at least one day each week     4147
More than half, but not all, the time                       1909
It's complicated                                            1849
About half the time                                         1769
Name: HomeRemote, dtype: int64

### Is there a relationship between job satisfaction and working from home?  Does this vary by gender?

In [19]:
# Get mean satisfaction for each Gender grouped by HomeRemote
male_satisfaction=survey_df.query('Gender == "Male"').groupby('HomeRemote').agg\
    ({'CareerSatisfaction':'mean'}).sort_values(['CareerSatisfaction'], ascending=False)
female_satisfaction=survey_df.query('Gender == "Female"').groupby('HomeRemote').agg\
    ({'CareerSatisfaction':'mean'}).sort_values(['CareerSatisfaction'], ascending=False)

#### Male Satisfaction

In [20]:
#Print career statifaction average rating
male_satisfaction.CareerSatisfaction.mean()

7.50651590473568

In [21]:
#Print Male statifaction
male_satisfaction

Unnamed: 0_level_0,CareerSatisfaction
HomeRemote,Unnamed: 1_level_1
All or almost all the time (I'm full-time remote),7.748168
"Less than half the time, but at least one day each week",7.677432
About half the time,7.614689
A few days each month,7.537368
"More than half, but not all, the time",7.491018
It's complicated,7.311934
Never,7.165002


#### Female Satisfaction

In [22]:
#Print career statifaction average rating
female_satisfaction.CareerSatisfaction.mean()

7.272373948402435

In [23]:
#Print Female statifaction
female_satisfaction

Unnamed: 0_level_0,CareerSatisfaction
HomeRemote,Unnamed: 1_level_1
"Less than half the time, but at least one day each week",7.661765
It's complicated,7.522727
A few days each month,7.357583
About half the time,7.2
All or almost all the time (I'm full-time remote),7.085106
Never,7.04769
"More than half, but not all, the time",7.031746


#### Differences between Male and Female are interesting
* All or almost all the time (I'm full-time remote) showed higest career satisfaction among males.
* All or almost all the time (I'm full-time remote) was on the bottom half of career satisfaction among females

In [24]:
# view question
srvy_question('ImportantBenefits')

64    When it comes to compensation and benefits, other than base salary, which of the following are most important to you?
Name: Question, dtype: object


In [25]:
# view responses
survey_df['ImportantBenefits'].value_counts()

None of these                                                                                                    653
Retirement; Vacation/days off; Health benefits; Expected work hours; Remote options                              363
Vacation/days off; Equipment; Professional development sponsorship; Expected work hours; Remote options          324
Vacation/days off; Health benefits; Equipment; Expected work hours; Remote options                               305
Vacation/days off; Health benefits; Professional development sponsorship; Expected work hours; Remote options    249
                                                                                                                ... 
Equipment; Private office; Remote options; Other                                                                   1
Retirement; Private office; Expected work hours; Remote options                                                    1
Stock options; Long-term leave; Remote options                  

### What are the most important benefits and do they vary by gender?

In [26]:
# Get counts by gender
survey_df.query('Gender in ("Male","Female")').Gender.value_counts()

Male      31589
Female     2600
Name: Gender, dtype: int64

In [27]:
# create df with important benefits selected by Men
male_imp_df=pd.DataFrame(flat_map(survey_df[pd.notnull(survey_df['ImportantBenefits'])].query\
                            ('Gender == "Male"')['ImportantBenefits'].str.split(';')), columns=['Benefits'])

In [28]:
# view top 3 benefits selected by men
male_imp_df.Benefits.str.lstrip().value_counts().head(3)

Vacation/days off    13495
Remote options       12688
Health benefits      10885
Name: Benefits, dtype: int64

In [29]:
# create df with important benefits selected by women
female_imp_df=pd.DataFrame(flat_map(survey_df[pd.notnull(survey_df['ImportantBenefits'])].query\
                            ('Gender == "Female"')['ImportantBenefits'].str.split(';')), columns=['Benefits'])

In [30]:
#view top 3 benefits selected by women
female_imp_df.Benefits.str.lstrip().value_counts().head(3)

Vacation/days off    1330
Remote options       1121
Health benefits      1098
Name: Benefits, dtype: int64

* No difference between which benefits important to men compared to what is important to women.