In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Global seaborn appearance used by every figure in this notebook.
sns.set(
    context='notebook',
    style='whitegrid',
    font_scale=1,
)

In [4]:
# Load the survey export, indexed by respondent id. The raw frame keeps its own
# name so the filtered analysis frame below never overwrites it.
data_folder = 'data'
file_path = os.path.join(data_folder, '2020_data.zip')
survey_raw = pd.read_csv(file_path, index_col='Respondent')

# Respondents whose MainBranch answer says they write code as part of their job.
professional_coder_options = ['I am a developer by profession',
                              'I am not primarily a developer, but I write code sometimes as part of my work']
professional_coders = survey_raw['MainBranch'].isin(professional_coder_options)

# Respondents whose Employment answer is one of the working statuses.
employed_options = ['Independent contractor, freelancer, or self-employed',
                    'Employed full-time', 'Employed part-time']
employed = survey_raw['Employment'].isin(employed_options)

# Both conditions must hold; the masks are aligned on the raw frame's index.
employed_coders = employed & professional_coders

print(f'Percentual selecionado: {employed_coders.mean():.2%}')

# Build the analysis frame in one non-mutating chain: keep the selected
# respondents, then drop rows without a ConvertedComp value. This avoids calling
# dropna(inplace=True) on a filtered slice of another frame.
df = survey_raw.loc[employed_coders].dropna(subset=['ConvertedComp'])
df.head()

Percentual selecionado: 77.54%


Unnamed: 0_level_0,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,I am a developer by profession,Yes,36.0,12,Yearly,116000.0,116000.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;React.js;Vue.js,Flask,Just as welcome now as I felt last year,39.0,17,13
10,I am a developer by profession,Yes,22.0,14,Yearly,25000.0,32315.0,United Kingdom,Pound sterling,GBP,...,Easy,Appropriate in length,No,Mathematics or statistics,Flask;jQuery,Flask;jQuery,Somewhat more welcome now than last year,36.0,8,4
11,I am a developer by profession,Yes,23.0,13,Yearly,31000.0,40070.0,United Kingdom,Pound sterling,GBP,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Angular;Django;React.js,Angular;Angular.js;Django;React.js,Just as welcome now as I felt last year,40.0,10,2
12,I am a developer by profession,No,49.0,42,Monthly,1100.0,14268.0,Spain,European Euro,EUR,...,Easy,Appropriate in length,No,Mathematics or statistics,ASP.NET;jQuery,ASP.NET;jQuery,Just as welcome now as I felt last year,40.0,7,7
13,"I am not primarily a developer, but I write co...",Yes,53.0,14,Monthly,3000.0,38916.0,Netherlands,European Euro,EUR,...,Neither easy nor difficult,Too long,No,,,,A lot less welcome now than last year,36.0,35,20


In [9]:
# Percentage of missing answers per column, rounded to two decimals and sorted
# from the least to the most incomplete column.
(
    (df.isna().mean() * 100)
    .round(2)
    .sort_values()
    .to_frame(name='Missing Data (%)')
)

Unnamed: 0,Missing Data (%)
MainBranch,0.0
Hobbyist,0.0
CompFreq,0.0
CompTotal,0.0
ConvertedComp,0.0
Country,0.0
CurrencyDesc,0.0
CurrencySymbol,0.0
Employment,0.0
JobSat,0.15


In [16]:
# Per-column overview of the filtered frame: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34717 entries, 8 to 65631
Data columns (total 60 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   MainBranch                    34717 non-null  object 
 1   Hobbyist                      34717 non-null  object 
 2   Age                           30563 non-null  float64
 3   Age1stCode                    34633 non-null  object 
 4   CompFreq                      34717 non-null  object 
 5   CompTotal                     34717 non-null  float64
 6   ConvertedComp                 34717 non-null  float64
 7   Country                       34717 non-null  object 
 8   CurrencyDesc                  34717 non-null  object 
 9   CurrencySymbol                34717 non-null  object 
 10  DatabaseDesireNextYear        26472 non-null  object 
 11  DatabaseWorkedWith            29722 non-null  object 
 12  DevType                       34367 non-null  object 
 13  E

In [29]:
# Thematic grouping of the survey columns. Every column name appears in exactly
# one group; the groups together list 60 names.

# Questions about the Stack Overflow site and community.
stack_questions = [
    'NEWOffTopic', 'NEWOtherComms', 'NEWSOSites', 'SOAccount',
    'SOComm', 'SOPartFreq', 'SOVisitFreq', 'WelcomeChange',
]

# Questions about the survey itself.
survey_questions = [
    'SurveyEase', 'SurveyLength',
]

# Demographic questions.
demo_questions = [
    'Age', 'Country', 'Ethnicity', 'Gender',
    'Sexuality', 'CurrencyDesc', 'CurrencySymbol', 'Trans',
]

# Job and compensation questions.
work_questions = [
    'MainBranch', 'CompFreq', 'CompTotal', 'ConvertedComp',
    'Employment', 'JobSat', 'JobFactors', 'JobSeek',
    'NEWDevOps', 'NEWDevOpsImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
    'NEWOnboardGood', 'NEWOvertime', 'PurchaseWhat', 'WorkWeekHrs',
]

# Education, experience and technology questions.
competences_questions = [
    'DatabaseDesireNextYear', 'DatabaseWorkedWith',
    'EdLevel',
    'LanguageDesireNextYear', 'LanguageWorkedWith',
    'MiscTechDesireNextYear', 'MiscTechWorkedWith',
    'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith',
    'NEWEdImpt', 'NEWLearn', 'OpSys', 'OrgSize',
    'PlatformDesireNextYear', 'PlatformWorkedWith',
    'UndergradMajor',
    'WebframeDesireNextYear', 'WebframeWorkedWith',
    'YearsCode', 'YearsCodePro',
]

# Everything that does not fit the groups above.
other_questions = [
    'Hobbyist', 'Age1stCode', 'DevType',
    'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWStuck',
]

In [30]:
# Flatten all thematic groups into a single list and count its entries
# (duplicates, if any, are counted too).
questions_groups = [
    stack_questions,
    survey_questions,
    demo_questions,
    work_questions,
    competences_questions,
    other_questions,
]
questions = []
for group in questions_groups:
    questions.extend(group)
total = len(questions)
total

60

In [31]:
# Columns of the frame that were not assigned to any question group,
# in their original column order (an empty list means full coverage).
df.columns[~df.columns.isin(questions)].tolist()

[]

In [32]:
# (rows, columns) of the filtered frame.
df.shape

(34717, 60)

In [52]:
def multicategorical_dummy(df, col):
    """Return the per-option standard deviation of a ';'-separated answer column.

    Each non-missing answer in ``df[col]`` is split on ';' into its individual
    options, the options are one-hot encoded, and the indicators are collapsed
    back to one row per original index label (1 if that row selected the
    option, 0 otherwise). Rows where ``col`` is missing are excluded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the answers; its index identifies the rows and may have
        any name (or none).
    col : str
        Name of a string column whose values are single options or several
        options joined by ';'.

    Returns
    -------
    pandas.Series
        Standard deviation of each option's 0/1 indicator, indexed by option.
    """
    options = df[col].dropna().str.split(';').explode()
    # Group on the index level itself rather than on a hard-coded index name,
    # so the function works for any frame. The dtype is pinned so the
    # indicators are numeric regardless of the pandas default.
    dummies_df = pd.get_dummies(options, dtype='uint8').groupby(level=0).max()
    return dummies_df.std()

In [53]:
# Columns treated as multi-select (several options joined by ';').
# NOTE(review): 'DatabaseDesireNextYear' is not listed although the other
# '*DesireNextYear' columns are — confirm whether that omission is intended.
multicat_questions = [
    'DatabaseWorkedWith', 'DevType', 'JobFactors',
    'LanguageDesireNextYear', 'LanguageWorkedWith',
    'MiscTechDesireNextYear', 'MiscTechWorkedWith',
    'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith',
    'NEWJobHunt', 'NEWJobHuntResearch', 'NEWPurchaseResearch',
    'NEWStuck', 'PlatformDesireNextYear', 'PlatformWorkedWith',
    'Sexuality', 'WebframeDesireNextYear', 'WebframeWorkedWith',
]

# Print the column name, its per-option summary, then a separator line.
for col in multicat_questions:
    print(col, multicategorical_dummy(df, col), sep='\n')
    print('#'*100, '\n')

DatabaseWorkedWith
Cassandra               0.185454
Couchbase               0.138559
DynamoDB                0.275789
Elasticsearch           0.371365
Firebase                0.335944
IBM DB2                 0.167153
MariaDB                 0.373654
Microsoft SQL Server    0.478914
MongoDB                 0.438886
MySQL                   0.499952
Oracle                  0.363256
PostgreSQL              0.489694
Redis                   0.412563
SQLite                  0.454877
dtype: float64
#################################################################################################### 

DevType
Academic researcher                              0.242331
Data or business analyst                         0.264261
Data scientist or machine learning specialist    0.265333
Database administrator                           0.320070
Designer                                         0.292500
DevOps specialist                                0.341338
Developer, QA or test                        

In [54]:
# Columns treated as single-choice categoricals.
singlecat_questions = [
    'Country', 'CompFreq', 'Employment', 'JobSat',
    'JobSeek', 'NEWDevOps', 'NEWDevOpsImpt', 'NEWEdImpt',
    'NEWLearn', 'NEWOnboardGood', 'NEWPurpleLink', 'OpSys',
    'OrgSize', 'PurchaseWhat', 'Trans', 'UndergradMajor',
]

# Same report as for the multi-select columns: name, per-option summary,
# then a separator line.
for col in singlecat_questions:
    print(col, multicategorical_dummy(df, col), sep='\n')
    print('#'*100, '\n')

Country
Afghanistan                             0.025731
Albania                                 0.023995
Algeria                                 0.021463
Andorra                                 0.007590
Angola                                  0.007590
                                          ...   
Venezuela, Bolivarian Republic of...    0.032186
Viet Nam                                0.054651
Yemen                                   0.005367
Zambia                                  0.012000
Zimbabwe                                0.017798
Length: 159, dtype: float64
#################################################################################################### 

CompFreq
Monthly    0.493915
Weekly     0.164556
Yearly     0.497505
dtype: float64
#################################################################################################### 

Employment
Employed full-time                                      0.323035
Employed part-time                                      0.1

In [43]:
# NOTE(review): the output recorded under this cell looks like percentages,
# whereas multicategorical_dummy as defined in this file ends with `.std()`.
# The output was presumably produced by an earlier version of the function —
# re-run the cell to confirm.
multicategorical_dummy(df, 'EdLevel')

Associate degree (A.A., A.S., etc.)                                                    3.36
Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          48.56
I never completed any formal education                                                 0.43
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                       24.95
Other doctoral degree (Ph.D., Ed.D., etc.)                                             3.29
Primary/elementary school                                                              0.44
Professional degree (JD, MD, etc.)                                                     1.37
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)     4.21
Some college/university study without earning a degree                                11.75
dtype: float64

In [62]:
# Select two columns, mask both of them out so only the index remains, then
# join the full frame back onto that index-only frame.
(
    df[['MainBranch', 'Hobbyist']]
    .loc[:, [False, False]]
    .join(df)
)

Unnamed: 0_level_0,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,I am a developer by profession,Yes,36.0,12,Yearly,116000.0,116000.0,United States,United States dollar,USD,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;React.js;Vue.js,Flask,Just as welcome now as I felt last year,39.0,17,13
10,I am a developer by profession,Yes,22.0,14,Yearly,25000.0,32315.0,United Kingdom,Pound sterling,GBP,...,Easy,Appropriate in length,No,Mathematics or statistics,Flask;jQuery,Flask;jQuery,Somewhat more welcome now than last year,36.0,8,4
11,I am a developer by profession,Yes,23.0,13,Yearly,31000.0,40070.0,United Kingdom,Pound sterling,GBP,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Angular;Django;React.js,Angular;Angular.js;Django;React.js,Just as welcome now as I felt last year,40.0,10,2
12,I am a developer by profession,No,49.0,42,Monthly,1100.0,14268.0,Spain,European Euro,EUR,...,Easy,Appropriate in length,No,Mathematics or statistics,ASP.NET;jQuery,ASP.NET;jQuery,Just as welcome now as I felt last year,40.0,7,7
13,"I am not primarily a developer, but I write co...",Yes,53.0,14,Monthly,3000.0,38916.0,Netherlands,European Euro,EUR,...,Neither easy nor difficult,Too long,No,,,,A lot less welcome now than last year,36.0,35,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65619,"I am not primarily a developer, but I write co...",Yes,,19,Monthly,30000.0,984.0,Nigeria,Nigerian naira,NGN,...,,,,,Angular,jQuery,,4.0,3,2
65625,I am a developer by profession,Yes,,17,Monthly,5500000.0,19428.0,Colombia,Colombian peso,COP,...,,,,"Computer science, computer engineering, or sof...",,,,40.0,12,5
65629,I am a developer by profession,Yes,41.0,15,Yearly,200.0,200.0,United States,United States dollar,USD,...,,,No,"Computer science, computer engineering, or sof...",React.js,React.js,Just as welcome now as I felt last year,,25,20
65630,I am a developer by profession,Yes,,17,Monthly,1000000.0,15048.0,Chile,Chilean peso,CLP,...,,,,"Information systems, information technology, o...",,,,45.0,7,3


In [48]:
# Columns treated as single-choice categoricals.
# A comma was missing between 'OpSys' and 'OrgSize'; Python concatenates
# adjacent string literals, so the list silently held 'OpSysOrgSize' instead
# of the two real column names.
singlecat_questions = ['Country', 'CompFreq', 'Employment',
                       'JobSat', 'JobSeek', 'NEWDevOps', 'NEWDevOpsImpt',
                       'NEWEdImpt', 'NEWLearn', 'NEWOnboardGood', 'NEWPurpleLink',
                       'OpSys', 'OrgSize', 'PurchaseWhat', 'Trans', 'UndergradMajor']

Bachelor’s degree (B.A., B.S., B.Eng., etc.)                                          48.56
Master’s degree (M.A., M.S., M.Eng., MBA, etc.)                                       24.95
Some college/university study without earning a degree                                11.75
Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)     4.21
Associate degree (A.A., A.S., etc.)                                                    3.36
Other doctoral degree (Ph.D., Ed.D., etc.)                                             3.29
Professional degree (JD, MD, etc.)                                                     1.37
Primary/elementary school                                                              0.44
I never completed any formal education                                                 0.43
Name: EdLevel, dtype: float64