### KAGGLE DS & ML SURVEY - 2020

### PROCESSING THE DATA


In [1]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import nxviz as nv

pd.set_option('display.max_columns', None)
pd.set_option("max_rows", None)
pd.options.display.max_seq_items = 2000

In [2]:
import matplotlib
matplotlib.colors
matplotlib.colors.rgb_to_hsv
matplotlib.colors.to_rgba
matplotlib.figure.Figure.get_size_inches
matplotlib.figure.Figure.subplots_adjust
matplotlib.axes.Axes.text
matplotlib.axes.Axes.hlines

<function matplotlib.axes._axes.Axes.hlines(self, y, xmin, xmax, colors=None, linestyles='solid', label='', *, data=None, **kwargs)>

### Kaggle Survey

Kaggle launches a Data Science and Machine Learning (DS & ML) Survey every year as a way to learn about the DS field, with questions on the user demographics (age, gender, education level, working status), their data science and machine learning knowledge and experience, and methods and tools they use or would like to get familiar with. The survey was distributed to the entire Kaggle community through the Kaggle (opted-in) email list, and promoted on the Kaggle website and Kaggle Twitter channel. Since it only targets the Kaggle community, everyone answering has some level of involvement with Kaggle. However, there is much diversity among the Kaggle community.

We will use the survey to model and predict salaries for this population. In this notebook we identify a subset of survey questions that could be relevant to predict salaries, and we transform survey answers so they will be easier to process. Multiple answer questions are transformed into a set of indicator variables for each of the possible answers. Answers that are categorical, but numeric in nature (e.g. age group) are transformed into continuous variables using range midpoints. Finally, we select surveys from employed, non-student respondents, responding to the salary question.


### Import and Clean Data

In [3]:
## Read the raw data
datapath='C:\\Users\\l_gas\\Documents\\Data\\Kaggle ML and DS Survey\\2020\\original_data\\'
survey_data=datapath+"kaggle_survey_2020_responses.csv"
survey = pd.read_csv(survey_data,header=None,skiprows=2)

# create a list with all column names (all survey questions, with multiple columns for multicategory questions)
col_names = list(pd.read_csv(survey_data,nrows=1,header=None).values)[0]
survey.columns = col_names
survey['Id']=survey.index


In [4]:
# Transform the age group variable (use midpoints)
age_b = pd.DataFrame({'Q1_lb_age': [], 'Q1_ub_age': []})
aa = survey.Q1.str.lstrip('+')
age_b[['Q1_lb_age','Q1_ub_age']] = aa.str.split('-',expand=True)
age_lev = pd.Series(pd.to_numeric(age_b.Q1_ub_age)+(pd.to_numeric(age_b.Q1_ub_age) - pd.to_numeric(age_b.Q1_ub_age))/2,name='age_lev')

print("Age Levels - Range Mid points")
print(age_lev.value_counts())


Age Levels - Range Mid points
29.0    4011
24.0    3786
21.0    3469
34.0    2811
39.0    1991
44.0    1397
49.0     988
54.0     698
59.0     411
69.0     398
Name: age_lev, dtype: int64


In [5]:
# Education - set as ordered categories
educ = survey.Q4.copy()
educ[survey.Q4=='No formal education past high school'] = 1
educ[survey.Q4=="Bachelor’s degree"] = 3
educ[survey.Q4=="Master’s degree"] = 4
educ[survey.Q4=="Doctoral degree"] = 5
educ[survey.Q4=="Professional degree"] = 5
educ[survey.Q4=="Some college/university study without earning a bachelor’s degree"] = 2
educ[survey.Q4=="I prefer not to answer"] = None
educ =educ.rename('educ')
print("Education Levels: 1-HS or less 2=Some College 3=BA 4=MA 5=PhD")
print(educ.value_counts())
print(type(educ))
educ.describe()


Education Levels: 1-HS or less 2=Some College 3=BA 4=MA 5=PhD
4    7859
3    6978
5    3001
2    1092
1     240
Name: educ, dtype: int64
<class 'pandas.core.series.Series'>


count     19170
unique        5
top           4
freq       7859
Name: educ, dtype: int64

In [6]:
# Programming Experience - set as ordered category
prog_exp = survey.Q6.copy()
prog_exp[survey.Q6=='I have never written code'] = 0
prog_exp[survey.Q6=='< 1 years'] = 0.5
prog_exp[survey.Q6=='1-2 years'] = 1.5
prog_exp[survey.Q6=='2-3 years'] = 2.5
prog_exp[survey.Q6=='3-5 years'] = 4
prog_exp[survey.Q6=='5-10 years'] = 7.5
prog_exp[survey.Q6=='10-20 years'] = 15
prog_exp[survey.Q6=='20+ years'] = 25
prog_exp = prog_exp.rename('prog_exp')
print("Programming Experience (Years) - Range Mid Points")
print(prog_exp.value_counts())
prog_exp.describe()

Programming Experience (Years) - Range Mid Points
4.0     4546
1.5     4505
0.5     3313
7.5     2552
15.0    1751
25.0    1329
0.0     1124
Name: prog_exp, dtype: int64


count     19120.0
unique        7.0
top           4.0
freq       4546.0
Name: prog_exp, dtype: float64

In [7]:
## Employer size - transform number of employees to continous using midpoints
empl_size = survey.Q20.copy()
empl_size[survey.Q20=='0-49 employees'] = 25
empl_size[survey.Q20=='50-249 employees'] = 150
empl_size[survey.Q20=='250-999 employees'] = 625
empl_size[survey.Q20=='1000-9,999 employees'] = 5500
empl_size[survey.Q20=='10,000 or more employees'] = 15000 
empl_size = empl_size.rename('empl_size')
print("Employer Size (number of employees) - Range Mid Points")
print(empl_size.value_counts())
empl_size.describe()

Employer Size (number of employees) - Range Mid Points
25       4208
15000    2238
5500     1934
150      1671
625      1352
Name: empl_size, dtype: int64


count     11403
unique        5
top          25
freq       4208
Name: empl_size, dtype: int64

In [8]:
## Transform multi-answer columns into indicator variables
def colind(var,column_names, title):
    """
    This function extract from the survey file all columns corresponding to a question, and re-codes to 
    get indicator columns.
    """
    _cnames = [n for n in survey.columns if str(n).find(var)!=-1]
    _selected = survey[_cnames].copy()
    for col in _cnames:
        _selected[col][_selected[col].isna()]=0
        _selected[col][_selected[col]!=0]=1
    _selected.columns = column_names
    print(title)
    print(_selected.describe())
    type(_selected)
    return _selected 
    


In [9]:
# Question 10 - Which hosted notebook products do you use on a regular basis?
nbcols = ['Kaggle','Colab','Azure','Paperspace_Gradient','Binder_JupyterHub','Code_Ocean','IBMWatson_Studio','Amazon_SagenmakerStudio','Amazon_EMR','Google_Cloud_AIPlatform','Google_Cloud_Datalab','Databricks','No_notebook','Other_notebook']
notebooks = colind('Q10', nbcols, "Notebooks Used Regularly")

# Question 12 - What specialized hardward do you use on a regular basis?
hcols = ['GPUs','TPUs','None','Other']
hardware = colind('Q12', hcols, "Specialized Hardware Used Regularly")

# Question 16 - Which ML frameworks do you use on a regular basis?
mlfcols = ['Scikit-learn','TensorFlow','Keras','Pytorch','Fast.ai','MXNet', 'Xgboost','LightGBM', 'CatBoot','Prophet', 'H2O_3','Caret','Tidymodels','JAX','No_MLFrame','Other_MLFrame']
mlframe = colind('Q16', mlfcols, "ML Frameworks Used Regularly")

# Question 17 - Which ML algorithm do you use on a regular basis?
mlacols = ['Linear Logistic Reg','Decision Trees Random Forest','Gradient Boosting Machines', 'Bayesian Approaches', 'Evolucionary Aproaches','Dense Neural Networks','Convolutional Neural Networks','Generative Adversarial Networks','Recurrent Neural Networks','Transformer Networks', 'No_MLAlg', 'Other_MLAlg']
mlalgor = colind('Q17', mlacols, "ML Algorithms Used Regularly")

# Question 18 - Which categories of computer vision methods do you use on a regular basis?
cvmcols = ['General Purpose Image/Video Tools','Image Segmentation Methods','Object Detection Methods', 'Image_Classification' , 'Generative Networks', 'No_CompVMeth', 'Other_CompVMeth']
cvisionm = colind('Q18', cvmcols, "Computer Vision Methods Used Regularly")

# Question 19 - Which NLP method do you use on a regular basis?
nlpmcols = ['Word Embeddings/vectors','Encoder-decoder models','Contextualized Embeddings','Transformer Language Models', 'No_NLPMeth', 'Other_NLPMeth']
nlpm = colind('Q19', nlpmcols, "NLP Methods Used Regularly")


Notebooks Used Regularly
        Kaggle  Colab  Azure  Paperspace_Gradient  Binder_JupyterHub  \
count    20036  20036  20036                20036              20036   
unique       2      2      2                    2                  2   
top          0      0      0                    0                  0   
freq     14044  13707  19179                19856              17964   

        Code_Ocean  IBMWatson_Studio  Amazon_SagenmakerStudio  Amazon_EMR  \
count        20036             20036                    20036       20036   
unique           2                 2                        2           2   
top              0                 0                        0           0   
freq         19931             19190                    19539       19791   

        Google_Cloud_AIPlatform  Google_Cloud_Datalab  Databricks  \
count                     20036                 20036       20036   
unique                        2                     2           2   
top                  

In [10]:
# Question 8 - What languages do you use on a regular basis?
plcols =['PL_Python','PL_R','PL_SQL','PL_C','PL_C++','PL_Java','PL_Javascript','PL_Julia','PL_Swift','PL_Bash','PL_MATLAB','PL_None','PL_Other']
proglan = colind('Q7', plcols, "Programming Language Used Regularly")


Programming Language Used Regularly
        PL_Python   PL_R  PL_SQL   PL_C  PL_C++  PL_Java  PL_Javascript  \
count       20036  20036   20036  20036   20036    20036          20036   
unique          2      2       2      2       2        2              2   
top             1      0       0      0       0        0              0   
freq        15530  15759   12501  16721   16209    16669          17041   

        PL_Julia  PL_Swift  PL_Bash  PL_MATLAB  PL_None  PL_Other  
count      20036     20036    20036      20036    20036     20036  
unique         2         2        2          2        2         2  
top            0         0        0          0        0         0  
freq       19774     19838    18260      17819    19830     18091  


In [11]:
# Number of programming languages
q7cols = [n for n in survey.columns if str(n).find('Q7')!=-1]
n_lang = pd.Series([sum(row[q7cols].notna()) for i, row in survey.iterrows() if not('None' in row[q7cols])])
n_lang[survey.Q7_Part_12=='None'] = 0
n_lang = n_lang.rename('Language Number')

print("Number of Programming Languages Used")
print(n_lang.value_counts())

Number of Programming Languages Used
2     5112
1     4252
3     4103
0     2334
4     2165
5     1165
6      557
7      220
8       86
9       26
10       9
11       4
12       3
Name: Language Number, dtype: int64


In [12]:
# Gender
gender = survey.Q2.copy()
gender[survey.Q2=='Man'] = 'Male'
gender[survey.Q2=='Woman'] = 'Woman'
gender[survey.Q2=='Nonbinary']= 'Gender_Other'
gender[survey.Q2=='Prefer not to say'] = 'Gender_Other'
gender[survey.Q2=='Prefer to self-describe'] = 'Gender_Other'
gender = gender.rename('gender')

print("Gender")
print(gender.value_counts())


Gender
Male            15789
Woman            3878
Gender_Other      369
Name: gender, dtype: int64


In [13]:
# Use of data visialization libraries (python, R, Javascript)
xv = [n for n in survey.columns if str(n).find('Q14')!=-1]
vizlib = pd.Series([any(row[xv].notna()) and not('None' in row[xv]) for i, row in survey.iterrows()])
vizlib = vizlib.rename('vizlib')
print("Use of Data Visualizatio Libraries")
print(vizlib.value_counts())


Use of Data Visualizatio Libraries
True     16478
False     3558
Name: vizlib, dtype: int64


In [14]:
# Machine Learning methods: 2 or more years of experience, some experience or none
mlexp = pd.Series(survey.Q15.copy())
mlexp[survey.Q15=='I do not use machine learning methods'] = 'No ML Experience'
mlexp[survey.Q15=='Under 1 year'] = 'Some ML Experience'
mlexp[survey.Q15=='1-2 years'] = 'Some ML Experience'
mlexp[survey.Q15=='2-3 years'] = 'More than 2yrs ML Exp'
mlexp[survey.Q15=='3-4 years'] = 'More than 2yrs ML Exp'
mlexp[survey.Q15=='4-5 years'] = 'More than 2yrs ML Exp'
mlexp[survey.Q15=='5-10 years'] = 'More than 2yrs ML Exp'
mlexp[survey.Q15=='10-20 years'] = 'More than 2yrs ML Exp'
mlexp[survey.Q15=='20 or more years'] = 'More than 2yrs ML Exp'
mlexp = mlexp.rename('mlexp')

print("Machine Learning Algorithm Experience")
print(mlexp.value_counts())


Machine Learning Algorithm Experience
Some ML Experience       9771
More than 2yrs ML Exp    4528
No ML Experience         2075
Name: mlexp, dtype: int64


In [15]:
# Machine Learning Experience - ordered categories
mlexp_or = survey.Q15.copy()
mlexp_or[survey.Q15=='I do not use machine learning methods'] = 0
mlexp_or[survey.Q15=='Under 1 year'] = 0.5
mlexp_or[survey.Q15=='1-2 years'] = 1.5
mlexp_or[survey.Q15=='2-3 years'] = 2.5
mlexp_or[survey.Q15=='3-4 years'] = 3.5
mlexp_or[survey.Q15=='4-5 years'] = 4.5
mlexp_or[survey.Q15=='5-10 years'] = 7.5
mlexp_or[survey.Q15=='10-20 years'] = 15
mlexp_or[survey.Q15=='20 or more years'] = 25
mlexp_or = mlexp_or.rename('mlexp_or')
print(mlexp_or.value_counts())

0.5     6312
1.5     3459
0.0     2075
2.5     1631
3.5      893
7.5      801
4.5      784
15.0     244
25.0     175
Name: mlexp_or, dtype: int64


In [16]:
# Cloud computer use

xcc = [n for n in survey.columns if str(n).find('Q26_A')!=-1]
cloudcomp = pd.Series([any(row[xcc].notna()) and not('None' in row[xcc]) for i, row in survey.iterrows()])
cloudcomp = cloudcomp.rename('cloudcomp')
print("Use of Cloud Computing")
print(cloudcomp.value_counts())


Use of Cloud Computing
False    13140
True      6896
Name: cloudcomp, dtype: int64


In [17]:
## Transform compensation reanges into a continous variable
wage_df = survey.Q24.str.lstrip('> $').str.split('-',expand=True)
wage_df = wage_df.replace(',','',regex=True).apply(pd.to_numeric, errors='coerce')
wage_df.columns = ['Q24_lb_wage','Q24_ub_wage']
wage_df['wage'] = wage_df['Q24_lb_wage']+(wage_df['Q24_ub_wage']-wage_df['Q24_lb_wage'])/2
wage_df.wage[wage_df['Q24_lb_wage']==500000] = 600000
print(wage_df.head())


   Q24_lb_wage  Q24_ub_wage      wage
0          NaN          NaN       NaN
1     100000.0     124999.0  112499.5
2      15000.0      19999.0   17499.5
3     125000.0     149999.0  137499.5
4          NaN          NaN       NaN


In [18]:
## Classify countries by medium wage quartiles

incpath='C:\\Users\\l_gas\\Documents\\Data\\Countries Median Income\\'    
country_inc = pd.read_json(incpath+"data.json")
country_inc = country_inc[country_inc.medianHouseholdIncome.notna()]
medinc=country_inc[["country","medianHouseholdIncome"]].sort_values(by="medianHouseholdIncome")

#sns.relplot(x="country",y="medianHouseholdwage",data=medinc)
#plt.show()

medinc=pd.concat([medinc,pd.qcut(medinc.medianHouseholdIncome,4,labels=["Low Income","Medium Low Inc", "Medium High Inc", "High Income"]).rename("CountryIncLev")],axis=1)
print(medinc.head())



        country  medianHouseholdIncome CountryIncLev
130        Togo                  571.0    Low Income
129     Burundi                  673.0    Low Income
128     Liberia                  781.0    Low Income
127  Madagascar                 1013.0    Low Income
126      Rwanda                 1101.0    Low Income


In [19]:
# CREATE FILE WITH SELECTED VARIABLES - INCLUDE ORDER CATEGORIES - AND SELECTED OBSERVATIONS
# Regular categories

## Select categorical variables to create dummies
rc = ['Q3','Q5','Q11','Q38','Q24']
categories = survey[rc].copy()
categories.Q3[categories.Q3=='Other'] = 'Other Countries'
categories.Q3[categories.Q3=='United States of America'] = 'United States'
categories.Q3[categories.Q3=='Iran, Islamic Republic of...'] = 'Iran'
categories.Q3[categories.Q3=='United Kingdom of Great Britain and Northern Ireland'] = 'United Kingdom'

## Country will be used to get 4 different wage levels based on median household wage in the country
categories = categories.merge(medinc[['country','CountryIncLev']],how='left', left_on='Q3', right_on='country')

## Top 10 countries with surveys will translate into an indicator column
#top10 = ["India","United States", "Brazil", "Japan","Russia", "United Kingdom","Nigeria","China","Germany","Turkey"]
top10 = ["India", "United States", "Brazil", "Japan", "Russia", "United Kingdom", "Germany", "Nigeria", "Spain", "Canada"]
categories['TopCountries'] = "Other"
for (i,c) in categories.country.items():
    if c in top10:
        categories.loc[i,'TopCountries'] = c

# Final list of variables to create dummies and corresponding prefix
rcnew = ['CountryIncLev','TopCountries','Q5','Q11','Q38']
rc_n = ['Country','','Role','Platform','MainTool']

xmc = pd.get_dummies(categories,columns=rcnew,prefix=rc_n)
xmc = xmc.drop(['Q3','country'],axis=1)
# Transformed regular categories
xmc2 = pd.get_dummies(gender,columns=['gender'])
xmc3 = pd.get_dummies(mlexp,columns=['mlexp'])

categ2 = [age_lev, educ, prog_exp, empl_size, notebooks, hardware, mlframe, mlalgor, cvisionm, nlpm, proglan, n_lang, mlexp_or, vizlib, cloudcomp, xmc, xmc2, xmc3,wage_df]

survey_features = pd.concat(categ2,axis=1)
print(survey_features.shape)

# Select Only Eployed Individuals.

print("Number of Unemployed Respondents: %0.0f" % survey_features["Role_Currently not employed"].sum())
print("Number of Student Respondents: %0.0f " % survey_features["Role_Student"].sum())
features2 = survey_features.loc[(survey_features["Role_Currently not employed"]==0) & (survey_features["Role_Student"]==0)]
print("Matrix size after dropping unemployed and students",features2.shape)
# Drop Cases with missing income or other covariates. NEED TO EXPLORE IF THERE ARE COVARIATES WITH LARGE NUMBER OF MISSINGS
print("Number of observations with missing income: ", features2.wage.isnull().sum())
# drop cases with missing income
features2 = features2[features2.wage.notna()]
print("Shape of dataframe with selected observations: ",features2.shape)


features2.to_json("./DS_ML_Survey_feature_sel_3.json", compression=None)



(20036, 129)
Number of Unemployed Respondents: 1652
Number of Student Respondents: 5171 
Matrix size after dropping unemployed and students (13213, 129)
Number of observations with missing income:  2484
Shape of dataframe with selected observations:  (10729, 129)
