In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
data = pd.read_csv('/Users/cheng-chunchao/Desktop/data-x-dataset.csv')

# Cleaned questions list could be checked on google docs: question_detail

In [4]:
# seperate questions into a new dataframe
question = data.iloc[0,:]
data = data.drop(0,axis=0)

In [5]:
data = data[data['Q5'].notna()] # delete those who didn't fill title - having most NaN

## Clean NaN value and change type into numerical or categorical

In [6]:
# pd.unique(data['Q1'])
data['Q1'] = data['Q1'].astype('category')

# pd.unique(data['Q2'])
data['Q2'] = data['Q2'].astype('category')

# pd.unique(data['Q3'])
data['Q3'] = data['Q3'].astype('category')

# Replace NaN value in education into 'I prefer not to answer'
data['Q4'].fillna('I prefer not to answer',inplace=True)
data['Q4'] = data['Q4'].astype('category')

# Replace NaN value, students are unemployed
data['Q6'].fillna('Not employed',inplace=True) 
data['Q6'] = data['Q6'].astype('category')

# Rank to what degree do people incorporate machine learning methods into their work
data['Q8'].fillna(0,inplace=True)
data['Q8'].replace(
    ['We recently started using ML methods (i.e., models in production for less than 2 years)',
     'I do not know',
     'We use ML methods for generating insights (but do not put working models into production)',
     'We have well established ML methods (i.e., models in production for more than 2 years)',
     'We are exploring ML methods (and may one day put a model into production)',
     'No (we do not use ML methods)'],[3,0,2,4,1,0],inplace=True)

# Replace plain text description with Yes/No
data['Q9_Part_1'].replace(['Analyze and understand data to influence product or business decisions'],['True'],inplace=True)
data['Q9_Part_2'].replace(['Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data'],['True'],inplace=True)
data['Q9_Part_3'].replace(['Build prototypes to explore applying machine learning to new areas'],['True'],inplace=True)
data['Q9_Part_4'].replace(['Build and/or run a machine learning service that operationally improves my product or workflows'],['True'],inplace=True)
data['Q9_Part_5'].replace(['Experimentation and iteration to improve existing ML models'],['True'],inplace=True)
data['Q9_Part_6'].replace(['Do research that advances the state of the art of machine learning'],['True'],inplace=True)
data['Q9_Part_7'].replace(['None of these activities are an important part of my role at work'],['True'],inplace=True)
del data['Q9_Part_8']

data['Q9_Part_1'].fillna('False',inplace=True)
data['Q9_Part_2'].fillna('False',inplace=True)
data['Q9_Part_3'].fillna('False',inplace=True)
data['Q9_Part_4'].fillna('False',inplace=True)
data['Q9_Part_5'].fillna('False',inplace=True)
data['Q9_Part_6'].fillna('False',inplace=True)
data['Q9_Part_7'].fillna('False',inplace=True)

data['Q9_Part_1'] = data['Q9_Part_1'].astype('category')
data['Q9_Part_2'] = data['Q9_Part_2'].astype('category')
data['Q9_Part_3'] = data['Q9_Part_3'].astype('category')
data['Q9_Part_4'] = data['Q9_Part_4'].astype('category')
data['Q9_Part_5'] = data['Q9_Part_5'].astype('category')
data['Q9_Part_6'] = data['Q9_Part_6'].astype('category')
data['Q9_Part_7'] = data['Q9_Part_7'].astype('category')


### Assign 0 salary to students & not employed, and fill NaN value by calculating guess matrix of age & title

In [7]:
# Assign $0 salary to students & not employed, and fill NaN value by calculating guess matrix of age & title
data['Q10'][data['Q5']=='Student'].fillna('0-999',inplace=True)
data['Q10'][data['Q5']=='Not employed'].fillna('0-999',inplace=True)
data['Q10'].replace(['$0-999'],['0-999'],inplace=True)
data['Q10'] = data['Q10'].astype('category')

guess_matrix_salary = np.empty((11,12),dtype=object)
test=[]
for i in range(0, 11):
    for j in range(0, 12):
        guess_df = data[(data['Q1'] == pd.unique(data['Q1'])[i])&(data['Q5'] == pd.unique(data['Q5'])[j])]['Q10'].dropna()
        
        try:
            salary_guess = guess_df.mode().iloc[0]
        except IndexError:
            salary_guess = '0-999'
            
        guess_matrix_salary[i,j] = salary_guess
    

for i in range(0, 11): 
    for j in range(0, 12): 
        data.loc[(data['Q10'].isnull()) & (data['Q1'] == pd.unique(data['Q1'])[i]) & (data['Q5'] == pd.unique(data['Q5'])[j]),'Q10'] = guess_matrix_salary[i,j]

data['Q10'] = data['Q10'].astype('category')

### Money you spent on machine learning and/or cloud computing products 

In [8]:
# Money you spent on machine learning and/or cloud computing products 
data['Q11'].fillna('$0 (USD)',inplace=True)
data['Q11'] = data['Q11'].astype('category')

### On which platforms have you begun or completed data science courses? (multiple choices)

In [9]:
# On which platforms have you begun or completed data science courses? (multiple choices)
data['Q13_Part_1'].fillna(0,inplace=True)
data['Q13_Part_1'].replace('Udacity',1)
data['Q13_Part_1'] = data['Q13_Part_1'].astype('category')

data['Q13_Part_2'].fillna(0,inplace=True)
data['Q13_Part_2'].replace('Coursera',1)
data['Q13_Part_2'] = data['Q13_Part_2'].astype('category')

data['Q13_Part_3'].fillna(0,inplace=True)
data['Q13_Part_3'].replace('edX',1)
data['Q13_Part_3'] = data['Q13_Part_3'].astype('category')

data['Q13_Part_4'].fillna(0,inplace=True)
data['Q13_Part_4'].replace('DataCamp',1)
data['Q13_Part_4'] = data['Q13_Part_4'].astype('category')

data['Q13_Part_5'].fillna(0,inplace=True)
data['Q13_Part_5'].replace('DataQuest',1)
data['Q13_Part_5'] = data['Q13_Part_5'].astype('category')

data['Q13_Part_6'].fillna(0,inplace=True)
data['Q13_Part_6'].replace('Kaggle Courses (i.e. Kaggle Learn)',1)
data['Q13_Part_6'] = data['Q13_Part_6'].astype('category')

data['Q13_Part_7'].fillna(0,inplace=True)
data['Q13_Part_7'].replace('Fast.ai',1)
data['Q13_Part_7'] = data['Q13_Part_7'].astype('category')

data['Q13_Part_8'].fillna(0,inplace=True)
data['Q13_Part_8'].replace('Udemy',1)
data['Q13_Part_8'] = data['Q13_Part_8'].astype('category')

data['Q13_Part_9'].fillna(0,inplace=True)
data['Q13_Part_9'].replace('LinkedIn Learning',1)
data['Q13_Part_9'] = data['Q13_Part_9'].astype('category')

data['Q13_Part_10'].fillna(0,inplace=True)
data['Q13_Part_10'].replace('University Courses (resulting in a university degree)',1)
data['Q13_Part_10'] = data['Q13_Part_10'].astype('category')

del data['Q13_Part_11']
del data['Q13_Part_12']

1       0
2       0
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q13_Part_1, Length: 3454, dtype: int64

1       0
2       0
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    1
3533    0
3534    0
Name: Q13_Part_2, Length: 3454, dtype: int64

1       0
2       0
3       0
4       1
5       0
       ..
3529    0
3530    0
3531    1
3533    0
3534    0
Name: Q13_Part_3, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q13_Part_4, Length: 3454, dtype: int64

1       0
2       0
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q13_Part_5, Length: 3454, dtype: int64

1       0
2       1
3       1
4       0
5       1
       ..
3529    1
3530    0
3531    0
3533    0
3534    0
Name: Q13_Part_6, Length: 3454, dtype: int64

1       0
2       0
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q13_Part_7, Length: 3454, dtype: int64

1       1
2       0
3       1
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q13_Part_8, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q13_Part_9, Length: 3454, dtype: int64

1       1
2       1
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    1
3533    0
3534    0
Name: Q13_Part_10, Length: 3454, dtype: int64

### What is the primary tool that you use at work or school to analyze data?

In [10]:
# What is the primary tool that you use at work or school to analyze data?
del data['Q14_Part_1_TEXT']
del data['Q14_Part_2_TEXT']
del data['Q14_Part_3_TEXT']
del data['Q14_Part_4_TEXT']
del data['Q14_Part_5_TEXT']
del data['Q14_OTHER_TEXT']

data['Q14'].fillna('Other',inplace=True)
data['Q14'] = data['Q14'].astype('category')

### How long have you been writing code to analyze data?

In [11]:
# How long have you been writing code to analyze data?
data['Q15'].fillna('< 1 years',inplace=True)
data['Q15'] = data['Q15'].astype('category')

### Which of the following integrated development environments (IDE's) do you use on a regular basis?

In [12]:
# Which of the following integrated development environments (IDE's) do you use on a regular basis? 
data['Q16_Part_1'].fillna(0,inplace=True)
data['Q16_Part_1'].replace('Jupyter (JupyterLab, Jupyter Notebooks, etc) ',1)
data['Q16_Part_1'] = data['Q16_Part_1'].astype('category')

data['Q16_Part_2'].fillna(0,inplace=True)
data['Q16_Part_2'].replace(' RStudio ',1)
data['Q16_Part_2'] = data['Q16_Part_2'].astype('category')

data['Q16_Part_3'].fillna(0,inplace=True)
data['Q16_Part_3'].replace(' PyCharm ',1)
data['Q16_Part_3'] = data['Q16_Part_3'].astype('category')

data['Q16_Part_4'].fillna(0,inplace=True)
data['Q16_Part_4'].replace(' Atom ',1)
data['Q16_Part_4'] = data['Q16_Part_4'].astype('category')

data['Q16_Part_5'].fillna(0,inplace=True)
data['Q16_Part_5'].replace(' MATLAB ',1)
data['Q16_Part_5'] = data['Q16_Part_5'].astype('category')

data['Q16_Part_6'].fillna(0,inplace=True)
data['Q16_Part_6'].replace(' Visual Studio / Visual Studio Code ',1)
data['Q16_Part_6'] = data['Q16_Part_6'].astype('category')

data['Q16_Part_7'].fillna(0,inplace=True)
data['Q16_Part_7'].replace('  Spyder  ',1)
data['Q16_Part_7'] = data['Q16_Part_7'].astype('category')

data['Q16_Part_8'].fillna(0,inplace=True)
data['Q16_Part_8'].replace('  Vim / Emacs  ',1)
data['Q16_Part_8'] = data['Q16_Part_8'].astype('category')

data['Q16_Part_9'].fillna(0,inplace=True)
data['Q16_Part_9'].replace('  Notepad++  ',1)
data['Q16_Part_9'] = data['Q16_Part_9'].astype('category')

data['Q16_Part_10'].fillna(0,inplace=True)
data['Q16_Part_10'].replace('  Sublime Text  ',1)
data['Q16_Part_10'] = data['Q16_Part_10'].astype('category')

1       1
2       1
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_1, Length: 3454, dtype: int64

1       0
2       0
3       0
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_2, Length: 3454, dtype: int64

1       0
2       1
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_3, Length: 3454, dtype: int64

1       0
2       1
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_4, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_5, Length: 3454, dtype: int64

1       0
2       0
3       0
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_6, Length: 3454, dtype: int64

1       1
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_7, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_8, Length: 3454, dtype: int64

1       0
2       0
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_9, Length: 3454, dtype: int64

1       0
2       0
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q16_Part_10, Length: 3454, dtype: int64

### What programming languages do you use on a regular basis? 

In [13]:
# What programming languages do you use on a regular basis? 
data['Q18_Part_1'].fillna(0,inplace=True)
data['Q18_Part_1'].replace('Python',1)
data['Q18_Part_1'] = data['Q18_Part_1'].astype('category')

data['Q18_Part_2'].fillna(0,inplace=True)
data['Q18_Part_2'].replace('R',1)
data['Q18_Part_2'] = data['Q18_Part_2'].astype('category')

data['Q18_Part_3'].fillna(0,inplace=True)
data['Q18_Part_3'].replace('SQL',1)
data['Q18_Part_3'] = data['Q18_Part_3'].astype('category')

data['Q18_Part_4'].fillna(0,inplace=True)
data['Q18_Part_4'].replace('C',1)
data['Q18_Part_4'] = data['Q18_Part_4'].astype('category')

data['Q18_Part_5'].fillna(0,inplace=True)
data['Q18_Part_5'].replace('C++',1)
data['Q18_Part_5'] = data['Q18_Part_5'].astype('category')

data['Q18_Part_6'].fillna(0,inplace=True)
data['Q18_Part_6'].replace('Java',1)
data['Q18_Part_6'] = data['Q18_Part_6'].astype('category')

data['Q18_Part_7'].fillna(0,inplace=True)
data['Q18_Part_7'].replace('Javascript',1)
data['Q18_Part_7'] = data['Q18_Part_7'].astype('category')

data['Q18_Part_8'].fillna(0,inplace=True)
data['Q18_Part_8'].replace('TypeScript',1)
data['Q18_Part_8'] = data['Q18_Part_8'].astype('category')

data['Q18_Part_9'].fillna(0,inplace=True)
data['Q18_Part_9'].replace('Bash',1)
data['Q18_Part_9'] = data['Q18_Part_9'].astype('category')

data['Q18_Part_10'].fillna(0,inplace=True)
data['Q18_Part_10'].replace('MATLAB',1)
data['Q18_Part_10'] = data['Q18_Part_10'].astype('category')

del data['Q19']

1       1
2       1
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_1, Length: 3454, dtype: int64

1       0
2       0
3       0
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_2, Length: 3454, dtype: int64

1       0
2       0
3       0
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_3, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_4, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_5, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_6, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_7, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_8, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_9, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q18_Part_10, Length: 3454, dtype: int64

### What data visualization libraries or tools do you use on a regular basis?  

In [14]:
# What data visualization libraries or tools do you use on a regular basis?  
data['Q20_Part_1'].fillna(0,inplace=True)
data['Q20_Part_1'].replace(' Ggplot / ggplot2 ',1)
data['Q20_Part_1'] = data['Q20_Part_1'].astype('category')

data['Q20_Part_2'].fillna(0,inplace=True)
data['Q20_Part_2'].replace(' Matplotlib ',1)
data['Q20_Part_2'] = data['Q20_Part_2'].astype('category')

data['Q20_Part_3'].fillna(0,inplace=True)
data['Q20_Part_3'].replace(' Altair ',1)
data['Q20_Part_3'] = data['Q20_Part_3'].astype('category')

data['Q20_Part_4'].fillna(0,inplace=True)
data['Q20_Part_4'].replace(' Shiny ',1)
data['Q20_Part_4'] = data['Q20_Part_4'].astype('category')

data['Q20_Part_5'].fillna(0,inplace=True)
data['Q20_Part_5'].replace(' D3.js ',1)
data['Q20_Part_5'] = data['Q20_Part_5'].astype('category')

data['Q20_Part_6'].fillna(0,inplace=True)
data['Q20_Part_6'].replace(' Plotly / Plotly Express ',1)
data['Q20_Part_6'] = data['Q20_Part_6'].astype('category')

data['Q20_Part_7'].fillna(0,inplace=True)
data['Q20_Part_7'].replace(' Bokeh ',1)
data['Q20_Part_7'] = data['Q20_Part_7'].astype('category')

data['Q20_Part_8'].fillna(0,inplace=True)
data['Q20_Part_8'].replace(' Seaborn ',1)
data['Q20_Part_8'] = data['Q20_Part_8'].astype('category')

data['Q20_Part_9'].fillna(0,inplace=True)
data['Q20_Part_9'].replace(' Geoplotlib ',1)
data['Q20_Part_9'] = data['Q20_Part_9'].astype('category')

data['Q20_Part_10'].fillna(0,inplace=True)
data['Q20_Part_10'].replace(' Leaflet / Folium ',1)
data['Q20_Part_10'] = data['Q20_Part_10'].astype('category')

del data['Q21_Part_1']

1       0
2       0
3       0
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_1, Length: 3454, dtype: int64

1       1
2       1
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_2, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_3, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_4, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_5, Length: 3454, dtype: int64

1       1
2       0
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_6, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_7, Length: 3454, dtype: int64

1       0
2       1
3       1
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_8, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_9, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q20_Part_10, Length: 3454, dtype: int64

### For how many years have you used machine learning methods?

In [15]:
# For how many years have you used machine learning methods?
data['Q23'].fillna('< 1 years',inplace=True)
data['Q23'] = data['Q23'].astype('category')

### Which of the following ML algorithms do you use on a regular basis?

In [16]:
# Which of the following ML algorithms do you use on a regular basis?
data['Q24_Part_1'].fillna(0,inplace=True)
data['Q24_Part_1'].replace('Linear or Logistic Regression',1)
data['Q24_Part_1'] = data['Q24_Part_1'].astype('category')

data['Q24_Part_2'].fillna(0,inplace=True)
data['Q24_Part_2'].replace('Decision Trees or Random Forests',1)
data['Q24_Part_2'] = data['Q24_Part_2'].astype('category')

data['Q24_Part_3'].fillna(0,inplace=True)
data['Q24_Part_3'].replace('Gradient Boosting Machines (xgboost, lightgbm, etc)',1)
data['Q24_Part_3'] = data['Q24_Part_3'].astype('category')

data['Q24_Part_4'].fillna(0,inplace=True)
data['Q24_Part_4'].replace('Bayesian Approaches',1)
data['Q24_Part_4'] = data['Q24_Part_4'].astype('category')

data['Q24_Part_5'].fillna(0,inplace=True)
data['Q24_Part_5'].replace('Evolutionary Approaches',1)
data['Q24_Part_5'] = data['Q24_Part_5'].astype('category')

data['Q24_Part_6'].fillna(0,inplace=True)
data['Q24_Part_6'].replace('Dense Neural Networks (MLPs, etc)',1)
data['Q24_Part_6'] = data['Q24_Part_6'].astype('category')

data['Q24_Part_7'].fillna(0,inplace=True)
data['Q24_Part_7'].replace('Convolutional Neural Networks',1)
data['Q24_Part_7'] = data['Q24_Part_7'].astype('category')

data['Q24_Part_8'].fillna(0,inplace=True)
data['Q24_Part_8'].replace('Generative Adversarial Networks',1)
data['Q24_Part_8'] = data['Q24_Part_8'].astype('category')

data['Q24_Part_9'].fillna(0,inplace=True)
data['Q24_Part_9'].replace('Recurrent Neural Networks',1)
data['Q24_Part_9'] = data['Q24_Part_9'].astype('category')

data['Q24_Part_10'].fillna(0,inplace=True)
data['Q24_Part_10'].replace('Transformer Networks (BERT, gpt-2, etc)',1)
data['Q24_Part_10'] = data['Q24_Part_10'].astype('category')

1       1
2       1
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_1, Length: 3454, dtype: int64

1       1
2       1
3       1
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_2, Length: 3454, dtype: int64

1       0
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_3, Length: 3454, dtype: int64

1       0
2       1
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_4, Length: 3454, dtype: int64

1       0
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_5, Length: 3454, dtype: int64

1       0
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_6, Length: 3454, dtype: int64

1       1
2       1
3       0
4       0
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_7, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_8, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_9, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q24_Part_10, Length: 3454, dtype: int64

In [17]:
# delete useless and similar columns
del data['Q25_Part_1']
del data['Q25_Part_2']
del data['Q25_Part_3']
del data['Q25_Part_4']
del data['Q25_Part_5']
del data['Q25_Part_6']
del data['Q25_Part_7']
del data['Q25_Part_8']


### Which categories of computer vision methods do you use on a regular basis?

In [18]:
# Which categories of computer vision methods do you use on a regular basis?
data['Q26_Part_1'].fillna(0,inplace=True)
data['Q26_Part_1'].replace('General purpose image/video tools (PIL, cv2, skimage, etc)',1)
data['Q26_Part_1'] = data['Q26_Part_1'].astype('category')

data['Q26_Part_2'].fillna(0,inplace=True)
data['Q26_Part_2'].replace('Image segmentation methods (U-Net, Mask R-CNN, etc)',1)
data['Q26_Part_2'] = data['Q26_Part_2'].astype('category')

data['Q26_Part_3'].fillna(0,inplace=True)
data['Q26_Part_3'].replace('Object detection methods (YOLOv3, RetinaNet, etc)',1)
data['Q26_Part_3'] = data['Q26_Part_3'].astype('category')

data['Q26_Part_4'].fillna(0,inplace=True)
data['Q26_Part_4'].replace('Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)',1)
data['Q26_Part_4'] = data['Q26_Part_4'].astype('category')

data['Q26_Part_5'].fillna(0,inplace=True)
data['Q26_Part_5'].replace('Generative Networks (GAN, VAE, etc)',1)
data['Q26_Part_5'] = data['Q26_Part_5'].astype('category')

1       1
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q26_Part_1, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q26_Part_2, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q26_Part_3, Length: 3454, dtype: int64

1       1
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q26_Part_4, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q26_Part_5, Length: 3454, dtype: int64

### Which of the following natural language processing (NLP) methods do you use on a regular basis?

In [19]:
# Which of the following natural language processing (NLP) methods do you use on a regular basis?
data['Q27_Part_1'].fillna(0,inplace=True)
data['Q27_Part_1'].replace('Word embeddings/vectors (GLoVe, fastText, word2vec)',1)
data['Q27_Part_1'] = data['Q27_Part_1'].astype('category')

data['Q27_Part_2'].fillna(0,inplace=True)
data['Q27_Part_2'].replace('Encoder-decorder models (seq2seq, vanilla transformers)',1)
data['Q27_Part_2'] = data['Q27_Part_2'].astype('category')

data['Q27_Part_3'].fillna(0,inplace=True)
data['Q27_Part_3'].replace('Contextualized embeddings (ELMo, CoVe)',1)
data['Q27_Part_3'] = data['Q27_Part_3'].astype('category')

data['Q27_Part_4'].fillna(0,inplace=True)
data['Q27_Part_4'].replace('Transformer language models (GPT-2, BERT, XLnet, etc)',1)
data['Q27_Part_4'] = data['Q27_Part_4'].astype('category')


1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q27_Part_1, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q27_Part_2, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q27_Part_3, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q27_Part_4, Length: 3454, dtype: int64

### Which of the following machine learning frameworks do you use on a regular basis?  

In [20]:
# Which of the following machine learning frameworks do you use on a regular basis?  
data['Q28_Part_1'].fillna(0,inplace=True)
data['Q28_Part_1'].replace('  Scikit-learn ',1)
data['Q28_Part_1'] = data['Q28_Part_1'].astype('category')

data['Q28_Part_2'].fillna(0,inplace=True)
data['Q28_Part_2'].replace('  TensorFlow ',1)
data['Q28_Part_2'] = data['Q28_Part_2'].astype('category')

data['Q28_Part_3'].fillna(0,inplace=True)
data['Q28_Part_3'].replace(' Keras ',1)
data['Q28_Part_3'] = data['Q28_Part_3'].astype('category')

data['Q28_Part_4'].fillna(0,inplace=True)
data['Q28_Part_4'].replace(' RandomForest',1)
data['Q28_Part_4'] = data['Q28_Part_4'].astype('category')

data['Q28_Part_5'].fillna(0,inplace=True)
data['Q28_Part_5'].replace(' Xgboost ',1)
data['Q28_Part_5'] = data['Q28_Part_5'].astype('category')

data['Q28_Part_6'].fillna(0,inplace=True)
data['Q28_Part_6'].replace(' PyTorch ',1)
data['Q28_Part_6'] = data['Q28_Part_6'].astype('category')

data['Q28_Part_7'].fillna(0,inplace=True)
data['Q28_Part_7'].replace(' Caret ',1)
data['Q28_Part_7'] = data['Q28_Part_7'].astype('category')

data['Q28_Part_8'].fillna(0,inplace=True)
data['Q28_Part_8'].replace(' LightGBM ',1)
data['Q28_Part_8'] = data['Q28_Part_8'].astype('category')

data['Q28_Part_9'].fillna(0,inplace=True)
data['Q28_Part_9'].replace(' Spark MLib ',1)
data['Q28_Part_9'] = data['Q28_Part_9'].astype('category')

data['Q28_Part_10'].fillna(0,inplace=True)
data['Q28_Part_10'].replace(' Fast.ai ',1)
data['Q28_Part_10'] = data['Q28_Part_10'].astype('category')

1       1
2       1
3       1
4       1
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_1, Length: 3454, dtype: int64

1       1
2       0
3       0
4       0
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_2, Length: 3454, dtype: int64

1       1
2       0
3       0
4       0
5       1
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_3, Length: 3454, dtype: int64

1       0
2       0
3       1
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_4, Length: 3454, dtype: int64

1       0
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_5, Length: 3454, dtype: int64

1       0
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_6, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_7, Length: 3454, dtype: int64

1       0
2       1
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_8, Length: 3454, dtype: int64

1       1
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_9, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q28_Part_10, Length: 3454, dtype: int64

### Which of the following cloud computing platforms do you use on a regular basis?

In [21]:
# Which of the following cloud computing platforms do you use on a regular basis?
data['Q29_Part_1'].fillna(0,inplace=True)
data['Q29_Part_1'].replace(' Google Cloud Platform (GCP) ',1)
data['Q29_Part_1'] = data['Q29_Part_1'].astype('category')

data['Q29_Part_2'].fillna(0,inplace=True)
data['Q29_Part_2'].replace(' Amazon Web Services (AWS) ',1)
data['Q29_Part_2'] = data['Q29_Part_2'].astype('category')

data['Q29_Part_3'].fillna(0,inplace=True)
data['Q29_Part_3'].replace(' Microsoft Azure ',1)
data['Q29_Part_3'] = data['Q29_Part_3'].astype('category')

del data['Q29_Part_4']
del data['Q29_Part_5']
del data['Q29_Part_6']
del data['Q29_Part_7']
del data['Q29_Part_8']
del data['Q29_Part_9']
del data['Q29_Part_10']

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q29_Part_1, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q29_Part_2, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q29_Part_3, Length: 3454, dtype: int64

### Which of the following relational database products do you use on a regular basis? 

In [22]:
# Which of the following relational database products do you use on a regular basis? 
data['Q34_Part_1'].fillna(0,inplace=True)
data['Q34_Part_1'].replace('MySQL',1)
data['Q34_Part_1'] = data['Q34_Part_1'].astype('category')

data['Q34_Part_2'].fillna(0,inplace=True)
data['Q34_Part_2'].replace('PostgresSQL',1)
data['Q34_Part_2'] = data['Q34_Part_2'].astype('category')

data['Q34_Part_3'].fillna(0,inplace=True)
data['Q34_Part_3'].replace('SQLite',1)
data['Q34_Part_3'] = data['Q34_Part_3'].astype('category')

data['Q34_Part_4'].fillna(0,inplace=True)
data['Q34_Part_4'].replace('Microsoft SQL Server',1)
data['Q34_Part_4'] = data['Q34_Part_4'].astype('category')

data['Q34_Part_5'].fillna(0,inplace=True)
data['Q34_Part_5'].replace('Oracle Database',1)
data['Q34_Part_5'] = data['Q34_Part_5'].astype('category')

data['Q34_Part_6'].fillna(0,inplace=True)
data['Q34_Part_6'].replace('Microsoft Access',1)
data['Q34_Part_6'] = data['Q34_Part_6'].astype('category')

data['Q34_Part_7'].fillna(0,inplace=True)
data['Q34_Part_7'].replace('AWS Relational Database Service',1)
data['Q34_Part_7'] = data['Q34_Part_7'].astype('category')

data['Q34_Part_8'].fillna(0,inplace=True)
data['Q34_Part_8'].replace('AWS DynamoDB',1)
data['Q34_Part_8'] = data['Q34_Part_8'].astype('category')

data['Q34_Part_9'].fillna(0,inplace=True)
data['Q34_Part_9'].replace('Azure SQL Database',1)
data['Q34_Part_9'] = data['Q34_Part_9'].astype('category')

data['Q34_Part_10'].fillna(0,inplace=True)
data['Q34_Part_10'].replace('Google Cloud SQL',1)
data['Q34_Part_10'] = data['Q34_Part_10'].astype('category')

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_1, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_2, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_3, Length: 3454, dtype: int64

1       0
2       0
3       0
4       1
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_4, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_5, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_6, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_7, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_8, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_9, Length: 3454, dtype: int64

1       0
2       0
3       0
4       0
5       0
       ..
3529    0
3530    0
3531    0
3533    0
3534    0
Name: Q34_Part_10, Length: 3454, dtype: int64

## Check value and data type in each columns

In [23]:
for col in data:
    print(pd.unique(data[col]))

[22-24, 30-34, 50-54, 35-39, 25-29, ..., 55-59, 45-49, 40-44, 18-21, 70+]
Length: 11
Categories (11, object): [22-24, 30-34, 50-54, 35-39, ..., 45-49, 40-44, 18-21, 70+]
[Female, Male, Prefer not to say, Prefer to self-describe]
Categories (4, object): [Female, Male, Prefer not to say, Prefer to self-describe]
[United States of America, Canada]
Categories (2, object): [United States of America, Canada]
[Bachelors degree, Masters degree, Doctoral degree, I prefer not to answer, Professional degree, Some college/university study without earning ..., No formal education past high school]
Categories (7, object): [Bachelors degree, Masters degree, Doctoral degree, I prefer not to answer, Professional degree, Some college/university study without earning ..., No formal education past high school]
['Data Scientist' 'Student' 'Product/Project Manager' 'Data Analyst'
 'Other' 'Research Scientist' 'Data Engineer' 'Business Analyst'
 'Software Engineer' 'Statistician' 'Not employed' 'DBA/Database

In [25]:
data.shape

(3454, 101)

In [28]:
data

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q8,Q9_Part_1,Q9_Part_2,Q9_Part_3,...,Q34_Part_1,Q34_Part_2,Q34_Part_3,Q34_Part_4,Q34_Part_5,Q34_Part_6,Q34_Part_7,Q34_Part_8,Q34_Part_9,Q34_Part_10
1,22-24,Female,United States of America,Bachelors degree,Data Scientist,"> 10,000 employees",3,True,False,True,...,0,0,0,0,0,0,0,0,0,0
2,22-24,Male,United States of America,Bachelors degree,Student,Not employed,0,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3,30-34,Male,United States of America,Masters degree,Product/Project Manager,"> 10,000 employees",0,False,False,False,...,0,0,0,0,0,0,0,0,0,0
4,50-54,Female,United States of America,Masters degree,Data Analyst,50-249 employees,2,True,True,False,...,0,0,0,Microsoft SQL Server,0,0,0,0,0,0
5,35-39,Male,United States of America,Bachelors degree,Other,250-999 employees,4,True,False,False,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3529,25-29,Male,United States of America,Masters degree,Student,Not employed,0,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3530,18-21,Male,United States of America,Masters degree,Student,Not employed,0,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3531,25-29,Female,United States of America,Masters degree,Student,Not employed,0,False,False,False,...,0,0,0,0,0,0,0,0,0,0
3533,22-24,Female,Canada,Masters degree,Research Scientist,0-49 employees,1,False,False,False,...,0,0,0,0,0,0,0,0,0,0


In [29]:
data.shape

(3454, 101)

## Save as new csv file

In [33]:
data.to_csv('dataset_clean.csv')

In [32]:
%pwd

'/Users/cheng-chunchao/Desktop/data-x'

# Questions remained after cleaning
### Q1
### Q2
### Q3
### Q4
### Q5
### Q6
### Q8
### Q9_Part_1
### Q9_Part_2
### Q9_Part_3
### Q9_Part_4
### Q9_Part_5
### Q9_Part_6
### Q9_Part_7
### Q10
### Q11
### Q13_Part_1
### Q13_Part_2
### Q13_Part_3
### Q13_Part_4
### Q13_Part_5
### Q13_Part_6
### Q13_Part_7
### Q13_Part_8
### Q13_Part_9
### Q13_Part_10
### Q14
### Q15
### Q16_Part_1
### Q16_Part_2
### Q16_Part_3
### Q16_Part_4
### Q16_Part_5
### Q16_Part_6
### Q16_Part_7
### Q16_Part_8
### Q16_Part_9
### Q16_Part_10
### Q18_Part_1
### Q18_Part_2
### Q18_Part_3
### Q18_Part_4
### Q18_Part_5
### Q18_Part_6
### Q18_Part_7
### Q18_Part_8
### Q18_Part_9
### Q18_Part_10
### Q20_Part_1
### Q20_Part_2
### Q20_Part_3
### Q20_Part_4
### Q20_Part_5
### Q20_Part_6
### Q20_Part_7
### Q20_Part_8
### Q20_Part_9
### Q20_Part_10
### Q23
### Q24_Part_1
### Q24_Part_2
### Q24_Part_3
### Q24_Part_4
### Q24_Part_5
### Q24_Part_6
### Q24_Part_7
### Q24_Part_8
### Q24_Part_9
### Q24_Part_10
### Q26_Part_1
### Q26_Part_2
### Q26_Part_3
### Q26_Part_4
### Q26_Part_5
### Q27_Part_1
### Q27_Part_2
### Q27_Part_3
### Q27_Part_4
### Q28_Part_1
### Q28_Part_2
### Q28_Part_3
### Q28_Part_4
### Q28_Part_5
### Q28_Part_6
### Q28_Part_7
### Q28_Part_8
### Q28_Part_9
### Q28_Part_10
### Q29_Part_1
### Q29_Part_2
### Q29_Part_3
### Q34_Part_1
### Q34_Part_2
### Q34_Part_3
### Q34_Part_4
### Q34_Part_5
### Q34_Part_6
### Q34_Part_7
### Q34_Part_8
### Q34_Part_9
### Q34_Part_10