In [1]:
# import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# prepare defaults
base_color = sns.color_palette()[0]

In [3]:
# Common functions
def print_cols(df):
    for i in range(len(df.columns)):
        print(i, ': ', df.columns[i])
        
def get_tech_dummies(x):
    x = x.notna()
    x = x.apply(lambda y: 1 if y == True else 0)
    return x

### read and process 2011 data

In [4]:
cols = [2, 3] + list(np.arange(30,43)) +  [44, 45]
df_2011 = pd.read_csv('2011_results.csv', encoding='latin_1', usecols=cols)
df_2011.head()

Unnamed: 0,How old are you?,How many years of IT/Programming experience do you have?,Which languages are you proficient in?,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Please rate your job/career satisfaction,"Including bonus, what is your annual compensation in USD?"
0,Response,Response,Java,JavaScript,CSS,PHP,Python,Ruby,SQL,C#,C++,C,Perl,,other (please specify),Response,Response
1,< 20,<2,,JavaScript,,,,,,,,,,,,FML,Student / Unemployed
2,25-29,41310,Java,,,,,,SQL,,,C,,,,So happy it hurts,
3,25-29,41435,Java,JavaScript,,,,,SQL,,,,,,,,
4,< 20,41310,Java,,,,,,,,,,,,Haskell,I enjoy going to work,Student / Unemployed


In [5]:
# Fix column names
row1_col_names = df_2011.iloc[0]
code_cols = list(row1_col_names[2:-3])
df_2011.columns = ['Age', 'Experience'] + code_cols + ['Other', 'Satisfaction', 'Compensation']
df_2011.head()

Unnamed: 0,Age,Experience,Java,JavaScript,CSS,PHP,Python,Ruby,SQL,C#,C++,C,Perl,None,Other,Satisfaction,Compensation
0,Response,Response,Java,JavaScript,CSS,PHP,Python,Ruby,SQL,C#,C++,C,Perl,,other (please specify),Response,Response
1,< 20,<2,,JavaScript,,,,,,,,,,,,FML,Student / Unemployed
2,25-29,41310,Java,,,,,,SQL,,,C,,,,So happy it hurts,
3,25-29,41435,Java,JavaScript,,,,,SQL,,,,,,,,
4,< 20,41310,Java,,,,,,,,,,,,Haskell,I enjoy going to work,Student / Unemployed


In [6]:
# cleanup sub header
df_2011 = df_2011.drop(index=0)
df_2011.head()

Unnamed: 0,Age,Experience,Java,JavaScript,CSS,PHP,Python,Ruby,SQL,C#,C++,C,Perl,None,Other,Satisfaction,Compensation
1,< 20,<2,,JavaScript,,,,,,,,,,,,FML,Student / Unemployed
2,25-29,41310,Java,,,,,,SQL,,,C,,,,So happy it hurts,
3,25-29,41435,Java,JavaScript,,,,,SQL,,,,,,,,
4,< 20,41310,Java,,,,,,,,,,,,Haskell,I enjoy going to work,Student / Unemployed
5,35-39,11,Java,JavaScript,CSS,PHP,,,SQL,,C++,C,Perl,,,It pays the bills,"$80,000 - $100,000"


In [7]:
# make language columns as boolean
df_2011 = df_2011.apply(lambda x: get_tech_dummies(x) if x.name in code_cols else x)
df_2011.head()

Unnamed: 0,Age,Experience,Java,JavaScript,CSS,PHP,Python,Ruby,SQL,C#,C++,C,Perl,None,Other,Satisfaction,Compensation
1,< 20,<2,0,1,0,0,0,0,0,0,0,0,0,0,,FML,Student / Unemployed
2,25-29,41310,1,0,0,0,0,0,1,0,0,1,0,0,,So happy it hurts,
3,25-29,41435,1,1,0,0,0,0,1,0,0,0,0,0,,,
4,< 20,41310,1,0,0,0,0,0,0,0,0,0,0,0,Haskell,I enjoy going to work,Student / Unemployed
5,35-39,11,1,1,1,1,0,0,1,0,1,1,1,0,,It pays the bills,"$80,000 - $100,000"


In [8]:
df_2011.describe()

Unnamed: 0,Java,JavaScript,CSS,PHP,Python,Ruby,SQL,C#,C++,C,Perl,None
count,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0,2813.0
mean,0.306079,0.504444,0.482048,0.280128,0.204408,0.101315,0.573409,0.480626,0.263064,0.26342,0.10096,0.00711
std,0.460945,0.500069,0.499766,0.449141,0.40334,0.301799,0.49467,0.499713,0.440375,0.440566,0.301329,0.084035
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### read and process 2012 data

In [9]:
cols = [2, 3] + list(np.arange(22,37)) + [38, 39]
df_2012 = pd.read_csv('2012_results.csv', encoding='latin_1', usecols=cols)
df_2012.head()

Unnamed: 0,How old are you?,How many years of IT/Programming experience do you have?,Which languages are you proficient in?,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,What best describes your career / job satisfaction?,"Including bonus, what is your annual compensation in USD?"
0,Response,Response,Java,JavaScript,CSS,PHP,Python,Objective-C,Ruby,SQL,C#,C++,C,Perl,HTML5,,Other (please specify),Response,Response
1,20-24,<2,Java,JavaScript,,,,,,SQL,,C++,C,,,,,Love my job,"<$20,000"
2,25-29,<2,,JavaScript,CSS,PHP,,Objective-C,,,,C++,,,HTML5,,,I enjoy going to work,"$20,000 - $40,000"
3,20-24,41070,,,CSS,PHP,,Objective-C,,SQL,,,,,HTML5,,,I enjoy going to work,"$20,000 - $40,000"
4,20-24,40944,Java,,,,,,,,,C++,,,,,,I'm not happy in my job,"$20,000 - $40,000"


In [10]:
# Fix column names
row1_col_names = df_2012.iloc[0]
code_cols = list(row1_col_names[2:-3])
df_2012.columns = ['Age', 'Experience'] + code_cols + ['Other', 'Satisfaction', 'Compensation']
df_2012.head()

Unnamed: 0,Age,Experience,Java,JavaScript,CSS,PHP,Python,Objective-C,Ruby,SQL,C#,C++,C,Perl,HTML5,None,Other,Satisfaction,Compensation
0,Response,Response,Java,JavaScript,CSS,PHP,Python,Objective-C,Ruby,SQL,C#,C++,C,Perl,HTML5,,Other (please specify),Response,Response
1,20-24,<2,Java,JavaScript,,,,,,SQL,,C++,C,,,,,Love my job,"<$20,000"
2,25-29,<2,,JavaScript,CSS,PHP,,Objective-C,,,,C++,,,HTML5,,,I enjoy going to work,"$20,000 - $40,000"
3,20-24,41070,,,CSS,PHP,,Objective-C,,SQL,,,,,HTML5,,,I enjoy going to work,"$20,000 - $40,000"
4,20-24,40944,Java,,,,,,,,,C++,,,,,,I'm not happy in my job,"$20,000 - $40,000"


In [11]:
# cleanup sub header
df_2012 = df_2012.drop(index=0)
df_2012.head()

Unnamed: 0,Age,Experience,Java,JavaScript,CSS,PHP,Python,Objective-C,Ruby,SQL,C#,C++,C,Perl,HTML5,None,Other,Satisfaction,Compensation
1,20-24,<2,Java,JavaScript,,,,,,SQL,,C++,C,,,,,Love my job,"<$20,000"
2,25-29,<2,,JavaScript,CSS,PHP,,Objective-C,,,,C++,,,HTML5,,,I enjoy going to work,"$20,000 - $40,000"
3,20-24,41070,,,CSS,PHP,,Objective-C,,SQL,,,,,HTML5,,,I enjoy going to work,"$20,000 - $40,000"
4,20-24,40944,Java,,,,,,,,,C++,,,,,,I'm not happy in my job,"$20,000 - $40,000"
5,< 20,40944,Java,,,,,,,,,,,,,,,I wish I had a job!,Student / Unemployed


In [12]:
# make language columns as boolean
df_2012 = df_2012.apply(lambda x: get_tech_dummies(x) if x.name in code_cols else x) 
df_2012.head()

Unnamed: 0,Age,Experience,Java,JavaScript,CSS,PHP,Python,Objective-C,Ruby,SQL,C#,C++,C,Perl,HTML5,None,Other,Satisfaction,Compensation
1,20-24,<2,1,1,0,0,0,0,0,1,0,1,1,0,0,0,,Love my job,"<$20,000"
2,25-29,<2,0,1,1,1,0,1,0,0,0,1,0,0,1,0,,I enjoy going to work,"$20,000 - $40,000"
3,20-24,41070,0,0,1,1,0,1,0,1,0,0,0,0,1,0,,I enjoy going to work,"$20,000 - $40,000"
4,20-24,40944,1,0,0,0,0,0,0,0,0,1,0,0,0,0,,I'm not happy in my job,"$20,000 - $40,000"
5,< 20,40944,1,0,0,0,0,0,0,0,0,0,0,0,0,0,,I wish I had a job!,Student / Unemployed


In [13]:
df_2012.describe()

Unnamed: 0,Java,JavaScript,CSS,PHP,Python,Objective-C,Ruby,SQL,C#,C++,C,Perl,HTML5,None
count,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0,6243.0
mean,0.376101,0.500881,0.452347,0.308345,0.193817,0.096909,0.092424,0.527631,0.395323,0.244434,0.237706,0.073843,0.345026,0.000801
std,0.484445,0.500039,0.497764,0.461847,0.395319,0.295857,0.289646,0.499276,0.488959,0.429785,0.425712,0.261536,0.475415,0.028291
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### read and process 2013 data

In [14]:
cols = [2, 3] + list(np.arange(56,70)) + [99, 100] # excited about = list(np.arange(70,81))
df_2013 = pd.read_csv('2013_results.csv', encoding='latin_1', usecols=cols)
df_2013.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,How old are you?,How many years of IT/Programming experience do you have?,Which of the following languages or technologies have you used significantly in the past year?,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,What best describes your career / job satisfaction?,"Including bonus, what is your annual compensation in USD?"
0,Response,Response,C,C++,C#,Java,JavaScript,jQuery,JQuery,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other (please specify),Response,Response
1,35-39,6/10/2013,,,,Java,,,,,,,,,SQL,,It's a paycheck,"$80,000 - $100,000"
2,25-29,6/10/2013,,,C#,,JavaScript,jQuery,,,,PHP,,,,MySql / VbScript,It's a paycheck,"$20,000 - $40,000"
3,51-60,11,,,C#,,JavaScript,jQuery,,,,,,,SQL,"PL/SQL, XSLT, XQuery",I'm not happy in my job,"$120,000 - $140,000"
4,,,,,,,,,,,,,,,,,,


In [15]:
# Fix column names
row1_col_names = df_2013.iloc[0]
code_cols = list(row1_col_names[2:-3])
df_2013.columns = ['Age', 'Experience'] + code_cols + ['Other', 'Satisfaction', 'Compensation']
df_2013.head()

Unnamed: 0,Age,Experience,C,C++,C#,Java,JavaScript,jQuery,JQuery,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other,Satisfaction,Compensation
0,Response,Response,C,C++,C#,Java,JavaScript,jQuery,JQuery,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other (please specify),Response,Response
1,35-39,6/10/2013,,,,Java,,,,,,,,,SQL,,It's a paycheck,"$80,000 - $100,000"
2,25-29,6/10/2013,,,C#,,JavaScript,jQuery,,,,PHP,,,,MySql / VbScript,It's a paycheck,"$20,000 - $40,000"
3,51-60,11,,,C#,,JavaScript,jQuery,,,,,,,SQL,"PL/SQL, XSLT, XQuery",I'm not happy in my job,"$120,000 - $140,000"
4,,,,,,,,,,,,,,,,,,


In [16]:
# merge use_JQuery and use_jQuery columns
df_2013['JQuery'] = df_2013['JQuery'].where(df_2013['JQuery'].notna(), df_2013['jQuery'])
df_2013 = df_2013.drop(columns=['jQuery'])
df_2013.head()

Unnamed: 0,Age,Experience,C,C++,C#,Java,JavaScript,JQuery,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other,Satisfaction,Compensation
0,Response,Response,C,C++,C#,Java,JavaScript,JQuery,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other (please specify),Response,Response
1,35-39,6/10/2013,,,,Java,,,,,,,,SQL,,It's a paycheck,"$80,000 - $100,000"
2,25-29,6/10/2013,,,C#,,JavaScript,jQuery,,,PHP,,,,MySql / VbScript,It's a paycheck,"$20,000 - $40,000"
3,51-60,11,,,C#,,JavaScript,jQuery,,,,,,SQL,"PL/SQL, XSLT, XQuery",I'm not happy in my job,"$120,000 - $140,000"
4,,,,,,,,,,,,,,,,,


In [17]:
# make language columns as boolean
df_2013 = df_2013.apply(lambda x: get_tech_dummies(x) if x.name in code_cols else x) 
df_2013.head()

Unnamed: 0,Age,Experience,C,C++,C#,Java,JavaScript,JQuery,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other,Satisfaction,Compensation
0,Response,Response,1,1,1,1,1,1,1,1,1,1,1,1,Other (please specify),Response,Response
1,35-39,6/10/2013,0,0,0,1,0,0,0,0,0,0,0,1,,It's a paycheck,"$80,000 - $100,000"
2,25-29,6/10/2013,0,0,1,0,1,1,0,0,1,0,0,0,MySql / VbScript,It's a paycheck,"$20,000 - $40,000"
3,51-60,11,0,0,1,0,1,1,0,0,0,0,0,1,"PL/SQL, XSLT, XQuery",I'm not happy in my job,"$120,000 - $140,000"
4,,,0,0,0,0,0,0,0,0,0,0,0,0,,,


In [18]:
df_2013.describe()

Unnamed: 0,C,C++,C#,Java,JavaScript,JQuery,Node.js,Objective-C,PHP,Python,Ruby,SQL
count,9743.0,9743.0,9743.0,9743.0,9743.0,9743.0,9743.0,9743.0,9743.0,9743.0,9743.0,9743.0
mean,0.147491,0.174176,0.310787,0.310069,0.486093,0.425947,0.061583,0.098122,0.238633,0.193062,0.081597,0.471518
std,0.354612,0.379281,0.46284,0.462545,0.499832,0.494511,0.240408,0.297494,0.42627,0.394721,0.273764,0.499214
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Read and process 2014 data

In [19]:
cols=[3,4,5,7] + list(np.arange(42,54))
df_2014 = pd.read_csv('2014_results.csv', usecols=cols)
df_2014.head()

Unnamed: 0,How old are you?,What is your gender?,How many years of IT/Programming experience do you have?,"Including bonus, what is your annual compensation in USD?",Which of the following languages or technologies have you used significantly in the past year?,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53
0,Response,Response,Response,Response,C,C++,C#,Java,JavaScript,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other (please specify)
1,30-34,Female,6/10/2014,"$20,000 - $40,000",,,,Java,JavaScript,,,PHP,Python,,,
2,20-24,Male,<2,Student / Unemployed,,,,,,,,PHP,,,,
3,25-29,Male,6/10/2014,"<$20,000",,,C#,,JavaScript,,,,,,SQL,
4,< 20,Male,<2,Student / Unemployed,,C++,,,,,,,,,,


In [20]:
# Fix column names
row1_col_names = df_2014.iloc[0]
code_cols = list(row1_col_names[4:-1])
df_2014.columns = ['Age', 'Gender', 'Experience', 'Compensation'] + code_cols + ['Other']
df_2014.head()

Unnamed: 0,Age,Gender,Experience,Compensation,C,C++,C#,Java,JavaScript,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other
0,Response,Response,Response,Response,C,C++,C#,Java,JavaScript,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other (please specify)
1,30-34,Female,6/10/2014,"$20,000 - $40,000",,,,Java,JavaScript,,,PHP,Python,,,
2,20-24,Male,<2,Student / Unemployed,,,,,,,,PHP,,,,
3,25-29,Male,6/10/2014,"<$20,000",,,C#,,JavaScript,,,,,,SQL,
4,< 20,Male,<2,Student / Unemployed,,C++,,,,,,,,,,


In [21]:
# make language columns as boolean
df_2014 = df_2014.apply(lambda x: get_tech_dummies(x) if x.name in code_cols else x) 
df_2014.head()

Unnamed: 0,Age,Gender,Experience,Compensation,C,C++,C#,Java,JavaScript,Node.js,Objective-C,PHP,Python,Ruby,SQL,Other
0,Response,Response,Response,Response,1,1,1,1,1,1,1,1,1,1,1,Other (please specify)
1,30-34,Female,6/10/2014,"$20,000 - $40,000",0,0,0,1,1,0,0,1,1,0,0,
2,20-24,Male,<2,Student / Unemployed,0,0,0,0,0,0,0,1,0,0,0,
3,25-29,Male,6/10/2014,"<$20,000",0,0,1,0,1,0,0,0,0,0,1,
4,< 20,Male,<2,Student / Unemployed,0,1,0,0,0,0,0,0,0,0,0,


In [22]:
df_2014.describe()

Unnamed: 0,C,C++,C#,Java,JavaScript,Node.js,Objective-C,PHP,Python,Ruby,SQL
count,7644.0,7644.0,7644.0,7644.0,7644.0,7644.0,7644.0,7644.0,7644.0,7644.0,7644.0
mean,0.153715,0.185636,0.311224,0.324437,0.495029,0.083987,0.088435,0.223836,0.192831,0.077708,0.453689
std,0.360699,0.388838,0.463025,0.468195,0.500008,0.277387,0.283946,0.416841,0.394547,0.267729,0.497883
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Read and process 2015 data

In [23]:
cols = [1,2,4] + list(np.arange(8,51)) + [105,109]
df_2015 = pd.read_csv('2015_results.csv', skiprows=1, usecols=cols)
df_2015.head()

Unnamed: 0,Age,Gender,Years IT / Programming Experience,Current Lang & Tech: Android,Current Lang & Tech: Arduino,Current Lang & Tech: AngularJS,Current Lang & Tech: C,Current Lang & Tech: C++,Current Lang & Tech: C++11,Current Lang & Tech: C#,...,Current Lang & Tech: Spark,Current Lang & Tech: SQL,Current Lang & Tech: SQL Server,Current Lang & Tech: Swift,Current Lang & Tech: Visual Basic,Current Lang & Tech: Windows Phone,Current Lang & Tech: Wordpress,Current Lang & Tech: Write-In,Compensation,Job Satisfaction
0,25-29,Male,2 - 5 years,,,,,,,,...,,,,,,,,,"$20,000 - $40,000",I'm somewhat satisfied with my job
1,20-24,Male,1 - 2 years,,,,,,,C#,...,,,,,,,,,"$20,000 - $40,000",I'm neither satisfied nor dissatisfied with my...
2,20-24,Male,1 - 2 years,,,,,,,C#,...,,,,,,,,,"$80,000 - $100,000",I'm somewhat satisfied with my job
3,25-29,Male,6 - 10 years,,,,,,,,...,,SQL,,,,,,lua,,I'm somewhat satisfied with my job
4,30-34,Male,2 - 5 years,,,,,,,C#,...,,,,,,,,,"$60,000 - $80,000",I love my job


In [24]:
# Fix column names
df_2015 = df_2015.rename(mapper = lambda x: x.split(':')[1].strip() if 'Current Lang & Tech' in x else x, axis=1)
code_cols = list(df_2015.columns[3:-2])
df_2015.columns = ['Age', 'Gender', 'Experience'] + code_cols + ['Compensation', 'Satisfaction']
df_2015.head()

Unnamed: 0,Age,Gender,Experience,Android,Arduino,AngularJS,C,C++,C++11,C#,...,Spark,SQL,SQL Server,Swift,Visual Basic,Windows Phone,Wordpress,Write-In,Compensation,Satisfaction
0,25-29,Male,2 - 5 years,,,,,,,,...,,,,,,,,,"$20,000 - $40,000",I'm somewhat satisfied with my job
1,20-24,Male,1 - 2 years,,,,,,,C#,...,,,,,,,,,"$20,000 - $40,000",I'm neither satisfied nor dissatisfied with my...
2,20-24,Male,1 - 2 years,,,,,,,C#,...,,,,,,,,,"$80,000 - $100,000",I'm somewhat satisfied with my job
3,25-29,Male,6 - 10 years,,,,,,,,...,,SQL,,,,,,lua,,I'm somewhat satisfied with my job
4,30-34,Male,2 - 5 years,,,,,,,C#,...,,,,,,,,,"$60,000 - $80,000",I love my job


In [25]:
# make language columns as boolean
df_2015 = df_2015.apply(lambda x: get_tech_dummies(x) if x.name in code_cols else x) 
df_2015.head()

Unnamed: 0,Age,Gender,Experience,Android,Arduino,AngularJS,C,C++,C++11,C#,...,Spark,SQL,SQL Server,Swift,Visual Basic,Windows Phone,Wordpress,Write-In,Compensation,Satisfaction
0,25-29,Male,2 - 5 years,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"$20,000 - $40,000",I'm somewhat satisfied with my job
1,20-24,Male,1 - 2 years,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,"$20,000 - $40,000",I'm neither satisfied nor dissatisfied with my...
2,20-24,Male,1 - 2 years,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,"$80,000 - $100,000",I'm somewhat satisfied with my job
3,25-29,Male,6 - 10 years,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,,I'm somewhat satisfied with my job
4,30-34,Male,2 - 5 years,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,"$60,000 - $80,000",I love my job


In [26]:
df_2015.describe()

Unnamed: 0,Android,Arduino,AngularJS,C,C++,C++11,C#,Cassandra,CoffeeScript,Cordova,...,Scala,Sharepoint,Spark,SQL,SQL Server,Swift,Visual Basic,Windows Phone,Wordpress,Write-In
count,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,...,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0,26086.0
mean,0.157556,0.062332,0.111669,0.138465,0.173618,0.070958,0.266388,0.007744,0.030016,0.024074,...,0.020624,0.013379,0.003987,0.361842,0.158284,0.029096,0.065207,0.021851,0.076938,0.082343
std,0.364331,0.241763,0.314965,0.345394,0.378788,0.256759,0.442078,0.087658,0.170635,0.153283,...,0.142125,0.114893,0.063016,0.480543,0.365014,0.168079,0.246896,0.146199,0.266498,0.274892
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Read and process 2016 data

In [27]:
cols = [5,7,11,13,26,16]
df_2016 = pd.read_csv('2016_results.csv', usecols=cols)
df_2016.head()

Unnamed: 0,age_range,gender,experience_range,salary_range,tech_do,job_satisfaction
0,20-24,Male,,,,
1,30-34,Male,6 - 10 years,"$40,000 - $50,000",iOS; Objective-C,I love my job
2,,,,,,
3,,Female,11+ years,"Less than $10,000",,I love my job
4,> 60,Prefer not to disclose,,,,


In [28]:
# rename columns
df_2016.columns = ['Age', 'Gender', 'Experience', 'Compensation', 'Tech', 'Satisfaction']
df_2016.head()

Unnamed: 0,Age,Gender,Experience,Compensation,Tech,Satisfaction
0,20-24,Male,,,,
1,30-34,Male,6 - 10 years,"$40,000 - $50,000",iOS; Objective-C,I love my job
2,,,,,,
3,,Female,11+ years,"Less than $10,000",,I love my job
4,> 60,Prefer not to disclose,,,,


In [29]:
df_2016 = pd.concat([df_2016, df_2016.Tech.str.get_dummies(sep='; ')], axis=1)
df_2016 = df_2016.drop(columns=['Tech'])
df_2016.head()

Unnamed: 0,Age,Gender,Experience,Compensation,Satisfaction,Android,AngularJS,Arduino / Raspberry Pi,C,C#,...,SQL Server,Salesforce,Scala,SharePoint,Spark,Swift,Visual Basic,Windows Phone,WordPress,iOS
0,20-24,Male,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30-34,Male,6 - 10 years,"$40,000 - $50,000",I love my job,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,,,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,Female,11+ years,"Less than $10,000",I love my job,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,> 60,Prefer not to disclose,,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df_2016.describe()

Unnamed: 0,Android,AngularJS,Arduino / Raspberry Pi,C,C#,C++,Cassandra,Clojure,"Cloud (AWS, GAE, Azure, etc.)",CoffeeScript,...,SQL Server,Salesforce,Scala,SharePoint,Spark,Swift,Visual Basic,Windows Phone,WordPress,iOS
count,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,...,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0
mean,0.153507,0.157469,0.067767,0.137034,0.272765,0.17114,0.011833,0.009923,0.082616,0.029663,...,0.16609,0.011869,0.028592,0.015224,0.012618,0.049009,0.059111,0.020043,0.080243,0.080278
std,0.360479,0.364246,0.251348,0.343886,0.445385,0.376635,0.108135,0.099121,0.275304,0.169656,...,0.372164,0.108296,0.166658,0.122444,0.111621,0.21589,0.235835,0.140148,0.271671,0.271726
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Read and process 2017 data
