Let's import the modules and data we need.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
from collections import defaultdict
#import util_functions as uf
from util_functions import *
%matplotlib inline

# Read the two 2019 survey CSVs: the schema file and the public results file.
schema = pd.read_csv('./2019/survey_results_schema.csv')
df = pd.read_csv('./2019/survey_results_public.csv')

# Display (rows, columns) of the results frame
df.shape

(88883, 85)

Let's see if we can predict whether someone describes their occupation as a Data Scientist/Machine Learning Specialist, based on some of the columns we previously dug into during our research of the data.  These may or may not be the best columns to select; however, for simplicity and ease of training our model, we will use them going forward.


In [2]:
# Columns carried forward into the model. DevType stays in the frame for now
# because the response vector is derived from it in the next step.
cols = [
    'EdLevel',
    'UndergradMajor',
    'Age',
    'Hobbyist',
    'DevType',
    'WorkWeekHrs',
    'WorkRemote',
    'BetterLife',
]

# Keep every row, but only the selected columns
df = df.loc[:, cols]
df.shape

(88883, 8)

We need to do some data manipulation in order to utilize the dataframe in a machine learning algorithm:

X - A matrix holding all of the variables we want to consider when predicting the response
y - the corresponding response vector


1. Drop all the rows with no DevType value
2. For each numeric variable, fill the missing values with the column mean
3. Create y as a 0/1 indicator: 1 when DevType contains 'Data scientist or machine learning specialist', otherwise 0
4. Create X as all the columns that are not the DevType column
5. Create dummy columns for all the categorical variables, drop the original columns

In [3]:

# Drop rows with missing DevType values.
# .copy() makes `df` an independent frame, so the column assignments below
# modify it directly instead of writing into a slice of the previous frame.
df = df[df.DevType.notnull()].copy()

# Fill numeric columns with the mean.
# Assigning the result back (rather than fillna(..., inplace=True) on df[col])
# avoids chained in-place modification, which pandas may silently ignore.
num_vars = df.select_dtypes(include=['float', 'int']).columns
for col in num_vars:
    df[col] = df[col].fillna(df[col].mean())

# Response vector: 1 for any row whose DevType string contains
# 'Data scientist or machine learning specialist', 0 for all others.
# regex=False makes this a literal substring test; the result is a new Series,
# so the DevType column in `df` is left untouched.
y = (
    df['DevType']
    .str.contains('Data scientist or machine learning specialist', regex=False)
    .astype('int32')
)

# Lets get our X matrix by dropping the DevType column
X = df.drop(['DevType'], axis=1)

# Dummy the categorical variables: each object column is replaced by
# '<column>_<value>' indicator columns (first level dropped), appended after
# the remaining columns in the order the categorical columns appear.
cat_vars = X.select_dtypes(include=['object']).copy().columns
X = pd.get_dummies(X, columns=list(cat_vars), prefix_sep='_', drop_first=True)

# Make sure the number of rows that are related to a DevType of 'Data scientist or machine learning specialist' are 6460
y.sum()

6460

In [4]:
# Display (rows, columns) of the feature matrix after dummy encoding
X.shape

(81335, 29)

In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Print the shape of each of the four pieces as a quick sanity check
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(56934, 29) (24401, 29) (56934,) (24401,)


In [6]:
# Fit a shallow decision tree classifier on the training split.
# max_depth is the hyperparameter to tune here; random_state fixes the tree's
# internal randomness so re-running the cell reproduces the same scores
# (42 matches the seed used for train_test_split).
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)

# Predicted class for each held-out respondent:
# 1 = data scientist / machine learning specialist, 0 = everyone else
y_test_preds = dt_model.predict(X_test)

# Accuracy on the held-out split.
# Caveat: only 6460 of the 81335 rows are positive (about 8%), so a model that
# always predicts 0 would already be right roughly 92% of the time.
print(dt_model.score(X_test, y_test))

# With 0/1 labels the mean squared error is simply the fraction of
# misclassified rows, i.e. 1 - accuracy.
print(mean_squared_error(y_test, y_test_preds))

0.920986844801
0.0790131551986


In [None]:
# Boost 100 depth-2 decision trees with AdaBoost (AdaBoostClassifier is already
# imported in the first cell, so it is not re-imported here).
# The base tree is passed positionally on purpose: the keyword for this
# argument was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases, and the positional form works with both.
# random_state makes the fitted ensemble reproducible across runs.
ada_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    random_state=42,
)
ada_model.fit(X_train, y_train)

# Predicted class for each held-out respondent:
# 1 = data scientist / machine learning specialist, 0 = everyone else
y_test_preds = ada_model.predict(X_test)

# Accuracy on the held-out split. The target is heavily imbalanced, so compare
# this against the share of 0 labels rather than against 0.5.
print(ada_model.score(X_test, y_test))

# With 0/1 labels the mean squared error is simply the fraction of
# misclassified rows, i.e. 1 - accuracy.
print(mean_squared_error(y_test, y_test_preds))

0.921970411049
0.0780295889513


In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default hyperparameters,
# fitted on the training split (fit() returns the estimator itself).
svc_model = SVC().fit(X_train, y_train)

# Predicted class (0 or 1) for every row of the held-out split
y_test_preds = svc_model.predict(X_test)

# First line: accuracy on the held-out split.
# Second line: mean squared error of the 0/1 predictions, which for binary
# labels is the fraction of rows that were misclassified.
print(svc_model.score(X_test, y_test))
print(mean_squared_error(y_test, y_test_preds))