In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

### Logistic regression to classify salaries about a threshold
In our linear regression analysis, we find that there is a weak relationship between average salary and our predictor X variables. Thus, to reduce modeling difficulty, we attempt to classify salaries based on a certain threshold. From the EDA, we find that the median average salary is 97,000 USD, so we choose 100,000 as a reasonable classification threshold.

### Process data
We need to create dummy variables and delete unwanted columns. We also normalize the quantitative variables.

In [2]:
# read data
df = pd.read_csv('data/data_clean.csv')
df.head()

Unnamed: 0,avg_salary_k,rating,size,job_location,age,python,visual_software,ML_software,spark,aws,excel,sql,sas,hadoop,degree,in_CA,senior_status
0,72.0,3.8,501 - 1000,NM,48,1,1,0,0,0,1,0,1,0,MS,False,False
1,87.5,3.4,10000+,MD,37,1,0,0,0,0,0,0,0,0,MS,False,False
2,85.0,4.8,501 - 1000,FL,11,1,0,0,1,0,1,1,1,0,MS,False,False
3,76.5,3.8,1001 - 5000,WA,56,1,0,0,0,0,0,0,0,0,Other,False,False
4,114.5,2.9,51 - 200,NY,23,1,0,0,0,0,1,1,1,0,Other,False,False


In [3]:
# Create dummy variables for average salary
df['above_100k'] = df['avg_salary_k'] >= 100
df['above_100k'] = df['above_100k'].astype('int')
# Create dummy variables for degree
df['degree_other'] = pd.get_dummies(df['degree'])['Other']
df['degree_phd'] = pd.get_dummies(df['degree'])['PHD']
# Create dummy for inCA
df['in_CA'] = df['in_CA'].astype('int')
# Create dummy for senior_status
df['senior_status'] = df['senior_status'].astype('int')

In [4]:
# drop unneeded columns
df = df.drop(columns=['avg_salary_k', 'job_location', 'size', 'degree'])


In [5]:
# check work
df.head()

Unnamed: 0,rating,age,python,visual_software,ML_software,spark,aws,excel,sql,sas,hadoop,in_CA,senior_status,above_100k,degree_other,degree_phd
0,3.8,48,1,1,0,0,0,1,0,1,0,0,0,0,0,0
1,3.4,37,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4.8,11,1,0,0,1,0,1,1,1,0,0,0,0,0,0
3,3.8,56,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,2.9,23,1,0,0,0,0,1,1,1,0,0,0,1,1,0


In [6]:
# normalize quantitative features
df['rating'] = (df['rating'] - df['rating'].mean()) / df['rating'].std()
df['age'] = (df['age'] - df['age'].mean()) / df['age'].std()

In [7]:
# final check
df.head()

Unnamed: 0,rating,age,python,visual_software,ML_software,spark,aws,excel,sql,sas,hadoop,in_CA,senior_status,above_100k,degree_other,degree_phd
0,0.118576,-0.023746,1,1,0,0,0,1,0,1,0,0,0,0,0,0
1,-0.594528,-0.232038,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.901337,-0.724363,1,0,0,1,0,1,1,1,0,0,0,0,0,0
3,0.118576,0.127739,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,-1.485909,-0.497136,1,0,0,0,0,1,1,1,0,0,0,1,1,0


### Fit model on training data
We randomly split the data into training (70%) and testing (30%) data and fit a logistic regression model on the training set.

In [8]:
X = df.loc[:, df.columns != 'above_100k']
y = df['above_100k']

In [9]:
# check work
X.head()

Unnamed: 0,rating,age,python,visual_software,ML_software,spark,aws,excel,sql,sas,hadoop,in_CA,senior_status,degree_other,degree_phd
0,0.118576,-0.023746,1,1,0,0,0,1,0,1,0,0,0,0,0
1,-0.594528,-0.232038,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1.901337,-0.724363,1,0,0,1,0,1,1,1,0,0,0,0,0
3,0.118576,0.127739,1,0,0,0,0,0,0,0,0,0,0,1,0
4,-1.485909,-0.497136,1,0,0,0,0,1,1,1,0,0,0,1,0


In [10]:
y.head()

0    0
1    0
2    0
3    0
4    1
Name: above_100k, dtype: int32

In [11]:
# conduct train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [12]:
# fit logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)


# coefficients
intercept = model.intercept_
coefficients = model.coef_

### Model performance on the training data

In [15]:
train_predict = model.predict(X_train)
tn, fp, fn, tp = confusion_matrix(train_predict, y_train).ravel()
# false postive and false negative rates on training data
fpr = fp / (fp + tn) # false positive rate
fnr = fn / (fn + tp) # false negative rate
(fpr, fnr)

(0.24736842105263157, 0.24778761061946902)

In [16]:
# overall accuracy on training data
# overall accuracy
accuracy = (tp+tn) / (tp+tn+fp+fn)
accuracy

0.7524752475247525

### Model Validation
We validate the model using the test data and compare with performance on the training data to test for overfitting

In [13]:
predictions = model.predict(X_test)
cm = confusion_matrix(y_pred=predictions, y_true=y_test)
cm

array([[47, 13],
       [27, 43]], dtype=int64)

In [15]:
tn, fp, fn, tp = confusion_matrix(predictions, y_test).ravel()
(tn, fp, fn, tp)

(47, 27, 13, 43)

In [16]:
# false postive and false negative rates
fpr = fp / (fp + tn) # false positive rate
fnr = fn / (fn + tp) # false negative rate
(fpr, fnr)

(0.36486486486486486, 0.23214285714285715)

In [17]:
# overall accuracy
accuracy = (tp+tn) / (tp+tn+fp+fn)
accuracy

0.6923076923076923

### Fit final model on all data

In [18]:
model_final = LogisticRegression()
model_final.fit(X, y)

# final accuracy
model_final.score(X, y)


0.7390300230946882