In [138]:
import re

import numpy as np
import pandas as pd
import seaborn as sns

In [145]:
# Load the salary records from a CSV path given relative to the working directory.
salaries = pd.read_csv('data/salaries.csv')

## Let's clean up this dataset gooood


In [155]:
# Remove the columns that the visible remainder of the notebook never reads.
salaries = salaries.drop(
    columns=[
        'Name',
        'EmpID',
        'Date_of_Birth',
        'Join_Date',
        'Net_Pay',
        'Deduction',
        'Deduction_percentage',
        'Designation',
    ]
)

In [158]:
# Give the tenure column a friendlier name. The copy is appended as the last
# column and the original column is then removed.
salaries = salaries.assign(
    **{'Tenure (months)': salaries['Tenure_in_org_in_months']}
).drop(columns='Tenure_in_org_in_months')

In [161]:
# Re-case 'GROSS' to 'Gross': append the copy as the last column, then remove
# the upper-case original.
salaries = salaries.assign(Gross=salaries['GROSS']).drop(columns='GROSS')

In [162]:
# Show the frame after the column clean-up.
# NOTE(review): the saved output below already contains 0/1 Gender values, bucketed
# Department names and a Years_exp column, all of which are produced by cells that
# appear later in the notebook (In [147]-[151]); the cells were executed out of order,
# so a fresh Restart & Run All will display different content here.
salaries

Unnamed: 0,Gender,Age,Department,Years_exp,Tenure (months),Gross
0,0,25,Tech,2.333333,7,74922
1,1,26,Operations,2.666667,6,44375
2,0,24,Operations,2.000000,8,82263
3,0,26,Operations,2.666667,6,44375
4,0,29,Engineering,3.666667,25,235405
...,...,...,...,...,...,...
1797,0,29,Other,3.666667,34,88934
1798,0,27,Engineering,3.000000,33,133224
1799,0,29,Operations,3.666667,15,72547
1800,0,47,Other,9.666667,30,227176


In [147]:
# that's a lot of departments, let's see if we can custom map it
def map_dept(dept):
    """Collapse a free-text department label into one of eight coarse buckets.

    Matching is case-insensitive and keyword based. Rules are tried in a fixed
    order and the first one that matches wins: Media, Sales, Tech, Finance,
    Operations, Engineering, Product. Labels matching no rule become 'Other'.

    Args:
        dept: Raw department label. Non-string values (for example NaN for a
            missing entry) cannot be classified and are mapped to 'Other'.

    Returns:
        One of 'Media', 'Sales', 'Tech', 'Finance', 'Operations',
        'Engineering', 'Product' or 'Other'.
    """
    if not isinstance(dept, str):
        # Missing values reach this function as float NaN, which has no .lower().
        return 'Other'

    name = dept.lower()

    def has_any(*keywords):
        # True when at least one keyword occurs anywhere in the label.
        return any(keyword in name for keyword in keywords)

    # 'it' has to stand alone (not be surrounded by letters); a bare substring
    # test would also fire inside words such as "audit" or "recruitment".
    mentions_it = re.search(r'(?<![a-z])it(?![a-z])', name) is not None

    if has_any('media'):
        return 'Media'
    if has_any('sales'):
        return 'Sales'
    if mentions_it or has_any('tech', 'web', 'app'):
        return 'Tech'
    if has_any('finance'):
        return 'Finance'
    if has_any('operations', 'admin'):
        return 'Operations'
    if has_any('engineer'):
        return 'Engineering'
    if has_any('product', 'platform'):
        return 'Product'
    return 'Other'

In [148]:
# Replace every raw department label with the bucket chosen by map_dept.
salaries['Department'] = salaries['Department'].map(map_dept)

In [150]:
# adding a random feature for more playroom
# The divisor and the offset are each drawn once (scalars) from a seeded generator,
# so the column -- and every number derived from it further down -- is the same on
# every run.
# NOTE(review): because both draws are scalars, Years_exp is an exact linear function
# of Age (perfectly collinear), so the separate Age / Years_exp regression coefficients
# are not individually meaningful. Draw per-row values if an independent feature is wanted.
rng = np.random.default_rng(42)
age_divisor = rng.integers(2, 5)   # 2, 3 or 4 (upper bound exclusive, like randint)
exp_offset = rng.integers(3, 10)   # 3 .. 9
salaries['Years_exp'] = salaries.Age / age_divisor - exp_offset

In [151]:
# Binary-encode gender: 1 where the value is exactly "F", 0 for anything else.
# NOTE: running this cell a second time turns every value into 0, because the
# already-encoded integers never compare equal to "F".
salaries['Gender'] = (salaries['Gender'] == "F").astype('int64')

## Let's also encode the departments (since a linear model won't accept non-numeric features)

In [166]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the department buckets and store the resulting integer
# codes in a new 'Department_code' column (fit and transform in one call).
# NOTE(review): integer codes give a nominal feature an arbitrary ordering when
# fed to a linear model; one-hot encoding is the usual alternative.
le = LabelEncoder()
salaries['Department_code'] = le.fit_transform(salaries['Department'])

salaries.head()

Unnamed: 0,Gender,Age,Department,Years_exp,Tenure (months),Gross,Department_code
0,0,25,Tech,2.333333,7,74922,7
1,1,26,Operations,2.666667,6,44375,3
2,0,24,Operations,2.0,8,82263,3
3,0,26,Operations,2.666667,6,44375,3
4,0,29,Engineering,3.666667,25,235405,0


In [170]:
### Now then, let's rearrange those so that Gross is the last column
# Only the columns listed here are kept; anything else in the frame is dropped.
salaries = salaries[[
    'Gender',
    'Age',
    'Department',
    'Department_code',
    'Years_exp',
    'Tenure (months)',
    'Gross',
]]

In [172]:
### Time to save it into a clean csv ###
# Written relative to the working directory; index=False leaves the row index out of the file.
salaries.to_csv('clean_salaries.csv', index=False)

BTW: a quick look at the gender pay gap

In [None]:
# gender salary imbalance :( doesn't need ML
# Pick 'Gross' before aggregating: averaging the whole frame would also try to
# average the string 'Department' column, which raises TypeError on pandas >= 2.0.
salaries.groupby('Gender')['Gross'].mean()

## Let's do some linear regression on salary

In [179]:
# Features: everything except the target and the department text
# (the integer 'Department_code' stays in).
x = salaries.drop(columns=['Gross', 'Department'])
# Target: the gross salary column.
y = salaries['Gross']

In [180]:
from sklearn.model_selection import train_test_split

In [181]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the reported score and coefficients, reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [182]:
from sklearn.linear_model import LinearRegression

In [183]:
# Linear regression with all constructor defaults.
mod1 = LinearRegression()

In [184]:
# Fit on the training split only; the held-out rows are reserved for scoring.
mod1.fit(xtrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [185]:
# Score the fitted model on the held-out split (for LinearRegression this is R^2).
mod1.score(xtest, ytest)

0.4780448094241332

In [187]:
# Inspect the held-out feature rows; the column order shown here is the order the model was fitted with.
xtest

Unnamed: 0,Gender,Age,Department_code,Years_exp,Tenure (months)
672,1,31,6,4.333333,34
1443,0,29,7,3.666667,24
1096,0,26,2,2.666667,10
1360,0,35,4,5.666667,35
1039,1,26,2,2.666667,41
...,...,...,...,...,...
1272,1,26,4,2.666667,33
675,1,29,4,3.666667,28
549,0,32,1,4.666667,28
1523,0,25,4,2.333333,36


In [186]:
# Fitted coefficients, one per feature column, in the column order of xtrain.
mod1.coef_

array([-26661.82220334,  10554.59559246,   2026.09679389,   3518.19853082,
          229.89016309])

In [194]:
# Example prediction for one hypothetical employee.
# The features are given by name and then put into the training column order, so
# values cannot be silently mis-assigned if the feature columns are ever reordered,
# and the model receives the same feature names it was fitted with (a bare list
# triggers a feature-name warning on scikit-learn >= 1.0).
person0 = pd.DataFrame([{
    'Gender': 0,
    'Age': 25,
    'Department_code': 7,
    'Years_exp': 4.2,
    'Tenure (months)': 30,
}])[list(xtrain.columns)]
mod1.predict(person0)

array([114220.29842437])