In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Importing Database

In [86]:
data = pd.read_csv('ds_salaries1.csv')

## Feature Selection

In [87]:
data = data.drop(['employee_residence'], axis=1)

## Extracting company_location Feature

In [88]:
def add_company_location(region):
    if region=='US' or region=='GB' or region=='CA' or region=='ES' or region=='DE' or region=='IN' or region=='FR':
        return region
    else:
        return 'Other'

data['company_location'] = data['company_location'].apply(add_company_location)

## Extracting job_title Feature

In [89]:
data['job_category']='Others'

def add_job_titles():
    category_mapping = {
        'Data Scientist': 'Data Scientist',
        'Data Science' : 'Data Scientist',
        'Data Engineer': 'Data Engineer',
        'Data Architect': 'Data Engineer',
        'Big Data Architect': 'Data Engineer',
        'Data Analyst': 'Data Analyst',
        'Head of Data': 'Data Analyst',
        'Data Analytic': 'Data Analyst',
        'Data Specialist': 'Data Analyst',
        'ETL Developer': 'Data Analyst',
        'Machine Learning': 'ML Engineer',
        'ML' : 'ML Engineer',
        'NLP Engineer' : 'ML Engineer',
        'AI': 'AI Engineer',
        'Business': 'BI/BA Engineer',
        'BI': 'BI/BA Engineer',
        'BA': 'BI/BA Engineer',
        'Cloud': 'Cloud Engineer',
        'Deep Learning': 'Deep Learning Engineer'
    }

    for keyword, category in category_mapping.items():
        data.loc[data['job_title'].str.contains(keyword, case=False), 'job_category'] = category


add_job_titles()

In [90]:
data = data.drop(['job_title'], axis=1)

## Extracting salary_currency Feature

In [91]:
def add_salary_currency(currency):
    if currency=='USD' or currency=='GBP' or currency=='EUR' or currency=='INR' or currency=='CAD' or currency=='AUD':
        return currency
    else:
        return 'Other'

data['salary_currency'] = data['salary_currency'].apply(add_salary_currency)

## One-hot Encoding

In [92]:
data = pd.get_dummies(data, columns=['work_year'], prefix='work_year')
# data = pd.get_dummies(data, columns=['experience_level'], prefix='experience_level')
# data = pd.get_dummies(data, columns=['employment_type'], prefix='employment_type')
data = pd.get_dummies(data, columns=['job_category'], prefix='job_category')
data = pd.get_dummies(data, columns=['salary_currency'], prefix='salary_currency')
# data = pd.get_dummies(data, columns=['remote_ratio'], prefix='remote_ratio')
# data = pd.get_dummies(data, columns=['company_size'], prefix='company_size')
data = pd.get_dummies(data, columns=['company_location'], prefix='company_location')

## Label Encoding

In [93]:
data['company_size'] = data['company_size'].replace({'S':1 ,'M': 2, 'L': 3})
data['remote_ratio'] = data['remote_ratio'].replace({100: 3, 50: 2, 0: 1}) 
data['experience_level'] = data['experience_level'].replace({'EN': 1, 'MI': 2, 'SE': 3, 'EX': 4})
data['employment_type'] = data['employment_type'].replace({'FL':1 ,'PT': 2, 'FT': 3, 'CT': 4,})

In [94]:
# from sklearn.preprocessing import LabelEncoder

# le = LabelEncoder()

# data['experience_level'] = le.fit_transform(data['experience_level'])
# data['work_year'] = le.fit_transform(data['work_year'])
# data['employment_type'] = le.fit_transform(data['employment_type'])
# data['job_title'] = le.fit_transform(data['job_title'])
# data['company_location'] = le.fit_transform(data['company_location'])
# data['company_size'] = le.fit_transform(data['company_size'])
# data['remote_ratio'] = le.fit_transform(data['remote_ratio'])

In [95]:
data.shape

(607, 33)

In [96]:
data.tail()

Unnamed: 0.1,Unnamed: 0,experience_level,employment_type,salary,salary_in_usd,remote_ratio,company_size,work_year_2020,work_year_2021,work_year_2022,...,salary_currency_Other,salary_currency_USD,company_location_CA,company_location_DE,company_location_ES,company_location_FR,company_location_GB,company_location_IN,company_location_Other,company_location_US
602,602,3,3,154000,154000,3,2,0,0,1,...,0,1,0,0,0,0,0,0,0,1
603,603,3,3,126000,126000,3,2,0,0,1,...,0,1,0,0,0,0,0,0,0,1
604,604,3,3,129000,129000,1,2,0,0,1,...,0,1,0,0,0,0,0,0,0,1
605,605,3,3,150000,150000,3,2,0,0,1,...,0,1,0,0,0,0,0,0,0,1
606,606,2,3,200000,200000,3,3,0,0,1,...,0,1,0,0,0,0,0,0,0,1


## Test-train Splitting

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()

data['salary_in_usd']=np.log(data['salary_in_usd'].values)

y = data[['salary_in_usd']]
x = scalar.fit_transform(data.drop(['salary_in_usd'],axis=1))

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)

## Linear Regression

In [98]:
from sklearn.linear_model import LinearRegression

model1 = LinearRegression()
model1.fit(x_train, y_train)

In [99]:
from sklearn import metrics

y_pred1 = model1.predict(x_test)
accuracy1 = metrics.r2_score(y_test,y_pred1)
print(accuracy1)

0.5242025744964938


In [104]:
comparison_df = pd.DataFrame({
    'Actual Salary': y_test['salary_in_usd'].values,
    'Predicted Salary': y_pred1.flatten()
})

comparison_df.tail(60)

Unnamed: 0,Actual Salary,Predicted Salary
62,8.59582,9.527864
63,12.928779,12.069389
64,12.013701,11.751658
65,12.239019,11.644428
66,11.457032,11.699441
67,11.088858,10.979801
68,11.77529,11.550081
69,10.901137,11.309465
70,10.915088,11.081542
71,11.714608,10.464518


## Decision Tree Regressor

In [101]:
from sklearn.tree import DecisionTreeRegressor

model2 = DecisionTreeRegressor()
model2.fit(x_train, y_train)

In [102]:
y_pred2 = model2.predict(x_test)
accuracy2 = metrics.r2_score(y_test,y_pred2)
print(accuracy2)

0.7753491351349885


In [103]:
comparison_df1 = pd.DataFrame({
    'Actual Salary': y_test['salary_in_usd'].values,
    'Predicted Salary': y_pred2.flatten()
})

comparison_df1.tail(60)

Unnamed: 0,Actual Salary,Predicted Salary
62,8.59582,8.649449
63,12.928779,12.938441
64,12.013701,12.013701
65,12.239019,12.232228
66,11.457032,11.415687
67,11.088858,11.088858
68,11.77529,11.77529
69,10.901137,11.279694
70,10.915088,11.158234
71,11.714608,10.256817
