In [349]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Importing Database

In [350]:
data = pd.read_csv('ds_salaries1.csv')

## Feature Selection

In [351]:
data = data.drop(['employee_residence','salary_in_usd'], axis=1)

## Extracting company_location Feature

In [352]:
def add_company_location(region):
    if region=='US' or region=='GB' or region=='CA' or region=='ES' or region=='DE' or region=='IN' or region=='FR':
        return region
    else:
        return 'Other'

data['company_location'] = data['company_location'].apply(add_company_location)

## Extracting job_title Feature

In [353]:
data['job_category']='Others'

def add_job_titles():
    category_mapping = {
        'Data Scientist': 'Data Scientist',
        'Data Science' : 'Data Scientist',
        'Data Engineer': 'Data Engineer',
        'Data Architect': 'Data Engineer',
        'Big Data Architect': 'Data Engineer',
        'Data Analyst': 'Data Analyst',
        'Head of Data': 'Data Analyst',
        'Data Analytic': 'Data Analyst',
        'Data Specialist': 'Data Analyst',
        'ETL Developer': 'Data Analyst',
        'Machine Learning': 'ML Engineer',
        'ML' : 'ML Engineer',
        'NLP Engineer' : 'ML Engineer',
        'AI': 'AI Engineer',
        'Business': 'BI/BA Engineer',
        'BI': 'BI/BA Engineer',
        'BA': 'BI/BA Engineer',
        'Cloud': 'Cloud Engineer',
        'Deep Learning': 'Deep Learning Engineer',
    }

    for keyword, category in category_mapping.items():
        data.loc[data['job_title'].str.contains(keyword, case=False), 'job_category'] = category


add_job_titles()

In [354]:
data = data.drop(['job_title'], axis=1)

## Extracting salary_currency Feature

In [355]:
def add_salary_currency(currency):
    if currency=='USD' or currency=='GBP' or currency=='EUR' or currency=='INR' or currency=='CAD' or currency=='AUD':
        return currency
    else:
        return 'Other'

data['salary_currency'] = data['salary_currency'].apply(add_salary_currency)

## One-hot Encoding

In [356]:
data = pd.get_dummies(data, columns=['work_year'], prefix='work_year')
# data = pd.get_dummies(data, columns=['experience_level'], prefix='experience_level')
# data = pd.get_dummies(data, columns=['employment_type'], prefix='employment_type')
data = pd.get_dummies(data, columns=['job_category'], prefix='job_category')
data = pd.get_dummies(data, columns=['salary_currency'], prefix='salary_currency')
# data = pd.get_dummies(data, columns=['remote_ratio'], prefix='remote_ratio')
# data = pd.get_dummies(data, columns=['company_size'], prefix='company_size')
data = pd.get_dummies(data, columns=['company_location'], prefix='company_location')

## Label Encoding

In [357]:
data['company_size'] = data['company_size'].replace({'S':1 ,'M': 2, 'L': 3})
data['remote_ratio'] = data['remote_ratio'].replace({100: 3, 50: 2, 0: 1}) 
data['experience_level'] = data['experience_level'].replace({'EN': 1, 'MI': 2, 'SE': 3, 'EX': 4})
data['employment_type'] = data['employment_type'].replace({'FL':1 ,'PT': 2, 'FT': 3, 'CT': 4,})

In [358]:
# from sklearn.preprocessing import LabelEncoder

# le = LabelEncoder()

# data['experience_level'] = le.fit_transform(data['experience_level'])
# data['work_year'] = le.fit_transform(data['work_year'])
# data['employment_type'] = le.fit_transform(data['employment_type'])
# data['job_title'] = le.fit_transform(data['job_title'])
# data['company_location'] = le.fit_transform(data['company_location'])
# data['company_size'] = le.fit_transform(data['company_size'])
# data['remote_ratio'] = le.fit_transform(data['remote_ratio'])

In [359]:
data.shape

(607, 32)

In [360]:
data.tail()

Unnamed: 0.1,Unnamed: 0,experience_level,employment_type,salary,remote_ratio,company_size,work_year_2020,work_year_2021,work_year_2022,job_category_AI Engineer,...,salary_currency_Other,salary_currency_USD,company_location_CA,company_location_DE,company_location_ES,company_location_FR,company_location_GB,company_location_IN,company_location_Other,company_location_US
602,602,3,3,154000,3,2,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
603,603,3,3,126000,3,2,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
604,604,3,3,129000,1,2,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
605,605,3,3,150000,3,2,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
606,606,2,3,200000,3,3,0,0,1,1,...,0,1,0,0,0,0,0,0,0,1


## Test-train Splitting

In [361]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()

data['salary']=np.log(data['salary'].values)

y = data[['salary']]
x = scalar.fit_transform(data.drop(['salary'],axis=1))
x1 = data.drop(['salary'], axis=1)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [362]:
x1

Unnamed: 0.1,Unnamed: 0,experience_level,employment_type,remote_ratio,company_size,work_year_2020,work_year_2021,work_year_2022,job_category_AI Engineer,job_category_BI/BA Engineer,...,salary_currency_Other,salary_currency_USD,company_location_CA,company_location_DE,company_location_ES,company_location_FR,company_location_GB,company_location_IN,company_location_Other,company_location_US
0,0,2,3,1,3,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,3,3,1,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,2,3,3,2,2,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,3,2,3,1,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,4,3,3,2,3,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,3,3,3,2,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
603,603,3,3,3,2,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
604,604,3,3,1,2,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
605,605,3,3,3,2,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1


In [363]:
x1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Unnamed: 0                   607 non-null    int64
 1   experience_level             607 non-null    int64
 2   employment_type              607 non-null    int64
 3   remote_ratio                 607 non-null    int64
 4   company_size                 607 non-null    int64
 5   work_year_2020               607 non-null    uint8
 6   work_year_2021               607 non-null    uint8
 7   work_year_2022               607 non-null    uint8
 8   job_category_AI Engineer     607 non-null    uint8
 9   job_category_BI/BA Engineer  607 non-null    uint8
 10  job_category_Cloud Engineer  607 non-null    uint8
 11  job_category_Data Analyst    607 non-null    uint8
 12  job_category_Data Engineer   607 non-null    uint8
 13  job_category_Data Scientist  607 non-null    uint8

## Linear Regression

In [364]:
from sklearn.linear_model import LinearRegression

model1 = LinearRegression()
model1.fit(x_train, y_train)

In [365]:
from sklearn import metrics

y_pred1 = model1.predict(x_test)
accuracy1 = metrics.r2_score(y_test,y_pred1)
print(accuracy1)

0.5689093305885731


In [366]:
# import pickle

# with open("model.pickle","wb") as file:
#     pickle.dump(model2, file)

In [367]:
import joblib

# Save the model using joblib
joblib.dump(model1, 'model.pickle')

['model.pickle']

In [368]:
comparison_df = pd.DataFrame({
    'Actual Salary': y_test['salary'].values,
    'Predicted Salary': y_pred1.flatten()
})

comparison_df.tail(60)

Unnamed: 0,Actual Salary,Predicted Salary
62,12.89922,13.893555
63,12.928779,12.116699
64,12.013701,11.738281
65,12.239019,11.638428
66,11.289782,11.568604
67,10.819778,10.655762
68,11.77529,11.565674
69,11.127263,11.527832
70,10.915088,10.803955
71,11.652687,13.08667


## Decision Tree Regressor

In [369]:
from sklearn.tree import DecisionTreeRegressor

model2 = DecisionTreeRegressor()
model2.fit(x_train, y_train)

In [370]:
y_pred2 = model2.predict(x_test)
accuracy2 = metrics.r2_score(y_test,y_pred2)
print(accuracy2)

0.4917914011664899


In [371]:
comparison_df1 = pd.DataFrame({
    'Actual Salary': y_test['salary'].values,
    'Predicted Salary': y_pred2.flatten()
})

comparison_df1.tail(60)

Unnamed: 0,Actual Salary,Predicted Salary
62,12.89922,14.285514
63,12.928779,11.971615
64,12.013701,11.849398
65,12.239019,11.32539
66,11.289782,11.112448
67,10.819778,10.463103
68,11.77529,11.023535
69,11.127263,11.461632
70,10.915088,11.156251
71,11.652687,11.532728
