In [41]:
import pandas as pd
import numpy as np
import matplotlib_inline as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

### Project Objective
Predict the annual salary in US dollars for data science positions from features such as the role, experience level, company location, company size, and employee residence.

### Reading Data From Source

In [42]:
# Load the salary records from the CSV file, then preview the first five rows.
data = pd.read_csv(filepath_or_buffer='SalarayDataSet/ds_salaries.csv')
data.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


### Describing Features

**work_year**: The year the salary was paid.
**experience_level**: The experience level in the job during the year
**employment_type**: The type of employment for the role
**job_title**: The role worked in during the year.
**salary**: The total gross salary amount paid.
**salary_currency**: The currency of the salary paid as an ISO 4217 currency code.
**salary_in_usd**: The salary in USD
**employee_residence**: Employee's primary country of residence during the work year, as an ISO 3166 country code.
**remote_ratio**: The overall amount of work done remotely
**company_location**: The country of the employer's main office or contracting branch
**company_size**: The median number of people that worked for the company during the year

### Data Cleaning

In [43]:
# Count the missing values in each column. Every count is zero, so no
# imputation or row dropping is needed.
data.isna().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

### Preprocessing
Converting categorical features to numerical using encoding methods

In [44]:
# Inspect the column dtypes to see which features are non-numerical (object)
# and therefore have to be converted before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


We can observe that the data set has 7 non-numerical features that should be converted:
 experience_level, employment_type, job_title, salary_currency, employee_residence, company_location, and company_size

In [47]:
# Encode every text (object-dtype) column of `data` as integer codes.
#
# A separate fitted encoder is kept for each column in `label_encoders`, keyed
# by column name. Refitting one shared encoder on every column would keep only
# the last column's mapping, making the other columns impossible to decode
# (label_encoders[column].inverse_transform(...)) or to encode consistently on
# new data (label_encoders[column].transform(...)).
#
# NOTE: this cell overwrites the columns of `data` in place. After it has run,
# no object-dtype columns remain, so re-running it yields an empty
# `columns_to_encode`; re-run the data-loading cell first.
#
# NOTE(review): LabelEncoder assigns codes in sorted label order and is
# documented as a target encoder. If experience_level / company_size are meant
# to be ordinal, or the model is sensitive to the arbitrary order of nominal
# columns such as job_title, consider OrdinalEncoder with explicit categories
# or one-hot encoding instead - confirm against the modelling step.
label_encoder = LabelEncoder()  # kept so the name exists even if nothing is encoded
label_encoders = {}

# Creating a list of all categorical columns
columns_to_encode = [column for column in data.columns if data[column].dtype == 'object']

for column in columns_to_encode:
    # Fresh encoder per column so each learned mapping is preserved.
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

In [52]:
# Report how many distinct codes each encoded column contains.
for column in columns_to_encode:
    unique_count = data[column].nunique()
    print(column, ", ", unique_count)

experience_level ,  4
employment_type ,  4
job_title ,  93
salary_currency ,  20
employee_residence ,  78
company_location ,  72
company_size ,  3


In [55]:
# Re-check the dtypes after encoding: the formerly object columns should now
# be integer columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   work_year           3755 non-null   int64
 1   experience_level    3755 non-null   int32
 2   employment_type     3755 non-null   int32
 3   job_title           3755 non-null   int32
 4   salary              3755 non-null   int64
 5   salary_currency     3755 non-null   int32
 6   salary_in_usd       3755 non-null   int64
 7   employee_residence  3755 non-null   int32
 8   remote_ratio        3755 non-null   int64
 9   company_location    3755 non-null   int32
 10  company_size        3755 non-null   int32
dtypes: int32(7), int64(4)
memory usage: 220.1 KB
