In [3]:
import numpy as np 
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [4]:
# Location of the salaries CSV. A raw string keeps the backslash literal; in a
# normal string "\d" is an invalid escape sequence that newer Python versions
# warn about.
# NOTE(review): there is no separator after "C:", so on Windows this is resolved
# relative to the current directory of drive C — confirm that is intended.
DATA_PATH = r"C:Documents\ds_salaries.csv"

# Load the raw dataset into the frame used by every cell below.
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first ten rows of the freshly loaded frame.
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
5,5,2020,EN,FT,Data Analyst,72000,USD,72000,US,100,US,L
6,6,2020,SE,FT,Lead Data Scientist,190000,USD,190000,US,100,US,S
7,7,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
8,8,2020,MI,FT,Business Data Analyst,135000,USD,135000,US,100,US,L
9,9,2020,SE,FT,Lead Data Engineer,125000,USD,125000,NZ,50,NZ,S


Dataset Dictionary
Dataset

work_year - The year the salary was paid.

experience_level - The experience level in the job during the year with the following possible values:
EN = Entry-level / Junior
MI = Mid-level / Intermediate
SE = Senior-level / Expert
EX = Executive-level / Director

employment_type - The type of employment for the role:
PT = Part-time
FT = Full-time
CT = Contract
FL = Freelance

job_title - The role worked in during the year.

salary - The total gross salary amount paid.

salary_currency - The currency of the salary paid as an ISO 4217 currency code.

salary_in_usd - The salary in USD (FX rate divided by avg. USD rate for the respective year via fxdata.foorilla.com).

employee_residence - Employee's primary country of residence during the work year, as an ISO 3166 country code (Alpha-2 code).

remote_ratio - The overall amount of work done remotely, possible values are as follows:
0 = No remote work (less than 20%)
50 = Partially remote
100 = Fully remote (more than 80%)

company_location - The country of the employer's main office or contracting branch as an ISO 3166 country code(Alpha-2 code).

company_size - The average number of people that worked for the company during the year:
S = less than 50 employees (small)
M = 50 to 250 employees (medium)
L = more than 250 employees (large)

Knowing the Dataset

In [6]:
# Show the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0             int64
work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [7]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0            0
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [8]:
# Count the number of distinct values in each column.
df.nunique()

Unnamed: 0            607
work_year               3
experience_level        4
employment_type         4
job_title              50
salary                272
salary_currency        17
salary_in_usd         369
employee_residence     57
remote_ratio            3
company_location       50
company_size            3
dtype: int64

In [9]:
# Replace the coded values with readable labels so the chart axes and legends
# are self-explanatory.
# The result is assigned back to the column rather than calling
# `df.<col>.replace(..., inplace=True)`: an in-place method on a Series pulled
# out of the frame acts on an intermediate object, which pandas warns about and
# which leaves `df` unchanged when Copy-on-Write is enabled.
# A dict keeps each code next to its label; values not listed are left as-is.
df["remote_ratio"] = df["remote_ratio"].replace(
    {100: "Remote", 50: "Hybrid", 0: "On-site"}
)
df["experience_level"] = df["experience_level"].replace(
    {"EN": "Entry", "MI": "Mid", "SE": "Senior", "EX": "Executive"}
)

In [10]:
# Show the first five rows again to confirm the relabelled columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,Mid,FT,Data Scientist,70000,EUR,79833,DE,On-site,DE,L
1,1,2020,Senior,FT,Machine Learning Scientist,260000,USD,260000,JP,On-site,JP,S
2,2,2020,Senior,FT,Big Data Engineer,85000,GBP,109024,GB,Hybrid,GB,M
3,3,2020,Mid,FT,Product Data Analyst,20000,USD,20000,HN,On-site,HN,S
4,4,2020,Senior,FT,Machine Learning Engineer,150000,USD,150000,US,Hybrid,US,L


                                EXPLORATORY DATA ANALYSIS

In [12]:
# Distribution of employment_type, shown as a percentage of all rows
# (histnorm='percent'), with the bar values printed to two decimals.
px.histogram(df, x = 'employment_type',histnorm = 'percent', text_auto = '.2f',template = 'ggplot2',
             title = 'Percentage of Employment Types')


Most of the employees have full-time jobs.

In [13]:
# Row counts per work arrangement (remote_ratio), with one grouped bar per work_year.
px.histogram(
    df,
    x='remote_ratio',
    color='work_year',
    barmode='group',
    title='Count of each Work Type',
    template='ggplot2',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)


The number of remote workers is greater in 2022.

In [17]:
# Distribution of employee residence, shown as a percentage of all rows.
# x is passed as a separately sorted Series rather than a column name, which is
# why the axis label below is keyed on "x" instead of the column name.
px.histogram(df, x = df.employee_residence.sort_values(),histnorm = 'percent', text_auto = '.1f',
             labels={
                 "x": "employee_residence",
                 "percent": "percent(%)"
             },
             color_discrete_sequence=px.colors.qualitative.Safe, template = 'ggplot2', 
             title='Count of Employee residence in each country')

Most of the employees are from the United States, the United Kingdom, India, and Canada.

In [19]:
# Distribution of company location, shown as a percentage of all rows.
# The sorted Series is passed directly as x, so its label is keyed on "x".
px.histogram(
    df,
    x=df.company_location.sort_values(),
    histnorm='percent',
    text_auto='.1f',
    labels={"x": "company_location", "percent": "percent(%)"},
    title='Count of Company Location in each country',
    template='ggplot2',
    color_discrete_sequence=px.colors.qualitative.Safe,
)

Most of the companies are located in the United States, the United Kingdom, Canada, Germany, and India.

In [20]:
# Distribution of job titles, split by experience level.
# Both x and color are given as column names so each row's job title stays
# paired with that same row's experience level. Passing a separately sorted
# Series for x breaks that pairing, because Plotly Express lines array-like
# arguments up with the frame's columns by position, not by index.
# Alphabetical ordering of the titles is requested on the axis instead.
# With column names the axis titles default to "job_title" and "count", so no
# explicit labels mapping is needed.
px.histogram(df, x='job_title', color='experience_level', height=800, barmode='group',
             color_discrete_sequence=px.colors.qualitative.Dark24, template='ggplot2',
             text_auto=True,
             title='Count of number of people with all experience levels in each job'
             ).update_xaxes(categoryorder='category ascending')

* The number of employees with the title 'Data Scientist' is high.
* There are very few jobs at the Entry and Executive levels.
* Most of the jobs are at the Senior and Mid levels for almost all job titles.

In [21]:
# Employee residence vs. company location, coloured by work arrangement.
# x, y and color are all given as column names so every point represents one
# actual row. Sorting the x and y Series independently would pair each
# residence with an unrelated company location and work type, because Plotly
# Express lines array-like arguments up by position.
# Alphabetical ordering is requested on both axes instead.
px.scatter(df, x='employee_residence', y='company_location', color='remote_ratio',
           labels={'employee_residence': 'Employee Residence',
                   'company_location': 'Company Location',
                   'remote_ratio': 'Work Type'},
           color_discrete_sequence=px.colors.qualitative.Light24, template='ggplot2',
           title='Company Location VS Employee Residence for type of work(Remote, Hybrid or On-site)'
           ).update_xaxes(categoryorder='category ascending'
           ).update_yaxes(categoryorder='category ascending')

In [22]:
# Salary (USD) against experience level, animated with one frame per work_year.
# Marker size follows salary_in_usd and colour follows job_title; the y axis
# categories are given in seniority order.
(
    px.scatter(
        df,
        x='salary_in_usd',
        y='experience_level',
        color='job_title',
        size='salary_in_usd',
        hover_name='job_title',
        animation_frame='work_year',
        title='Experience level VS Salary',
        template='ggplot2',
        color_discrete_sequence=px.colors.qualitative.Alphabet,
    )
    .update_yaxes(categoryarray=['Entry', 'Mid', 'Senior', 'Executive'])
)

Data Scientists earn the most, followed by Data Engineers and BI Data Analysts.