In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the salary records from the CSV file (path is relative to the working
# directory) into `df`, the DataFrame the following cells query.
df = pd.read_csv(filepath_or_buffer="ds_salaries.csv")


In [3]:
# Preview the top five rows to check the column names and a few sample values
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [4]:
# Summarise the numeric columns: count, mean, std, min, quartiles and max.
# The percentiles listed here are the ones describe() uses by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0.1,Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,607.0,607.0,607.0,607.0,607.0
mean,303.0,2021.405272,324000.1,112297.869852,70.92257
std,175.370085,0.692133,1544357.0,70957.259411,40.70913
min,0.0,2020.0,4000.0,2859.0,0.0
25%,151.5,2021.0,70000.0,62726.0,50.0
50%,303.0,2022.0,115000.0,101570.0,100.0
75%,454.5,2022.0,165000.0,150000.0,100.0
max,606.0,2022.0,30400000.0,600000.0,100.0


In [5]:
# Tally the null entries per column; a column showing 0 has no missing values
df.isna().sum()

Unnamed: 0            0
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [6]:
# Rank job titles by how often they occur and keep the ten most frequent
jobtitle = df['job_title'].value_counts().iloc[:10]

# Wrap the counts in a one-column DataFrame (still indexed by job title)
jobs = jobtitle.to_frame()

# Show the table with the job title as a regular column. The result is only
# displayed, not assigned, so `jobs` itself keeps the job title as its index.
jobs.reset_index(drop=False)

Unnamed: 0,job_title,count
0,Data Scientist,143
1,Data Engineer,132
2,Data Analyst,97
3,Machine Learning Engineer,41
4,Research Scientist,16
5,Data Science Manager,12
6,Data Architect,11
7,Big Data Engineer,8
8,Machine Learning Scientist,8
9,Principal Data Scientist,7


### Which job title earns the highest average salary? 

In [7]:
# Average USD salary per job title, listed from the highest-paying title down
(
    df.groupby(by='job_title')['salary_in_usd']
    .mean()
    .sort_values(ascending=False)
)

job_title
Data Analytics Lead                         405000.000000
Principal Data Engineer                     328333.333333
Financial Data Analyst                      275000.000000
Principal Data Scientist                    215242.428571
Director of Data Science                    195074.000000
Data Architect                              177873.909091
Applied Data Scientist                      175655.000000
Analytics Engineer                          175000.000000
Data Specialist                             165000.000000
Head of Data                                160162.600000
Machine Learning Scientist                  158412.500000
Data Science Manager                        158328.500000
Director of Data Engineering                156738.000000
Head of Data Science                        146718.750000
Applied Machine Learning Scientist          142068.750000
Lead Data Engineer                          139724.500000
Data Analytics Manager                      127134.285714
Clou

### What is the highest salary for each job title?

In [8]:
# Largest single USD salary recorded under each job title, biggest first
(
    df.groupby(by='job_title')['salary_in_usd']
    .max()
    .sort_values(ascending=False)
)

job_title
Principal Data Engineer                     600000
Financial Data Analyst                      450000
Research Scientist                          450000
Applied Machine Learning Scientist          423000
Principal Data Scientist                    416000
Data Scientist                              412000
Data Analytics Lead                         405000
Applied Data Scientist                      380000
Director of Data Science                    325000
Data Engineer                               324000
Lead Data Engineer                          276000
ML Engineer                                 270000
Data Architect                              266400
Machine Learning Scientist                  260000
Machine Learning Engineer                   250000
Data Science Manager                        241000
Head of Data                                235000
Head of Data Science                        224000
Analytics Engineer                          205300
Data Analyst         

### What is the most in-demand job title?

In [9]:
# Number of rows per job title, most common title first
df.loc[:, 'job_title'].value_counts()

job_title
Data Scientist                              143
Data Engineer                               132
Data Analyst                                 97
Machine Learning Engineer                    41
Research Scientist                           16
Data Science Manager                         12
Data Architect                               11
Big Data Engineer                             8
Machine Learning Scientist                    8
Principal Data Scientist                      7
AI Scientist                                  7
Data Science Consultant                       7
Director of Data Science                      7
Data Analytics Manager                        7
ML Engineer                                   6
Computer Vision Engineer                      6
BI Data Analyst                               6
Lead Data Engineer                            6
Data Engineering Manager                      5
Business Data Analyst                         5
Head of Data                  

### What was the average salary of a Data Engineer in 2022?

In [10]:
# Keep only the rows whose title is exactly 'Data Engineer' and whose
# work_year is 2022; both conditions must hold for a row to be selected.
dataEngineer22 = df.loc[
    df['job_title'].eq('Data Engineer') & df['work_year'].eq(2022)
]

# Mean of the USD-converted salary over that subset
avgsalary22DE = dataEngineer22['salary_in_usd'].mean()
avgsalary22DE


126375.69662921349

### Employee residence with the highest salary


In [11]:
# Largest single USD salary recorded for each employee residence code,
# ordered from the highest maximum to the lowest
(
    df.groupby(by='employee_residence')['salary_in_usd']
    .max()
    .sort_values(ascending=False)
)

employee_residence
US    600000
JP    260000
RU    230000
IN    200000
MY    200000
CA    196979
GB    183228
DE    173762
PR    160000
BR    160000
IT    153667
AU    150000
ES    130800
NZ    125000
CH    122346
AE    120000
SG    119059
PL    114047
SI    102839
JE    100000
IQ    100000
DZ    100000
FR     93427
AT     91237
GR     88654
BE     88654
NL     85000
BG     80000
RO     76833
BO     75000
IE     71444
CZ     69999
HK     66022
PT     60757
AR     60000
LU     59102
PK     58035
NG     50000
VN     50000
DK     45896
PH     45760
HR     45618
CN     43331
CL     40038
HU     36259
MX     33511
EE     32974
TN     31875
MT     28369
TR     28016
RS     25532
CO     21844
HN     20000
MD     18000
UA     13400
KE      9272
IR      4000
Name: salary_in_usd, dtype: int64

### How many employees with minimum/maximum experience work remotely?

In [12]:
# Number of rows whose experience_level is 'MI' (raises KeyError if the code
# never occurs in the column).
# NOTE(review): this counts every 'MI' row; no remote_ratio filter is applied
# even though the section heading asks about remote work -- confirm intent.
# NOTE(review): the earlier comment called 'MI' "minimum experience";
# presumably it denotes mid-level -- verify against the dataset documentation.
df['experience_level'].value_counts().loc['MI']

213