In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from scipy import stats
from matplotlib import cm

# --- Plot appearance -------------------------------------------------------
# Font family is set first, then the ggplot style sheet is applied, and the
# axes background is overridden afterwards so it wins over the style sheet.
# NOTE(review): 'Malgun Gothic' must be installed on the machine running this
# notebook — TODO confirm for non-Windows environments.
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.style.use('ggplot')
plt.rc('axes', facecolor='#f5f5f5')

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Fixed palette of 22 hex colors available to later cells.
colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
    '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
    '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
    '#000075', '#808080', '#ffffff', '#000000',
]


In [2]:
# Read the salary table as-is into `df`, then derive `df2` without the
# "Unnamed: 0" column. `df` itself is left unchanged.
df = pd.read_csv("ds_salaries.csv")
df2 = df.drop("Unnamed: 0", axis="columns")

# Preview the first five rows of the working frame.
df2.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [10]:
# Summarize the raw frame `df` (as read from the CSV, before any column is
# dropped): column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607 entries, 0 to 606
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          607 non-null    int64 
 1   work_year           607 non-null    int64 
 2   experience_level    607 non-null    object
 3   employment_type     607 non-null    object
 4   job_title           607 non-null    object
 5   salary              607 non-null    int64 
 6   salary_currency     607 non-null    object
 7   salary_in_usd       607 non-null    int64 
 8   employee_residence  607 non-null    object
 9   remote_ratio        607 non-null    int64 
 10  company_location    607 non-null    object
 11  company_size        607 non-null    object
dtypes: int64(5), object(7)
memory usage: 57.0+ KB


In [3]:
# Average salary (in USD) for each experience level
x = df2['salary_in_usd'].groupby(df2['experience_level']).mean()
x

experience_level
EN     61643.318182
EX    199392.038462
MI     87996.056338
SE    138617.292857
Name: salary_in_usd, dtype: float64

In [4]:
# Most common data-science-related job titles: number of rows per title,
# most frequent first.
df2.loc[:, "job_title"].value_counts()


Data Scientist                              143
Data Engineer                               132
Data Analyst                                 97
Machine Learning Engineer                    41
Research Scientist                           16
Data Science Manager                         12
Data Architect                               11
Big Data Engineer                             8
Machine Learning Scientist                    8
Principal Data Scientist                      7
AI Scientist                                  7
Data Science Consultant                       7
Director of Data Science                      7
Data Analytics Manager                        7
ML Engineer                                   6
Computer Vision Engineer                      6
BI Data Analyst                               6
Lead Data Engineer                            6
Data Engineering Manager                      5
Business Data Analyst                         5
Head of Data                            

In [5]:
# Median of every numeric column for the rows whose work_year is 2020.
df_2020 = df2[df2["work_year"] == 2020]

# numeric_only=True is required: the frame also holds string columns
# (experience_level, job_title, salary_currency, ...). pandas >= 2.0 raises a
# TypeError when asked for their median; older versions only skipped them
# while emitting a FutureWarning. Restricting to numeric columns gives the
# same four-column result on every version.
df_2020.median(numeric_only=True)

work_year         2020.0
salary           94500.0
salary_in_usd    75544.0
remote_ratio        75.0
dtype: float64

In [6]:
# Row(s) whose USD salary equals the highest USD salary in the frame.
df2[df2["salary_in_usd"].eq(df2["salary_in_usd"].max())]

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
252,2021,EX,FT,Principal Data Engineer,600000,USD,600000,US,100,US,L


In [7]:
# Lowest salary among people who live in the US and are paid in USD.
usd = df2[(df2["salary_currency"] == "USD") & (df2["employee_residence"] == "US")]
usd_min = usd["salary_in_usd"].min()

# Look the minimum up inside the filtered subset. Matching against all of df2
# would also return rows from other countries or currencies that happen to
# have the same salary_in_usd value.
usd[usd["salary_in_usd"] == usd_min]

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
579,2022,SE,FT,Data Engineer,25000,USD,25000,US,100,US,M


In [9]:
# All rows whose employee_residence is Japan ("JP").
df2.loc[df2["employee_residence"].eq("JP")]

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
16,2020,EN,FT,Data Engineer,4450000,JPY,41689,JP,100,JP,S
136,2021,MI,FT,ML Engineer,7000000,JPY,63711,JP,50,JP,S
137,2021,MI,FT,ML Engineer,8500000,JPY,77364,JP,50,JP,S
150,2021,SE,FT,Director of Data Science,168000,USD,168000,JP,0,JP,S
189,2021,MI,FT,Machine Learning Engineer,74000,USD,74000,JP,50,JP,S
502,2022,EN,FT,Data Scientist,40000,USD,40000,JP,100,MY,L
