# Análisis exploratorio - Salarios de DS

En este notebook vamos a trabajar con los datos salariales de Data Scientists a nivel globlal.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_salaries = pd.read_csv('data/ds_salaries_2.csv')
df_salaries

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [3]:
df_sl_paises = df_salaries.groupby('employee_residence')['work_year'].count().reset_index()
df_sl_paises

Unnamed: 0,employee_residence,work_year
0,AE,3
1,AM,1
2,AR,6
3,AS,2
4,AT,6
...,...,...
73,TR,5
74,UA,4
75,US,3004
76,UZ,2


In [4]:
#Buscamos repetidos
df_salaries.duplicated().sum()

np.int64(1171)

Vemos que hay muchos duplicados, y que no tiene sentido mantener esos datos. Hay que borrarlos.

In [5]:
aux = df_salaries.copy()
aux['count'] = aux.index
df_duplicates_rank = aux.groupby(list(df_salaries.columns))['count'].count().reset_index()
df_duplicates_rank.sort_values(by="count", ascending=False)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,count
1218,2022,SE,FT,Data Scientist,141525,USD,141525,US,100,US,M,21
1275,2022,SE,FT,Data Scientist,191475,USD,191475,US,100,US,M,21
2217,2023,SE,FT,Data Engineer,252000,USD,252000,US,0,US,M,13
2107,2023,SE,FT,Data Engineer,129000,USD,129000,US,0,US,M,13
998,2022,SE,FT,Data Engineer,130000,USD,130000,US,0,US,M,12
...,...,...,...,...,...,...,...,...,...,...,...,...
19,2020,EN,FT,Research Scientist,42000,USD,42000,NL,50,NL,L,1
20,2020,EN,PT,Data Scientist,19000,EUR,21669,IT,50,IT,S,1
21,2020,EN,PT,ML Engineer,14000,EUR,15966,DE,100,DE,S,1
22,2020,EX,FT,Data Engineer,70000,EUR,79833,ES,50,ES,L,1


Nos quedamos con los datos deduplicados

In [9]:
df_salaries_curated = df_salaries.drop_duplicates()
df_salaries_curated['id'] = 1
df_salaries_curated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_salaries_curated['id'] = 1


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,id
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L,1
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S,1
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S,1
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M,1
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L,1
3751,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L,1
3752,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S,1
3753,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L,1


## EDA
Aquí nos haremos preguntas sobre los datos para poder tener un mejor conocimiento de los mismos, y ver cómo orientamos los análisis a realizar. Cada pregunta tiene una celda (Markdown y código asociado).

### ¿En qué país hay más compañías grandes?

In [10]:
df_biggest_companies = df_salaries_curated.groupby(['company_location', 'company_size'])['id'].count().reset_index()
df_biggest_companies = df_biggest_companies[df_biggest_companies['company_size'] == 'L'].sort_values(by='id', ascending=False)
df_biggest_companies
#df_biggest_companies['company_size'] == 'L'

Unnamed: 0,company_location,company_size,id
129,US,L,220
73,IN,L,37
54,GB,L,20
24,CA,L,18
37,DE,L,16
51,FR,L,10
98,NL,L,10
12,AU,L,8
46,ES,L,6
108,PT,L,5


### Tipo de trabajo, país de residencia en el que más se cobra?

In [8]:
df_res_co_job_best_salaries = (df_salaries_curated.groupby(['employee_residence', 'job_title'])['salary_in_usd']
                                                  .mean()
                                                  .reset_index())
df_res_co_job_best_salaries = df_res_co_job_best_salaries.sort_values(by=['job_title', 'salary_in_usd'], ascending=False)
df_res_co_job_best_salaries

Unnamed: 0,employee_residence,job_title,salary_in_usd
376,US,Staff Data Scientist,105000.0
241,NG,Staff Data Analyst,15000.0
293,SG,Software Data Engineer,75020.0
240,NG,Software Data Engineer,50000.0
110,ES,Research Scientist,211475.0
...,...,...,...
227,MK,AI Developer,6304.0
307,US,3D Computer Vision Researcher,50000.0
8,AS,3D Computer Vision Researcher,20000.0
40,CA,3D Computer Vision Researcher,10000.0


### Cuánto se cobra de media por cada nivel de experiencia?

In [9]:
df_avg_salary_experience = (df_salaries_curated.groupby(['experience_level', 'employment_type'])['salary_in_usd']
                                               .mean()
                                               .reset_index()
)
df_avg_salary_experience = df_avg_salary_experience.sort_values(by=['employment_type', 'experience_level'], ascending=False)
df_avg_salary_experience

Unnamed: 0,experience_level,employment_type,salary_in_usd
9,MI,PT,42561.0
3,EN,PT,38885.0
12,SE,FT,154165.019367
8,MI,FT,102742.781874
5,EX,FT,188710.610526
2,EN,FT,74564.626984
11,SE,FL,53333.333333
7,MI,FL,41615.6
1,EN,FL,75000.0
10,SE,CT,97500.0


### En qué tamaño de compañía se cobra más?

In [12]:
df_company_size_salaries = (df_salaries_curated.groupby(by=['company_location', 'company_size'])['salary_in_usd']
                                               .mean()
                                               .reset_index()
)
df_company_size_salaries.sort_values(by=['company_location', '', ascending=False)

Unnamed: 0,company_location,company_size,salary_in_usd
0,AE,L,115000.000000
1,AE,S,92500.000000
2,AL,S,10000.000000
3,AM,S,50000.000000
4,AR,L,31000.000000
...,...,...,...
128,UA,S,50000.000000
129,US,L,151464.422727
130,US,M,153984.837560
131,US,S,105847.339623


### Por qué % promedio se incrementa el salario entre niveles de experiencia?

### En qué país se cobra más por tipo de empleo?

In [17]:
df_jobtype_country_best_salary = (df_salaries_curated.groupby(by=['company_location', 'job_title'])['salary_in_usd']
                                                     .mean()
                                                     .reset_index()
)
df_jobtype_country_best_salary.sort_values(by=['job_title', 'salary_in_usd'], ascending=False)

Unnamed: 0,company_location,job_title,salary_in_usd
354,US,Staff Data Scientist,105000.0
63,CA,Staff Data Analyst,15000.0
272,SG,Software Data Engineer,75020.0
24,AU,Software Data Engineer,50000.0
114,ES,Research Scientist,211475.0
...,...,...,...
220,MK,AI Developer,6304.0
73,CR,3D Computer Vision Researcher,50000.0
7,AS,3D Computer Vision Researcher,20000.0
2,AL,3D Computer Vision Researcher,10000.0


### En qué nivel de expriencia hay más empleados?

In [11]:
df_exp_lvl_employees = (df_salaries_curated.groupby(by=['experience_level'])
                                           .agg({'id': 'count'}).rename(columns={'id': 'total'})
                                           .sort_values(by='total', ascending=False)
                                           .reset_index()
)
df_exp_lvl_employees

Unnamed: 0,experience_level,total
0,SE,1554
1,MI,664
2,EN,270
3,EX,96
