# 1- Importando bibliotecas e dados

In [61]:
import pandas as pd

In [62]:
# Read the salary dataset from a CSV file given by a relative path and
# preview the first rows.
dados = pd.read_csv('Salary Dataset.csv')
dados.head()

Unnamed: 0,Company Name,Job Title,Salaries Reported,Location,Salary
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr"
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr"
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr"
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr"
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr"


In [63]:
# (number of rows, number of columns) of the loaded frame.
dados.shape

(4344, 5)

In [64]:
# Per-column dtype and non-null count, used to spot missing values.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4344 entries, 0 to 4343
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Company Name       4341 non-null   object 
 1   Job Title          4344 non-null   object 
 2   Salaries Reported  4342 non-null   float64
 3   Location           4344 non-null   object 
 4   Salary             4344 non-null   object 
dtypes: float64(1), object(4)
memory usage: 169.8+ KB


# 2- Ajustando nomes das variáveis

In [65]:
# Replace the original English headers with short Portuguese snake_case names.
dados.rename(
    {
        'Company Name': 'empresa',
        'Job Title': 'cargo',
        'Salaries Reported': 'salarios_informados',
        'Location': 'cidade',
        'Salary': 'salario',
    },
    axis='columns',
    inplace=True,
)

dados.head()

Unnamed: 0,empresa,cargo,salarios_informados,cidade,salario
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr"
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr"
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr"
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr"
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr"


# 3- Tratando os dados nulos

In [66]:
# Number of missing values in each column.
dados.isna().sum()

empresa                3
cargo                  0
salarios_informados    2
cidade                 0
salario                0
dtype: int64

In [67]:
# Rows whose company name is missing.
dados.loc[dados['empresa'].isna()]

Unnamed: 0,empresa,cargo,salarios_informados,cidade,salario
2279,,Data Analyst,1.0,Pune,"₹23,500/mo"
3227,,Data Engineer,1.0,Pune,"₹26,20,604/yr"
4026,,Machine Learning Engineer/Data Scientist,1.0,Pune,"₹56,465/mo"


In [68]:
# Rows whose reported-salaries count is missing.
dados.loc[dados['salarios_informados'].isna()]

Unnamed: 0,empresa,cargo,salarios_informados,cidade,salario
4027,First Student Data Scientist,Data Analyst,,Pune,"₹4,53,300/yr"
4237,Amazon,Machine Learning Data Associate II,,New Delhi,"₹3,29,439/yr"


In [69]:
# Drop every row that has at least one missing value.
# The explicit .copy() makes the result an independent frame, so the later
# cells that add or drop columns on `dados` do not emit SettingWithCopyWarning
# on pandas versions without copy-on-write.
dados = dados.dropna().copy()
dados.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4339 entries, 0 to 4343
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   empresa              4339 non-null   object 
 1   cargo                4339 non-null   object 
 2   salarios_informados  4339 non-null   float64
 3   cidade               4339 non-null   object 
 4   salario              4339 non-null   object 
dtypes: float64(1), object(4)
memory usage: 203.4+ KB


# 4- Preparando coluna salário

In [70]:
# Check which pay periods appear after the "/" in the salary strings.

dados['salario'].str.split('/').str[-1].unique()

array(['yr', 'mo', 'hr'], dtype=object)

In [71]:
# Check which leading characters (currency markers) appear in the salary strings.

dados['salario'].str[0].unique()

array(['₹', '$', '£', 'A'], dtype=object)

In [72]:
# Find out what the currency marker "A" stands for by showing the matching rows.

dados[dados['salario'].str.startswith('A')]

Unnamed: 0,empresa,cargo,salarios_informados,cidade,salario
2278,md,Data Analyst,1.0,Pune,"AFN 1,56,179/yr"


O "A" é a primeira letra de "AFN", o código do afegane, a moeda local do Afeganistão.

In [73]:
# Function that converts a raw salary string into a monthly value in Real (R$).

def converter(salario, taxas=None, horas_mes=220):
    """Convert a raw salary string into a monthly amount in Brazilian Real.

    The text is expected to look like ``'<currency><amount>/<period>'``,
    for example ``'₹6,48,573/yr'`` or ``'AFN 1,56,179/yr'``.

    Parameters
    ----------
    salario : str
        Raw salary text. Everything before the first digit (or decimal
        point) is taken as the currency marker; commas in the amount are
        treated as grouping separators and removed.
    taxas : dict, optional
        Exchange rates to Real keyed by currency marker. Entries are merged
        over the built-in rates for ``'₹'``, ``'£'`` and ``'AFN'``, so they
        can add a currency or override a default.
    horas_mes : float, optional
        Number of working hours used to turn an hourly amount into a monthly
        one. Defaults to 220.

    Returns
    -------
    float
        The amount multiplied by the currency rate and scaled to one month:
        ``'yr'`` is divided by 12, ``'hr'`` is multiplied by ``horas_mes``,
        and any other period is left as is.

    Raises
    ------
    ValueError
        If the amount part cannot be parsed as a number.

    Notes
    -----
    A currency marker without a rate is passed through with a factor of 1.
    NOTE(review): this includes ``'$'``, which has no built-in rate, so
    dollar amounts come back numerically unchanged unless a rate is supplied
    through ``taxas`` (e.g. ``taxas={'$': rate}``).
    """
    # Built-in snapshot rates; they are fixed numbers, not live quotes.
    taxas_efetivas = {'₹': 0.0688, '£': 5.8217, 'AFN': 0.05582}
    if taxas:
        taxas_efetivas.update(taxas)

    partes = salario.split('/')
    trecho = partes[0]          # currency marker + amount
    freq = partes[-1].strip()   # pay period: 'yr', 'mo' or 'hr'

    # Split the currency marker from the amount at the first numeric character,
    # so multi-letter codes such as 'AFN' need no special-casing.
    inicio = next(
        (i for i, c in enumerate(trecho) if c.isdigit() or c == '.'),
        len(trecho),
    )
    moeda = trecho[:inicio].strip()
    valor = float(trecho[inicio:].replace(',', ''))

    # Convert the currency to Real.
    valor *= taxas_efetivas.get(moeda, 1.0)

    # Convert the pay period to one month.
    if freq == 'yr':
        valor /= 12
    elif freq == 'hr':
        valor *= horas_mes

    return valor

In [74]:
# Apply the converter to every raw salary string and keep the result in a new
# column. `assign` returns a new frame instead of writing into `dados`, which
# avoids SettingWithCopyWarning when `dados` is the result of an earlier
# row filter such as dropna().

dados = dados.assign(salario_mes_real=dados['salario'].apply(converter))
dados.head()

Unnamed: 0,empresa,cargo,salarios_informados,cidade,salario,salario_mes_real
0,Mu Sigma,Data Scientist,105.0,Bangalore,"₹6,48,573/yr",3718.4852
1,IBM,Data Scientist,95.0,Bangalore,"₹11,91,950/yr",6833.846667
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,"₹8,36,874/yr",4798.0776
3,Impact Analytics,Data Scientist,40.0,Bangalore,"₹6,69,578/yr",3838.913867
4,Accenture,Data Scientist,32.0,Bangalore,"₹9,44,110/yr",5412.897333


In [75]:
# Remove the original raw salary column now that the converted one exists.
# Rebinding instead of inplace=True keeps the cell free of in-place mutation,
# and errors='ignore' lets the cell be re-run after the column is already gone.

dados = dados.drop(columns='salario', errors='ignore')
dados.head()

Unnamed: 0,empresa,cargo,salarios_informados,cidade,salario_mes_real
0,Mu Sigma,Data Scientist,105.0,Bangalore,3718.4852
1,IBM,Data Scientist,95.0,Bangalore,6833.846667
2,Tata Consultancy Services,Data Scientist,66.0,Bangalore,4798.0776
3,Impact Analytics,Data Scientist,40.0,Bangalore,3838.913867
4,Accenture,Data Scientist,32.0,Bangalore,5412.897333
