In [126]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [127]:
data = pd.read_csv('Salary_Data.csv')

In [128]:
data.shape

(6704, 6)

In [129]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6704 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6702 non-null   float64
 1   Gender               6702 non-null   object 
 2   Education Level      6701 non-null   object 
 3   Job Title            6702 non-null   object 
 4   Years of Experience  6701 non-null   float64
 5   Salary               6699 non-null   float64
dtypes: float64(3), object(3)
memory usage: 314.4+ KB


In [130]:
# Conteo de los niveles en las diferentes columnas categóricas
cols_cat = ['Gender', 'Education Level', 'Job Title']

for col in cols_cat:
  print(f'Columna {col}: {data[col].unique()} subniveles\n')

Columna Gender: ['Male' 'Female' nan 'Other'] subniveles

Columna Education Level: ["Bachelor's" "Master's" 'PhD' nan "Bachelor's Degree" "Master's Degree"
 'High School' 'phD'] subniveles

Columna Job Title: ['Software Engineer' 'Data Analyst' 'Senior Manager' 'Sales Associate'
 'Director' 'Marketing Analyst' 'Product Manager' 'Sales Manager'
 'Marketing Coordinator' 'Senior Scientist' 'Software Developer'
 'HR Manager' 'Financial Analyst' 'Project Manager' 'Customer Service Rep'
 'Operations Manager' 'Marketing Manager' 'Senior Engineer'
 'Data Entry Clerk' 'Sales Director' 'Business Analyst' 'VP of Operations'
 'IT Support' 'Recruiter' 'Financial Manager' 'Social Media Specialist'
 'Software Manager' 'Junior Developer' 'Senior Consultant'
 'Product Designer' 'CEO' 'Accountant' 'Data Scientist'
 'Marketing Specialist' 'Technical Writer' 'HR Generalist'
 'Project Engineer' 'Customer Success Rep' 'Sales Executive' 'UX Designer'
 'Operations Director' 'Network Engineer' 'Administrative 

In [131]:
for col in cols_cat:
  print(f'Columna {col}: {data[col].nunique()} subniveles\n')

Columna Gender: 3 subniveles

Columna Education Level: 7 subniveles

Columna Job Title: 193 subniveles



# Etapa 1 - Datos faltantes

In [132]:
data.isnull().sum()

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

In [133]:
data.dropna(inplace = True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6698 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6698 non-null   float64
 1   Gender               6698 non-null   object 
 2   Education Level      6698 non-null   object 
 3   Job Title            6698 non-null   object 
 4   Years of Experience  6698 non-null   float64
 5   Salary               6698 non-null   float64
dtypes: float64(3), object(3)
memory usage: 366.3+ KB


# Etapa 2 - Columnas irrelevantes

### Renombramiento de columnas

In [134]:
data.rename(columns={'Education Level': 'Education_Level',
                      'Job Title': 'Job_Title',
                      'Years of Experience': 'Years_of_Experience'}, inplace= True)
data.columns

Index(['Age', 'Gender', 'Education_Level', 'Job_Title', 'Years_of_Experience',
       'Salary'],
      dtype='object')

### Transformación de valores a minúscula

In [135]:
columnas_categoricas = ['Gender', 'Education_Level', "Job_Title"]

#Se transforman en minúscula
for column in data.columns:
    # Representar en minúsculas sólo si la columna es categórica
    if column in columnas_categoricas:
        data[column] = data[column].str.lower()

data.head()



Unnamed: 0,Age,Gender,Education_Level,Job_Title,Years_of_Experience,Salary
0,32.0,male,bachelor's,software engineer,5.0,90000.0
1,28.0,female,master's,data analyst,3.0,65000.0
2,45.0,male,phd,senior manager,15.0,150000.0
3,36.0,female,bachelor's,sales associate,7.0,60000.0
4,52.0,male,master's,director,20.0,200000.0


In [136]:
# Conteo de los niveles en las diferentes columnas categóricas
cols_cat = ['Gender', 'Education_Level', 'Job_Title']

In [137]:
for col in cols_cat:
  print(f'Columna {col}: {data[col].unique()} subniveles\n')


Columna Gender: ['male' 'female' 'other'] subniveles

Columna Education_Level: ["bachelor's" "master's" 'phd' "bachelor's degree" "master's degree"
 'high school'] subniveles

Columna Job_Title: ['software engineer' 'data analyst' 'senior manager' 'sales associate'
 'director' 'marketing analyst' 'product manager' 'sales manager'
 'marketing coordinator' 'senior scientist' 'software developer'
 'hr manager' 'financial analyst' 'project manager' 'customer service rep'
 'operations manager' 'marketing manager' 'senior engineer'
 'data entry clerk' 'sales director' 'business analyst' 'vp of operations'
 'it support' 'recruiter' 'financial manager' 'social media specialist'
 'software manager' 'junior developer' 'senior consultant'
 'product designer' 'ceo' 'accountant' 'data scientist'
 'marketing specialist' 'technical writer' 'hr generalist'
 'project engineer' 'customer success rep' 'sales executive' 'ux designer'
 'operations director' 'network engineer' 'administrative assistant'
 's

In [138]:
for col in cols_cat:
  print(f'Columna {col}: {data[col].nunique()} subniveles\n')

Columna Gender: 3 subniveles

Columna Education_Level: 6 subniveles

Columna Job_Title: 190 subniveles



### Columnas numéricas

In [139]:
data.describe()

Unnamed: 0,Age,Years_of_Experience,Salary
count,6698.0,6698.0,6698.0
mean,33.623022,8.095178,115329.253061
std,7.615784,6.060291,52789.792507
min,21.0,0.0,350.0
25%,28.0,3.0,70000.0
50%,32.0,7.0,115000.0
75%,38.0,12.0,160000.0
max,62.0,34.0,250000.0


### Valores de la columna Education_Level reducido/unidos

In [140]:
data['Education_Level'].unique()

array(["bachelor's", "master's", 'phd', "bachelor's degree",
       "master's degree", 'high school'], dtype=object)

In [141]:
# Define a dictionary to map current values to new standardized values
education_map = {
    "bachelor's degree": "bachelor's",
    "master's degree": "master's",
}

# Use the map to replace the values
data['Education_Level'] = data['Education_Level'].replace(education_map)

data['Education_Level'].unique()

array(["bachelor's", "master's", 'phd', 'high school'], dtype=object)

### Reducción de valores en la columna de Job_Title

In [142]:
data['Job_Title'].value_counts()

Job_Title
software engineer             518
data scientist                453
software engineer manager     376
data analyst                  363
senior project engineer       318
                             ... 
copywriter                      1
account manager                 1
help desk analyst               1
senior training specialist      1
software project manager        1
Name: count, Length: 190, dtype: int64

In [143]:
# Define a list of classifications
classifications = ['Engineer', 'Analyst', 'Associate', 'Director', 
                   'Coordinator', 'Scientist', 'Developer', 'Representative',
                   'Clerk', 'Rep', 'Support', 'Specialist', 'Consultant', 
                   'Designer', 'Writer', 'Generalist', 'Researcher', 
                   'Accountant', 'Recruiter', 'Officer', 'Advisor', 
                   'Producer', 'Copywriter']  # 'Manager' removed

# Create a new column 'Job Classification' initialized with 'Other'
data['Job Classification'] = 'Other'

# Loop through classifications and assign them to 'Job Classification' where appropriate
for classification in classifications:
    data.loc[data['Job Title'].str.contains(classification, case=False, na=False), 'Job Classification'] = classification

# Titles like 'CEO', 'VP', 'Executive', and 'Man' to be classified as 'Executive'
executives = ['CEO', 'VP', 'Executive', 'Man']
for executive in executives:
    data.loc[data['Job Title'].str.contains(executive, case=False, na=False), 'Job Classification'] = 'Executive' 

# Classify 'Manager'
data.loc[data['Job Title'].str.contains('Manager', case=False, na=False), 'Job Classification'] = 'Manager'

# Verify the changes
print(data['Job Classification'].value_counts())

KeyError: 'Job Title'

In [None]:
#data.drop('Job_Title', axis = 1, inplace = True)
#data.rename(columns={'Job_Classification': 'Job_Title'}, inplace = True)

In [144]:
# Verify the changes
data['Job_Title'].value_counts()

Job_Title
software engineer             518
data scientist                453
software engineer manager     376
data analyst                  363
senior project engineer       318
                             ... 
copywriter                      1
account manager                 1
help desk analyst               1
senior training specialist      1
software project manager        1
Name: count, Length: 190, dtype: int64

In [None]:
for col in cols_cat:
  print(f'Columna {col}: {data[col].unique()} subniveles\n')

Columna Gender: ['male' 'female' 'other'] subniveles

Columna Education_Level: ["bachelor's" "master's" 'phd' 'high school'] subniveles

Columna Job_Title: ['Engineer' 'Analyst' 'Manager' 'Associate' 'Director' 'Coordinator'
 'Scientist' 'Developer' 'Other'] subniveles

