In [25]:
#Libraries
import pandas as pd
import numpy as np
import seaborn as sns


In [35]:
import yaml
 
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")

In [36]:
config

{'input_data': {'file': '../data/raw/salaries.csv'},
 'output_data': {'file': '../data/clean/AI_ML_Salaries_clean.csv'}}

In [37]:
df = pd.read_csv(config['input_data']['file'])
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M
1,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M
2,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M
3,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M
4,2025,SE,FT,Engineer,143000,USD,143000,US,0,US,M
...,...,...,...,...,...,...,...,...,...,...,...
73143,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
73144,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
73145,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
73146,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [38]:
# Check names of all columns
df.columns 

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [31]:
# Check data format
df.info

<bound method DataFrame.info of        work_year experience_level employment_type                 job_title  \
0           2025               SE              FT        Data Product Owner   
1           2025               SE              FT        Data Product Owner   
2           2025               SE              FT        Data Product Owner   
3           2025               SE              FT        Data Product Owner   
4           2025               SE              FT                  Engineer   
...          ...              ...             ...                       ...   
73143       2020               SE              FT            Data Scientist   
73144       2021               MI              FT  Principal Data Scientist   
73145       2020               EN              FT            Data Scientist   
73146       2020               EN              CT     Business Data Analyst   
73147       2021               SE              FT            Data Scientist   

        salary sala

In [32]:
# Check # of rows and columns
df.shape

(73148, 11)

In [33]:
# Check for numbers and strings
df.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [34]:
# Check for null values
df.isna().any()

work_year             False
experience_level      False
employment_type       False
job_title             False
salary                False
salary_currency       False
salary_in_usd         False
employee_residence    False
remote_ratio          False
company_location      False
company_size          False
dtype: bool

In [11]:
# Check for duplicates
df.duplicated().sum()

np.int64(39124)

In [12]:
# Verifying duplicates
df.duplicated().any()

np.True_

In [20]:
# Remove duplicates and update the DataFrame
new_df = df.drop_duplicates()

In [21]:
# Remove duplicates and reset the index
new_df.reset_index(drop=True)
new_df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M
1,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M
4,2025,SE,FT,Engineer,143000,USD,143000,US,0,US,M
5,2025,SE,FT,Engineer,106000,USD,106000,US,0,US,M
6,2025,SE,FT,Manager,360200,USD,360200,US,0,US,M


In [22]:
new_df.shape

(34024, 11)

In [23]:
# Drop multiple columns by name
new_df = df.drop(columns=['salary', 'company_location'])
new_df 

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_size
0,2025,SE,FT,Data Product Owner,USD,170000,US,0,M
1,2025,SE,FT,Data Product Owner,USD,110000,US,0,M
4,2025,SE,FT,Engineer,USD,143000,US,0,M
5,2025,SE,FT,Engineer,USD,106000,US,0,M
6,2025,SE,FT,Manager,USD,360200,US,0,M
...,...,...,...,...,...,...,...,...,...
73143,2020,SE,FT,Data Scientist,USD,412000,US,100,L
73144,2021,MI,FT,Principal Data Scientist,USD,151000,US,100,L
73145,2020,EN,FT,Data Scientist,USD,105000,US,100,S
73146,2020,EN,CT,Business Data Analyst,USD,100000,US,100,L


In [24]:
# Rename multiple columns at once
new_df.rename(
    columns={
        'work_year': 'Year',
        'experience_level': 'Experience_Level',
        'employment_type': 'Employment_Type',
        'job_title': 'Position',
        'salary_currency': 'Currency',
        'salary_in_usd': 'Salary',
        'employee_residence': 'Country',
        'remote_ratio': 'Remote_Amount',
        'company_size': 'Company_Size'
        
    }, 
    inplace=True
)

print(new_df.columns)

Index(['Year', 'Experience_Level', 'Employment_Type', 'Position', 'Currency',
       'Salary', 'Country', 'Remote_Amount', 'Company_Size'],
      dtype='object')


In [27]:
job_titles = new_df['Position'].nunique()
print(job_titles)

289


In [28]:
job_titles = new_df['Position'].unique()
print(job_titles)

['Data Product Owner' 'Engineer' 'Manager' 'Data Scientist' 'AI Architect'
 'Data Engineer' 'AI Engineer' 'Research Engineer' 'Data Analyst'
 'Business Intelligence Developer' 'Data Manager'
 'Data Management Specialist' 'Quantitative Developer' 'Software Engineer'
 'Technical Lead' 'Sales Development Representative' 'Consultant'
 'Analyst' 'System Engineer' 'Product Manager' 'Analytics Engineer'
 'Solution Architect' 'Associate' 'Encounter Data Management Professional'
 'Data Infrastructure Engineer' 'Data Architect' 'Data Team Lead'
 'Business Intelligence Lead' 'DevOps Engineer' 'Decision Scientist'
 'Machine Learning Engineer' 'Architect' 'Data Visualization Engineer'
 'Applied Scientist' 'Data Governance Analyst' 'Data Governance'
 'Data Quality Analyst' 'Lead Analyst' 'Data Specialist'
 'Research Scientist' 'Software Developer' 'Business Intelligence Analyst'
 'Solutions Architect' 'Engineering Manager' 'MLOps Engineer'
 'Business Intelligence Engineer' 'Data Developer'
 'Data an