In [61]:
#Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics

In [62]:
import yaml
 
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except:
    print("Yaml configuration file not found!")

In [63]:
config

{'input_data': {'file': '../data/raw/salaries.csv'},
 'output_data': {'file': '../data/clean/AI_ML_Salaries_clean.csv'}}

In [89]:
df = pd.read_csv(config['output_data']['file'])
df

Unnamed: 0,Year,Experience_Level,Employment_Type,Position,Currency,Salary,Country,Remote_Amount,Company_Size
0,2025,Senior Level,Full-time,"Products, Research & Development",USD,170000,US,0,Medium
1,2025,Senior Level,Full-time,"Products, Research & Development",USD,110000,US,0,Medium
2,2025,Senior Level,Full-time,"Products, Research & Development",USD,170000,US,0,Medium
3,2025,Senior Level,Full-time,"Products, Research & Development",USD,110000,US,0,Medium
4,2025,Senior Level,Full-time,"Products, Research & Development",USD,143000,US,0,Medium
...,...,...,...,...,...,...,...,...,...
65872,2021,Senior Level,Full-time,"Data Science, Engineers & Analytics",USD,165000,US,100,Large
65873,2020,Senior Level,Full-time,"Data Science, Engineers & Analytics",USD,412000,US,100,Large
65874,2021,Mid Level,Full-time,"Data Science, Engineers & Analytics",USD,151000,US,100,Large
65875,2020,Entry Level,Full-time,"Data Science, Engineers & Analytics",USD,105000,US,100,Small


In [90]:
df.columns 

Index(['Year', 'Experience_Level', 'Employment_Type', 'Position', 'Currency',
       'Salary', 'Country', 'Remote_Amount', 'Company_Size'],
      dtype='object')

In [91]:
# Check data format
df.info

<bound method DataFrame.info of        Year Experience_Level Employment_Type  \
0      2025    Senior Level        Full-time   
1      2025    Senior Level        Full-time   
2      2025    Senior Level        Full-time   
3      2025    Senior Level        Full-time   
4      2025    Senior Level        Full-time   
...     ...              ...             ...   
65872  2021    Senior Level        Full-time   
65873  2020    Senior Level        Full-time   
65874  2021       Mid Level        Full-time   
65875  2020     Entry Level        Full-time   
65876  2020     Entry Level         Contract   

                                  Position Currency  Salary Country  \
0         Products, Research & Development      USD  170000      US   
1         Products, Research & Development      USD  110000      US   
2         Products, Research & Development      USD  170000      US   
3         Products, Research & Development      USD  110000      US   
4         Products, Research & Devel

In [92]:
# Check # of rows and columns
df.shape

(65877, 9)

In [93]:
# Check for numbers and strings
df.dtypes

Year                 int64
Experience_Level    object
Employment_Type     object
Position            object
Currency            object
Salary               int64
Country             object
Remote_Amount        int64
Company_Size        object
dtype: object

In [94]:
# Check for null values
df.isna().any()

Year                False
Experience_Level    False
Employment_Type     False
Position            False
Currency            False
Salary              False
Country             False
Remote_Amount       False
Company_Size        False
dtype: bool

In [95]:
list(set(df.dtypes.tolist()))

[dtype('int64'), dtype('O')]

In [96]:
# Extracting column names with numerical data types from the dataframe
df.select_dtypes("number").columns

Index(['Year', 'Salary', 'Remote_Amount'], dtype='object')

In [97]:
# Extracting column names with categorical data types from the dataframe
df.select_dtypes("object").columns

Index(['Experience_Level', 'Employment_Type', 'Position', 'Currency',
       'Country', 'Company_Size'],
      dtype='object')

In [98]:
# Counting and sorting the unique values for each numerical column in descending order
df.select_dtypes("number").nunique().sort_values(ascending=False)

Salary           6173
Year                6
Remote_Amount       3
dtype: int64

In [99]:
# Counting and sorting the unique values for each numerical column in descending order
df.select_dtypes("object").nunique().sort_values(ascending=False)

Position            6
Experience_Level    4
Employment_Type     3
Company_Size        3
Currency            1
Country             1
dtype: int64

In [100]:
# Showing only numbers
only_num = df.select_dtypes("number")
only_num 

Unnamed: 0,Year,Salary,Remote_Amount
0,2025,170000,0
1,2025,110000,0
2,2025,170000,0
3,2025,110000,0
4,2025,143000,0
...,...,...,...
65872,2021,165000,100
65873,2020,412000,100
65874,2021,151000,100
65875,2020,105000,100


In [102]:
# Showing only object
only_cat = df.select_dtypes("object")
only_cat 

Unnamed: 0,Experience_Level,Employment_Type,Position,Currency,Country,Company_Size
0,Senior Level,Full-time,"Products, Research & Development",USD,US,Medium
1,Senior Level,Full-time,"Products, Research & Development",USD,US,Medium
2,Senior Level,Full-time,"Products, Research & Development",USD,US,Medium
3,Senior Level,Full-time,"Products, Research & Development",USD,US,Medium
4,Senior Level,Full-time,"Products, Research & Development",USD,US,Medium
...,...,...,...,...,...,...
65872,Senior Level,Full-time,"Data Science, Engineers & Analytics",USD,US,Large
65873,Senior Level,Full-time,"Data Science, Engineers & Analytics",USD,US,Large
65874,Mid Level,Full-time,"Data Science, Engineers & Analytics",USD,US,Large
65875,Entry Level,Full-time,"Data Science, Engineers & Analytics",USD,US,Small
