In [11]:
import pandas as pd
import numpy as np

In [12]:
data = pd.read_csv("Dataset.csv")

# Dropping Index column
data.drop(columns=['Unnamed: 0'], inplace=True)
print(data.head())
print(data.shape)

# Dropping null values
print(data.isnull().sum())
data.dropna(axis=0, inplace=True)

    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary Country      Race  
0   90000.0      UK     White  
1   65000.0     USA  Hispanic  
2  150000.0  Canada     White  
3   60000.0     USA  Hispanic  
4  200000.0     USA     Asian  
(6704, 8)
Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
Country                0
Race                   0
dtype: int64


In [13]:
# Counting number of unique countries
print(data["Country"].value_counts())

USA          1359
China        1343
Australia    1336
UK           1335
Canada       1325
Name: Country, dtype: int64


In [14]:
# Renaming column
data.rename(columns={'Salary' : 'Salary (USD)'}, inplace=True)

# Dropping irrelevant columns
data.drop(columns=["Country"], inplace=True)
print(data)

       Age  Gender    Education Level              Job Title  \
0     32.0    Male         Bachelor's      Software Engineer   
1     28.0  Female           Master's           Data Analyst   
2     45.0    Male                PhD         Senior Manager   
3     36.0  Female         Bachelor's        Sales Associate   
4     52.0    Male           Master's               Director   
...    ...     ...                ...                    ...   
6699  49.0  Female                PhD  Director of Marketing   
6700  32.0    Male        High School        Sales Associate   
6701  30.0  Female  Bachelor's Degree      Financial Manager   
6702  46.0    Male    Master's Degree      Marketing Manager   
6703  26.0  Female        High School        Sales Executive   

      Years of Experience  Salary (USD)        Race  
0                     5.0       90000.0       White  
1                     3.0       65000.0    Hispanic  
2                    15.0      150000.0       White  
3              

In [15]:
print(data["Job Title"].unique())
len(data["Job Title"].unique())

['Software Engineer' 'Data Analyst' 'Senior Manager' 'Sales Associate'
 'Director' 'Marketing Analyst' 'Product Manager' 'Sales Manager'
 'Marketing Coordinator' 'Senior Scientist' 'Software Developer'
 'HR Manager' 'Financial Analyst' 'Project Manager' 'Customer Service Rep'
 'Operations Manager' 'Marketing Manager' 'Senior Engineer'
 'Data Entry Clerk' 'Sales Director' 'Business Analyst' 'VP of Operations'
 'IT Support' 'Recruiter' 'Financial Manager' 'Social Media Specialist'
 'Software Manager' 'Junior Developer' 'Senior Consultant'
 'Product Designer' 'CEO' 'Accountant' 'Data Scientist'
 'Marketing Specialist' 'Technical Writer' 'HR Generalist'
 'Project Engineer' 'Customer Success Rep' 'Sales Executive' 'UX Designer'
 'Operations Director' 'Network Engineer' 'Administrative Assistant'
 'Strategy Consultant' 'Copywriter' 'Account Manager'
 'Director of Marketing' 'Help Desk Analyst' 'Customer Service Manager'
 'Business Intelligence Analyst' 'Event Coordinator' 'VP of Finance'
 'G

191

In [16]:
def categorize_job_title(job_title):
    job_title = str(job_title).lower()
    if 'software' in job_title or 'developer' in job_title:
        return 'Software/Developer'
    elif 'data' in job_title or 'analyst' in job_title or 'scientist' in job_title:
        return 'Data Analyst/Scientist'
    elif 'manager' in job_title or 'director' in job_title or 'vp' in job_title:
        return 'Manager/Director/VP'
    elif 'sales' in job_title or 'representative' in job_title:
        return 'Sales'
    elif 'marketing' in job_title or 'social media' in job_title:
        return 'Marketing/Social Media'
    elif 'product' in job_title or 'designer' in job_title:
        return 'Product/Designer'
    elif 'hr' in job_title or 'human resources' in job_title:
        return 'HR/Human Resources'
    elif 'financial' in job_title or 'accountant' in job_title:
        return 'Financial/Accountant'
    elif 'project manager' in job_title:
        return 'Project Manager'
    elif 'it' in job_title or 'support' in job_title:
        return 'IT/Technical Support'
    elif 'operations' in job_title or 'supply chain' in job_title:
        return 'Operations/Supply Chain'
    elif 'customer service' in job_title or 'receptionist' in job_title:
        return 'Customer Service/Receptionist'
    else:
        return 'Other'
data['Job Title'] = data['Job Title'].apply(categorize_job_title)

In [17]:
print(data["Job Title"].unique())
len(data["Job Title"].unique())
print(data["Education Level"].unique())
print(data["Education Level"].value_counts())

['Software/Developer' 'Data Analyst/Scientist' 'Manager/Director/VP'
 'Sales' 'Marketing/Social Media' 'Customer Service/Receptionist' 'Other'
 'IT/Technical Support' 'Product/Designer' 'Financial/Accountant'
 'HR/Human Resources' 'Operations/Supply Chain']
["Bachelor's" "Master's" 'PhD' "Bachelor's Degree" "Master's Degree"
 'High School' 'phD']
Bachelor's Degree    2265
Master's Degree      1572
PhD                  1368
Bachelor's            756
High School           448
Master's              288
phD                     1
Name: Education Level, dtype: int64


In [18]:
def group_education(Educaton):
    Educaton = str(Educaton).lower()
    if 'high school' in Educaton:
        return 'High School'
    elif 'bachelor\'s' in Educaton or 'bachelor\'s degree' in Educaton:
        return 'Bachelors'
    elif 'master\'s' in Educaton or 'master\'s degree' in Educaton:
        return 'Masters'
    elif 'phd' in Educaton:
        return 'PhD'
    
data['Education Level'] = data['Education Level'].apply(group_education)
print(data["Education Level"].value_counts())


Bachelors      3021
Masters        1860
PhD            1369
High School     448
Name: Education Level, dtype: int64


In [19]:
data.to_csv('preprocessed_data.csv', index=False)