In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Reading in the data
df = pd.read_csv("LinkedInJobs_MLDataset.csv")

In [3]:
# Define new column names
new_column_names = {
    'Co_Nm': 'Company_Name',
    'Co_Pg_Lstd': 'Company_Page_Listed',
    'Emp_Cnt': 'Employee_Count',
    'Flw_Cnt': 'Followers_Count',
    'Job_Ttl': 'Job_Title',
    'Job_Desc': 'Job_Description',
    'Is_Supvsr': 'Is_Supervisor',
    'max_sal': 'Max_Salary',
    'med_sal': 'Median_Salary',
    'min_sal': 'Min_Salary',
    'py_prd': 'Posting_Period',
    'wrk_typ': 'Work_Type',
    'loc': 'Location',
    'st_code': 'State_Code',
    'is_remote': 'Is_Remote',
    'views': 'Views',
    'app_typ': 'Application_Type',
    'app_is_off': 'Application_Is_Offsite',
    'xp_lvl': 'Experience_Level',
    'domain': 'Domain',
    'has_post_domain': 'Has_Posting_Domain',
    'is_sponsored': 'Is_Sponsored',
    'base_comp': 'Base_Compensation'
}

# Rename columns
df.rename(columns=new_column_names, inplace=True)

# Check the updated column names
print(df.columns)

Index(['Company_Name', 'Company_Page_Listed', 'Employee_Count',
       'Followers_Count', 'Job_Title', 'Job_Description', 'Is_Supervisor',
       'Max_Salary', 'Median_Salary', 'Min_Salary', 'Posting_Period',
       'py_lstd', 'Work_Type', 'Location', 'State_Code', 'Is_Remote', 'Views',
       'Application_Type', 'Application_Is_Offsite', 'Experience_Level',
       'Domain', 'Has_Posting_Domain', 'Is_Sponsored', 'Base_Compensation'],
      dtype='object')


In [4]:
#Checking the columns for data type information and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33246 entries, 0 to 33245
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Company_Name            33242 non-null  object 
 1   Company_Page_Listed     33246 non-null  bool   
 2   Employee_Count          33246 non-null  int64  
 3   Followers_Count         33246 non-null  int64  
 4   Job_Title               33246 non-null  object 
 5   Job_Description         33244 non-null  object 
 6   Is_Supervisor           33246 non-null  bool   
 7   Max_Salary              33246 non-null  float64
 8   Median_Salary           33246 non-null  float64
 9   Min_Salary              33246 non-null  float64
 10  Posting_Period          33246 non-null  object 
 11  py_lstd                 33246 non-null  bool   
 12  Work_Type               33246 non-null  object 
 13  Location                33246 non-null  object 
 14  State_Code              33246 non-null

In [5]:
df2=df.loc[df.Min_Salary>0]

df2

Unnamed: 0,Company_Name,Company_Page_Listed,Employee_Count,Followers_Count,Job_Title,Job_Description,Is_Supervisor,Max_Salary,Median_Salary,Min_Salary,...,State_Code,Is_Remote,Views,Application_Type,Application_Is_Offsite,Experience_Level,Domain,Has_Posting_Domain,Is_Sponsored,Base_Compensation
0,HearingLife,True,1171,11417,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,False,5250.00,5250.00,5250.00,...,SC,0,9,OffsiteApply,True,Entry level,careers-demant.icims.com,True,0,1
3,Episcopal Communities & Services,True,36,305,Cook,descriptionTitle\n\n Looking for a great oppor...,False,22.27,22.27,22.27,...,CA,0,1,OffsiteApply,True,Entry level,jobs.apploi.com,True,0,1
4,"iHerb, LLC",True,1227,51933,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",False,275834.00,240895.00,205956.00,...,XF,1,0,OffsiteApply,True,Mid-Senior level,careers.iherb.com,True,0,1
6,Robert Half,True,32197,2609057,Senior Accountant,"Senior Accountant, San Mateo location, Commerc...",False,110000.00,107500.00,105000.00,...,CA,0,35,ComplexOnsiteApply,False,Associate,,False,1,1
8,MasTec Communications Group,True,2382,42211,Tower Technician II,Overview\n\nAt MasTec Communications Group we ...,False,25.00,23.50,22.00,...,CA,0,0,OffsiteApply,True,Not Listed,careers.masteccommunicationsgroup.com,True,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33234,Hadrian,True,100,5926,Quality Engineer,Hadrian - Manufacturing the Future\n\nHadrian ...,False,140000.00,120000.00,100000.00,...,CA,0,6,OffsiteApply,True,Entry level,jobs.lever.co,True,0,1
33237,Visual Lease,True,194,7154,Account Executive - Enterprise,Are You The Right Fit?\n\nVisual Lease is look...,False,150000.00,130000.00,110000.00,...,NJ,1,9,OffsiteApply,True,Mid-Senior level,jobs.lever.co,True,0,1
33238,Gravity IT Resources,True,104,112874,Technical Architect,Job Title: Technical ArchitectLocation: UtahJo...,False,125000.00,125000.00,125000.00,...,XF,0,2,ComplexOnsiteApply,False,Entry level,,False,0,1
33241,Crowe,True,9109,134126,Private Equity Tax Manager,Your Journey at Crowe Starts Here:\n\nAt Crowe...,True,183368.00,136564.00,89760.00,...,MA,0,2,OffsiteApply,True,Not Listed,careers.crowe.com,True,0,1


In [6]:
# Dropping non-important columns that semantically have no relation to salary 
columns_to_drop = ['Company_Page_Listed', 'Views', 'Application_Type', 'Application_Is_Offsite', 'Domain', 'Has_Posting_Domain', 'py_lstd']
df2 = df2.drop(columns=columns_to_drop)

In [7]:
# Dropping null values from dataset

df2 = df2.dropna()

In [8]:
# Checking the different types of posting periods
df2['Posting_Period'].value_counts()

Posting_Period
YEARLY     8004
HOURLY     5035
MONTHLY     224
WEEKLY       82
ONCE          1
Name: count, dtype: int64

In [9]:
# Define conversion factors for each pay cycle
conversion_factors = {'HOURLY': 40 * 52,  # Assuming 40 hours per week and 52 weeks per year
                      'MONTHLY': 12,       # Monthly to yearly
                      'WEEKLY': 52,        # Weekly to yearly 
                      'YEARLY': 1,
                      'Unpaid' : 1,
                      'ONCE': 1}
                        

In [10]:
# Apply conversion to standardize salary to yearly

df2['Min_Salary_Yearly'] = df2.apply(lambda row: row['Min_Salary'] * conversion_factors[row['Posting_Period']], axis=1)

In [41]:
df2['Min_Salary_Yearly'].describe()

count    1.334600e+04
mean     1.255962e+05
std      3.136613e+06
min      1.200000e+01
25%      4.650740e+04
50%      7.000000e+04
75%      1.053008e+05
max      3.120000e+08
Name: Min_Salary_Yearly, dtype: float64

In [12]:
more_columns_to_drop = ['Posting_Period', 'Min_Salary', 'Max_Salary', 'Median_Salary']

df2 = df2.drop(columns=more_columns_to_drop)

In [13]:
df2

Unnamed: 0,Company_Name,Employee_Count,Followers_Count,Job_Title,Job_Description,Is_Supervisor,Work_Type,Location,State_Code,Is_Remote,Experience_Level,Is_Sponsored,Base_Compensation,Min_Salary_Yearly
0,HearingLife,1171,11417,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,False,Full-time,"Little River, SC",SC,0,Entry level,0,1,63000.0
3,Episcopal Communities & Services,36,305,Cook,descriptionTitle\n\n Looking for a great oppor...,False,Full-time,"Aliso Viejo, CA",CA,0,Entry level,0,1,46321.6
4,"iHerb, LLC",1227,51933,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",False,Full-time,United States,XF,1,Mid-Senior level,0,1,205956.0
6,Robert Half,32197,2609057,Senior Accountant,"Senior Accountant, San Mateo location, Commerc...",False,Full-time,"San Mateo, CA",CA,0,Associate,1,1,105000.0
8,MasTec Communications Group,2382,42211,Tower Technician II,Overview\n\nAt MasTec Communications Group we ...,False,Full-time,"Fresno, CA",CA,0,Not Listed,0,1,45760.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33234,Hadrian,100,5926,Quality Engineer,Hadrian - Manufacturing the Future\n\nHadrian ...,False,Full-time,"Los Angeles, CA",CA,0,Entry level,0,1,100000.0
33237,Visual Lease,194,7154,Account Executive - Enterprise,Are You The Right Fit?\n\nVisual Lease is look...,False,Full-time,"Woodbridge, NJ",NJ,1,Mid-Senior level,0,1,110000.0
33238,Gravity IT Resources,104,112874,Technical Architect,Job Title: Technical ArchitectLocation: UtahJo...,False,Contract,Salt Lake City Metropolitan Area,XF,0,Entry level,0,1,125000.0
33241,Crowe,9109,134126,Private Equity Tax Manager,Your Journey at Crowe Starts Here:\n\nAt Crowe...,True,Full-time,"Boston, MA",MA,0,Not Listed,0,1,89760.0


In [14]:
df2["Location"].value_counts()

Location
United States                      904
New York, NY                       576
Los Angeles, CA                    265
Seattle, WA                        194
New York City Metropolitan Area    192
                                  ... 
Seabrook, NH                         1
St Clair Shores, MI                  1
Douglasville, GA                     1
Cardiff-by-the-Sea, CA               1
Fort Stockton, TX                    1
Name: count, Length: 2562, dtype: int64

In [15]:
# Getting number of unique locations in the data
unique_count = df2["Location"].nunique()

print("Number of unique locations:", unique_count)

Number of unique locations: 2562


In [16]:
# Dropping Location column as it is not feasible to convert it into a categorical variable
df2 = df2.drop(columns=['Location'])

In [17]:
#Performing one-hot encoding on State code column and converting it into binary
one_hot_encoded = pd.get_dummies(df['State_Code'])
one_hot_encoded = one_hot_encoded.astype(int)


df2 = pd.concat([df2, one_hot_encoded], axis=1)

df2 = df2.drop(columns=['State_Code'])

In [18]:
df2

Unnamed: 0,Company_Name,Employee_Count,Followers_Count,Job_Title,Job_Description,Is_Supervisor,Work_Type,Is_Remote,Experience_Level,Is_Sponsored,...,TN,TX,UT,VA,VT,WA,WI,WV,WY,XF
0,HearingLife,1171.0,11417.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,False,Full-time,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Episcopal Communities & Services,36.0,305.0,Cook,descriptionTitle\n\n Looking for a great oppor...,False,Full-time,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
4,"iHerb, LLC",1227.0,51933.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",False,Full-time,1.0,Mid-Senior level,0.0,...,0,0,0,0,0,0,0,0,0,1
6,Robert Half,32197.0,2609057.0,Senior Accountant,"Senior Accountant, San Mateo location, Commerc...",False,Full-time,0.0,Associate,1.0,...,0,0,0,0,0,0,0,0,0,0
8,MasTec Communications Group,2382.0,42211.0,Tower Technician II,Overview\n\nAt MasTec Communications Group we ...,False,Full-time,0.0,Not Listed,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33239,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
33240,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
33242,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
33244,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1


In [19]:
df2 = df2.dropna()
df2

Unnamed: 0,Company_Name,Employee_Count,Followers_Count,Job_Title,Job_Description,Is_Supervisor,Work_Type,Is_Remote,Experience_Level,Is_Sponsored,...,TN,TX,UT,VA,VT,WA,WI,WV,WY,XF
0,HearingLife,1171.0,11417.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,False,Full-time,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Episcopal Communities & Services,36.0,305.0,Cook,descriptionTitle\n\n Looking for a great oppor...,False,Full-time,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
4,"iHerb, LLC",1227.0,51933.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",False,Full-time,1.0,Mid-Senior level,0.0,...,0,0,0,0,0,0,0,0,0,1
6,Robert Half,32197.0,2609057.0,Senior Accountant,"Senior Accountant, San Mateo location, Commerc...",False,Full-time,0.0,Associate,1.0,...,0,0,0,0,0,0,0,0,0,0
8,MasTec Communications Group,2382.0,42211.0,Tower Technician II,Overview\n\nAt MasTec Communications Group we ...,False,Full-time,0.0,Not Listed,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33234,Hadrian,100.0,5926.0,Quality Engineer,Hadrian - Manufacturing the Future\n\nHadrian ...,False,Full-time,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
33237,Visual Lease,194.0,7154.0,Account Executive - Enterprise,Are You The Right Fit?\n\nVisual Lease is look...,False,Full-time,1.0,Mid-Senior level,0.0,...,0,0,0,0,0,0,0,0,0,0
33238,Gravity IT Resources,104.0,112874.0,Technical Architect,Job Title: Technical ArchitectLocation: UtahJo...,False,Contract,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,1
33241,Crowe,9109.0,134126.0,Private Equity Tax Manager,Your Journey at Crowe Starts Here:\n\nAt Crowe...,True,Full-time,0.0,Not Listed,0.0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Checking different types of work types

df2['Work_Type'].value_counts()

Work_Type
Full-time     11183
Contract       1306
Part-time       633
Temporary       118
Internship       63
Other            43
Name: count, dtype: int64

In [21]:
# Removing volunteer type positions 

df2=df2.loc[df2.Work_Type!="Volunteer"]

In [22]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the "Work_Type" column
encoded_values = {'Full-time': 6, 'Contract': 5, 'Part-time': 4, 'Temporary': 3, 'Internship': 2, 'Other': 1}
df2['Work_Type'] = df2['Work_Type'].map(encoded_values)

df2

Unnamed: 0,Company_Name,Employee_Count,Followers_Count,Job_Title,Job_Description,Is_Supervisor,Work_Type,Is_Remote,Experience_Level,Is_Sponsored,...,TN,TX,UT,VA,VT,WA,WI,WV,WY,XF
0,HearingLife,1171.0,11417.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,False,6,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Episcopal Communities & Services,36.0,305.0,Cook,descriptionTitle\n\n Looking for a great oppor...,False,6,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
4,"iHerb, LLC",1227.0,51933.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",False,6,1.0,Mid-Senior level,0.0,...,0,0,0,0,0,0,0,0,0,1
6,Robert Half,32197.0,2609057.0,Senior Accountant,"Senior Accountant, San Mateo location, Commerc...",False,6,0.0,Associate,1.0,...,0,0,0,0,0,0,0,0,0,0
8,MasTec Communications Group,2382.0,42211.0,Tower Technician II,Overview\n\nAt MasTec Communications Group we ...,False,6,0.0,Not Listed,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33234,Hadrian,100.0,5926.0,Quality Engineer,Hadrian - Manufacturing the Future\n\nHadrian ...,False,6,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,0
33237,Visual Lease,194.0,7154.0,Account Executive - Enterprise,Are You The Right Fit?\n\nVisual Lease is look...,False,6,1.0,Mid-Senior level,0.0,...,0,0,0,0,0,0,0,0,0,0
33238,Gravity IT Resources,104.0,112874.0,Technical Architect,Job Title: Technical ArchitectLocation: UtahJo...,False,5,0.0,Entry level,0.0,...,0,0,0,0,0,0,0,0,0,1
33241,Crowe,9109.0,134126.0,Private Equity Tax Manager,Your Journey at Crowe Starts Here:\n\nAt Crowe...,True,6,0.0,Not Listed,0.0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
df2['Experience_Level'].value_counts()

Experience_Level
Mid-Senior level    4638
Not Listed          3557
Entry level         2987
Associate           1336
Director             574
Executive            133
Internship           121
Name: count, dtype: int64

In [24]:
# Define a mapping dictionary with custom encoding order
encoded_values = {'Not Listed': 0, 'Internship': 1, 'Entry level': 2, 'Associate': 3, 'Mid-Senior level': 4, 'Executive': 5, 'Director': 6}

df2['Experience_Level'] = df2['Experience_Level'].map(encoded_values)

df2

Unnamed: 0,Company_Name,Employee_Count,Followers_Count,Job_Title,Job_Description,Is_Supervisor,Work_Type,Is_Remote,Experience_Level,Is_Sponsored,...,TN,TX,UT,VA,VT,WA,WI,WV,WY,XF
0,HearingLife,1171.0,11417.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,False,6,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Episcopal Communities & Services,36.0,305.0,Cook,descriptionTitle\n\n Looking for a great oppor...,False,6,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,0
4,"iHerb, LLC",1227.0,51933.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",False,6,1.0,4,0.0,...,0,0,0,0,0,0,0,0,0,1
6,Robert Half,32197.0,2609057.0,Senior Accountant,"Senior Accountant, San Mateo location, Commerc...",False,6,0.0,3,1.0,...,0,0,0,0,0,0,0,0,0,0
8,MasTec Communications Group,2382.0,42211.0,Tower Technician II,Overview\n\nAt MasTec Communications Group we ...,False,6,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33234,Hadrian,100.0,5926.0,Quality Engineer,Hadrian - Manufacturing the Future\n\nHadrian ...,False,6,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,0
33237,Visual Lease,194.0,7154.0,Account Executive - Enterprise,Are You The Right Fit?\n\nVisual Lease is look...,False,6,1.0,4,0.0,...,0,0,0,0,0,0,0,0,0,0
33238,Gravity IT Resources,104.0,112874.0,Technical Architect,Job Title: Technical ArchitectLocation: UtahJo...,False,5,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,1
33241,Crowe,9109.0,134126.0,Private Equity Tax Manager,Your Journey at Crowe Starts Here:\n\nAt Crowe...,True,6,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Converting Supervisor column to binary
df2['Is_Supervisor'] = df2['Is_Supervisor'].astype(int)

In [26]:
df2

Unnamed: 0,Company_Name,Employee_Count,Followers_Count,Job_Title,Job_Description,Is_Supervisor,Work_Type,Is_Remote,Experience_Level,Is_Sponsored,...,TN,TX,UT,VA,VT,WA,WI,WV,WY,XF
0,HearingLife,1171.0,11417.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,0,6,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Episcopal Communities & Services,36.0,305.0,Cook,descriptionTitle\n\n Looking for a great oppor...,0,6,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,0
4,"iHerb, LLC",1227.0,51933.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",0,6,1.0,4,0.0,...,0,0,0,0,0,0,0,0,0,1
6,Robert Half,32197.0,2609057.0,Senior Accountant,"Senior Accountant, San Mateo location, Commerc...",0,6,0.0,3,1.0,...,0,0,0,0,0,0,0,0,0,0
8,MasTec Communications Group,2382.0,42211.0,Tower Technician II,Overview\n\nAt MasTec Communications Group we ...,0,6,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33234,Hadrian,100.0,5926.0,Quality Engineer,Hadrian - Manufacturing the Future\n\nHadrian ...,0,6,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,0
33237,Visual Lease,194.0,7154.0,Account Executive - Enterprise,Are You The Right Fit?\n\nVisual Lease is look...,0,6,1.0,4,0.0,...,0,0,0,0,0,0,0,0,0,0
33238,Gravity IT Resources,104.0,112874.0,Technical Architect,Job Title: Technical ArchitectLocation: UtahJo...,0,5,0.0,2,0.0,...,0,0,0,0,0,0,0,0,0,1
33241,Crowe,9109.0,134126.0,Private Equity Tax Manager,Your Journey at Crowe Starts Here:\n\nAt Crowe...,1,6,0.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Categorizing the job_title variable
# Define a function to categorize job titles
def categorize_job_title(title):
    title_lower = title.lower()  # Convert to lowercase for case-insensitive matching

    # Check for keywords to assign broader categories
    if 'engineer' in title_lower:
        return 'Engineering'
    elif 'manager' in title_lower:
        return 'Management'
    elif 'nurse' in title_lower:
        return 'Nursing'
    elif 'analyst' in title_lower:
        return 'Analyst'
    elif 'developer' in title_lower:
        return 'Developer'
    elif 'technician' in title_lower:
        return 'Technician'
    elif 'specialist' in title_lower:
        return 'Specialist'
    elif 'consultant' in title_lower:
        return 'Consultant'
    elif 'assistant' in title_lower:
        return 'Assistant'
    elif 'coordinator' in title_lower:
        return 'Coordinator'
    elif 'supervisor' in title_lower:
        return 'Supervisor'
    elif 'designer' in title_lower:
        return 'Designer'
    elif 'research' in title_lower:
        return 'Research'
    elif 'representative' in title_lower:
        return 'Representative'
    elif 'administrator' in title_lower:
        return 'Administrator'
    elif 'analyst' in title_lower:
        return 'Analyst'
    elif 'architect' in title_lower:
        return 'Architect'
    elif 'sales' in title_lower:
        return 'Sales'
    elif 'technologist' in title_lower:
        return 'Technologist'
    else:
        return 'Other' if len(title.split()) > 3 else 'Reduced_Other'

# Apply the function to create a new column with categorized job titles
df2['Categorized_Job_Title'] = df2['Job_Title'].apply(categorize_job_title)

# Check the unique values in the new column
print(df2['Categorized_Job_Title'].unique())

['Reduced_Other' 'Architect' 'Technician' 'Analyst' 'Management' 'Other'
 'Representative' 'Sales' 'Coordinator' 'Nursing' 'Engineering'
 'Specialist' 'Assistant' 'Administrator' 'Consultant' 'Designer'
 'Supervisor' 'Technologist' 'Research' 'Developer']


In [28]:
# Print the count values of all categories
category_counts = df2['Categorized_Job_Title'].value_counts()
print(category_counts)

Categorized_Job_Title
Reduced_Other     2581
Other             2335
Management        2186
Engineering       1329
Specialist         749
Analyst            676
Assistant          508
Technician         466
Sales              445
Representative     348
Nursing            346
Coordinator        309
Supervisor         204
Consultant         184
Developer          181
Designer           138
Administrator      124
Architect          116
Research            81
Technologist        40
Name: count, dtype: int64


In [29]:
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

# Function to extract and count words in job titles
def count_words_in_category(category):
    words = ' '.join(df2[df2['Categorized_Job_Title'] == category]['Job_Title']).lower().split()
    word_counts = Counter(words)
    return word_counts

# Get word counts for "Other" category
other_word_counts = count_words_in_category('Other')
print("Word counts in 'Other' category:")
print(other_word_counts.most_common(50))  # Adjust the number as needed

print("\n-----------------------------\n")

# Get word counts for "Reduced_Other" category
reduced_other_word_counts = count_words_in_category('Reduced_Other')
print("Word counts in 'Reduced_Other' category:")
print(reduced_other_word_counts.most_common(50))  # Adjust the number as needed

Word counts in 'Other' category:
[('-', 1062), ('associate', 229), ('of', 211), ('director', 183), ('senior', 180), ('&', 174), ('and', 136), ('shift', 107), ('director,', 107), ('time', 91), ('support', 91), ('lead', 89), ('driver', 84), ('business', 79), ('in', 74), ('remote', 71), ('rn', 70), ('2024', 67), ('operator', 66), ('center', 64), ('tax', 63), ('service', 60), ('intern', 60), ('health', 60), ('part', 59), ('executive', 59), ('staff', 58), ('account', 57), ('to', 56), ('/', 54), ('rep', 53), ('operations', 53), ('i', 52), ('|', 51), ('vice', 51), ('services', 51), ('delivery', 49), ('customer', 49), ('team', 49), ('care', 49), ('development', 48), ('call', 48), ('management', 48), ('cdl', 45), ('a', 45), ('truck', 44), ('retail', 44), ('summer', 43), ('data', 42), ('handler', 42)]

-----------------------------

Word counts in 'Reduced_Other' category:
[('accountant', 209), ('associate', 198), ('senior', 163), ('director', 133), ('executive', 87), ('attorney', 82), ('staff',

In [30]:
# Define a function to recategorize job titles
def recategorize_job_title(title):
    title_lower = title.lower()  # Convert to lowercase for case-insensitive matching

    # Check for keywords to assign broader categories
    if 'engineer' in title_lower:
        return 'Engineering'
    elif 'manager' in title_lower:
        return 'Management'
    elif 'nurse' in title_lower:
        return 'Nursing'
    elif 'analyst' in title_lower:
        return 'Analyst'
    elif 'developer' in title_lower:
        return 'Developer'
    elif 'technician' in title_lower:
        return 'Technician'
    elif 'specialist' in title_lower:
        return 'Specialist'
    elif 'consultant' in title_lower:
        return 'Consultant'
    elif 'assistant' in title_lower:
        return 'Assistant'
    elif 'coordinator' in title_lower:
        return 'Coordinator'
    elif 'supervisor' in title_lower:
        return 'Supervisor'
    elif 'designer' in title_lower:
        return 'Designer'
    elif 'research' in title_lower:
        return 'Research'
    elif 'representative' in title_lower:
        return 'Representative'
    elif 'administrator' in title_lower:
        return 'Administrator'
    elif 'analyst' in title_lower:
        return 'Analyst'
    elif 'architect' in title_lower:
        return 'Architect'
    elif 'sales' in title_lower:
        return 'Sales'
    elif 'technologist' in title_lower:
        return 'Technologist'
    elif 'receptionist' in title_lower:
        return 'Other' if 'analyst' in title_lower else 'Assistant'
    elif 'customer service' in title_lower:
        return 'Other' if 'representative' in title_lower else 'Sales'
    elif 'associate' in title_lower:
        return 'Reduced_Other' if 'director' in title_lower else 'Sales'
    elif 'accountant' in title_lower:
        return 'Reduced_Other' if 'senior' in title_lower else 'Accounting'
    elif 'senior' in title_lower:
        return 'Other' if 'director' in title_lower else 'Senior'
    elif 'director' in title_lower:
        return 'Other' if 'executive' in title_lower else 'Director'
    elif 'executive' in title_lower:
        return 'Reduced_Other' if 'lead' in title_lower else 'Executive'
    elif 'lead' in title_lower:
        return 'Reduced_Other' if 'rn' in title_lower else 'Lead'
    elif 'rn' in title_lower:
        return 'Reduced_Other' if 'operator' in title_lower else 'Nursing'
    elif 'operator' in title_lower:
        return 'Reduced_Other' if 'attorney' in title_lower else 'Operator'
    elif 'attorney' in title_lower:
        return 'Reduced_Other' if 'account' in title_lower else 'Legal'
    elif 'account' in title_lower:
        return 'Reduced_Other' if 'analyst' in title_lower else 'Accounting'
    elif 'associate' in title_lower:
        return 'Reduced_Other' if 'director' in title_lower else 'Sales'
    elif 'of' in title_lower:
        return 'Reduced_Other' if 'director' in title_lower else 'Other'
    elif 'tech' in title_lower:
        return 'Other' if 'director' in title_lower else 'Technical'
    elif 'business' in title_lower:
        return 'Other' if 'director' in title_lower else 'Business'
    elif 'service' in title_lower:
        return 'Other' if 'director' in title_lower else 'Service'
    elif 'operations' in title_lower:
        return 'Other' if 'director' in title_lower else 'Operations'
    elif 'driver' in title_lower:
        return 'Other' if 'director' in title_lower else 'Driver'
    elif 'tax' in title_lower:
        return 'Reduced_Other' if 'director' in title_lower else 'Tax'
    elif 'support' in title_lower:
        return 'Other' if 'director' in title_lower else 'Support'
    elif 'job' in title_lower:
        return 'Other' if 'director' in title_lower else 'Job'
    elif 'make' in title_lower:
        return 'Other' if 'director' in title_lower else 'Make'
    elif 'to' in title_lower:
        return 'Other' if 'director' in title_lower else 'To'
    elif 'health' in title_lower:
        return 'Other' if 'director' in title_lower else 'Health'
    elif '/' in title_lower:
        return 'Other' if 'director' in title_lower else 'Slash'
    elif 'remote' in title_lower:
        return 'Other' if 'director' in title_lower else 'Remote'
    elif 'care' in title_lower:
        return 'Other' if 'director' in title_lower else 'Care'
    elif 'travel' in title_lower:
        return 'Other' if 'director' in title_lower else 'Travel'
    elif 'executive' in title_lower:
        return 'Reduced_Other' if 'lead' in title_lower else 'Executive'
    elif 'part' in title_lower:
        return 'Other' if 'director' in title_lower else 'Part'
    elif 'team' in title_lower:
        return 'Other' if 'director' in title_lower else 'Team'
    elif 'operator' in title_lower:
        return 'Reduced_Other' if 'attorney' in title_lower else 'Operator'
    elif 'management' in title_lower:
        return 'Other' if 'director' in title_lower else 'Management'
    elif 'intern' in title_lower:
        return 'Other' if 'director' in title_lower else 'Intern'
    elif 'services' in title_lower:
        return 'Other' if 'director' in title_lower else 'Services'
    elif 'staff' in title_lower:
        return 'Reduced_Other' if 'director' in title_lower else 'Staff'
    elif 'development' in title_lower:
        return 'Other' if 'director' in title_lower else 'Development'
    elif 'center' in title_lower:
        return 'Other' if 'director' in title_lower else 'Center'
    elif '2024' in title_lower:
        return 'Other' if 'director' in title_lower else '2024'
    elif 'account' in title_lower:
        return 'Reduced_Other' if 'analyst' in title_lower else 'Account'
    elif 'data' in title_lower:
        return 'Other' if 'director' in title_lower else 'Data'
    elif 'i' in title_lower:
        return 'Other' if 'director' in title_lower else 'I'
    elif 'full' in title_lower:
        return 'Other' if 'director' in title_lower else 'Full'
    elif 'advisor' in title_lower:
        return 'Reduced_Other' if 'analyst' in title_lower else 'Advisor'
    elif 'professional' in title_lower:
        return 'Other' if 'director' in title_lower else 'Professional'
    elif 'medical' in title_lower:
        return 'Other' if 'director' in title_lower else 'Medical'
    elif 'vice' in title_lower:
        return 'Other' if 'director' in title_lower else 'Vice'
    elif '|' in title_lower:
        return 'Other' if 'director' in title_lower else 'Pipe'
    elif 'ii' in title_lower:
        return 'Other' if 'director' in title_lower else 'II'
    elif 'customer' in title_lower:
        return 'Other' if 'representative' in title_lower else 'Customer'
    else:
        return 'Other' if len(title.split()) > 3 else 'Reduced_Other'

# Apply the function to create a new column with recategorized job titles
df2['Recategorized_Job_Title'] = df2['Job_Title'].apply(recategorize_job_title)

# Check the unique values in the new column
print(df2['Recategorized_Job_Title'].unique())

['Care' 'Reduced_Other' 'Architect' 'Technician' 'I' 'Analyst'
 'Management' 'To' 'Other' 'Representative' 'Sales' 'Nursing'
 'Coordinator' 'Driver' 'Operator' 'Engineering' 'Slash' 'Staff'
 'Specialist' 'Director' 'Senior' 'Assistant' 'Administrator' 'Lead'
 'Accounting' 'Technical' 'Remote' 'Consultant' 'Designer' 'Health'
 'Center' 'Part' 'Team' 'Supervisor' 'Operations' 'Executive'
 'Technologist' 'Support' 'Development' 'Tax' 'Research' 'Developer'
 'Data' 'Business' 'Job' 'Service' 'Make' 'Full' '2024']


In [31]:
df2 = df2.drop(columns=['Company_Name', 'Job_Title', 'Categorized_Job_Title', 'Job_Description'])

In [32]:
#Performing one-hot encoding on recategorized job title column and converting it into binary
one_hot_encoded = pd.get_dummies(df2['Recategorized_Job_Title'])
one_hot_encoded = one_hot_encoded.astype(int)


df2 = pd.concat([df2, one_hot_encoded], axis=1)

df2 = df2.drop(columns=['Recategorized_Job_Title'])

df2

Unnamed: 0,Employee_Count,Followers_Count,Is_Supervisor,Work_Type,Is_Remote,Experience_Level,Is_Sponsored,Base_Compensation,Min_Salary_Yearly,AK,...,Specialist,Staff,Supervisor,Support,Tax,Team,Technical,Technician,Technologist,To
0,1171.0,11417.0,0,6,0.0,2,0.0,1.0,63000.0,0,...,0,0,0,0,0,0,0,0,0,0
3,36.0,305.0,0,6,0.0,2,0.0,1.0,46321.6,0,...,0,0,0,0,0,0,0,0,0,0
4,1227.0,51933.0,0,6,1.0,4,0.0,1.0,205956.0,0,...,0,0,0,0,0,0,0,0,0,0
6,32197.0,2609057.0,0,6,0.0,3,1.0,1.0,105000.0,0,...,0,0,0,0,0,0,0,0,0,0
8,2382.0,42211.0,0,6,0.0,0,0.0,1.0,45760.0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33234,100.0,5926.0,0,6,0.0,2,0.0,1.0,100000.0,0,...,0,0,0,0,0,0,0,0,0,0
33237,194.0,7154.0,0,6,1.0,4,0.0,1.0,110000.0,0,...,0,0,0,0,0,0,0,0,0,0
33238,104.0,112874.0,0,5,0.0,2,0.0,1.0,125000.0,0,...,0,0,0,0,0,0,0,0,0,0
33241,9109.0,134126.0,1,6,0.0,0,0.0,1.0,89760.0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
#Splitting the dataset into X and y

y_data = df2['Min_Salary_Yearly']

X_data = df2.drop('Min_Salary_Yearly', axis=1)


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

In [35]:
#### Reset index of X, y data in Train, Validation, test sets

print("\n************** Resetting Index **************")

# Train Data
X_train=X_train.reset_index(drop=True)
y_train=y_train.reset_index(drop=True)


# Test Data
X_test=X_test.reset_index(drop=True)
y_test=y_test.reset_index(drop=True)


************** Resetting Index **************


In [36]:
#Performing Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
mae_lr = mean_absolute_error(y_test, y_pred_lr)

print("Linear Regression Mean Absolute Error:", mae_lr)

Linear Regression Mean Absolute Error: 166407.3239204733


In [37]:
#Perorming Decision Tree
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
mae_dt = mean_absolute_error(y_test, y_pred_dt)
print("Decision Tree Mean Absolute Error:", mae_dt)


Decision Tree Mean Absolute Error: 202099.7242785101


In [38]:
#Performing Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print("Random Forest Mean Absolute Error:", mae_rf)


Random Forest Mean Absolute Error: 116017.83711349334
