### Import Libraries

In [217]:
import warnings
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

## Load Datasets

In [218]:
df = pd.read_csv('files/salary_data_cleaned.csv')

## Exploratory Data Analysis

In [219]:
df.columns

Index(['job_title', 'salary_estimate', 'job_description', 'rating',
       'company_name', 'location', 'headquarters', 'size', 'founded',
       'type_of_ownership', 'industry', 'sector', 'revenue', 'competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'job_state', 'same_state', 'age', 'python', 'aws', 'spark', 'excel',
       'job_simp', 'seniority', 'desc_len', 'num_comp'],
      dtype='object')

In [220]:
df.head()

Unnamed: 0,job_title,salary_estimate,job_description,rating,company_name,location,headquarters,size,founded,type_of_ownership,industry,sector,revenue,competitors,hourly,employer_provided,min_salary,max_salary,avg_salary,job_state,same_state,age,python,aws,spark,excel,job_simp,seniority,desc_len,num_comp
0,Data Scientist,53-91,"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,0,0,53,91,72.0,NM,0,48,1,0,0,1,data scientist,na,2536,0
1,Healthcare Data Scientist,63-112,What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,0,0,63,112,87.5,MD,0,37,1,0,0,0,data scientist,na,4783,0
2,Data Scientist,80-90,"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,0,0,80,90,85.0,FL,1,11,1,0,1,1,data scientist,na,3461,0
3,Data Scientist,56-97,*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",0,0,56,97,76.5,WA,1,56,1,0,0,0,data scientist,na,3883,3
4,Data Scientist,86-143,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",0,0,86,143,114.5,NY,1,23,1,0,0,1,data scientist,na,2728,3


In [247]:
df.shape

(742, 30)

## define the type of the features 

In [221]:
drop_columns = [ 'industry', 'sector', 'job_state', 'job_title', 'min_salary', 'max_salary', 'salary_estimate', 'job_description', 'company_name', 'location',  'size' , 'headquarters', 'revenue', 'competitors', 'hourly']
one_hot_encode_columns = [ 'type_of_ownership','job_simp', 'seniority']
scalling_columns = ['founded', 'age', 'desc_len']

In [222]:
df_droped = df.drop(columns=drop_columns, axis =1)
df_droped.columns

Index(['rating', 'founded', 'type_of_ownership', 'employer_provided',
       'avg_salary', 'same_state', 'age', 'python', 'aws', 'spark', 'excel',
       'job_simp', 'seniority', 'desc_len', 'num_comp'],
      dtype='object')

In [223]:
# Converting some columns to 'category' datatype rto make it wasy to use One Hot Encoder

# df_droped['industry'] = df_droped['industry'].astype('category')
# df_droped['sector'] = df_droped['sector'].astype('category')
# df_droped['job_state'] = df_droped['job_state'].astype('category')
df_droped['job_simp'] = df_droped['job_simp'].astype('category')
df_droped['job_simp'] = df_droped['job_simp'].astype('category')

In [224]:
df_droped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   rating             742 non-null    float64 
 1   founded            742 non-null    int64   
 2   type_of_ownership  742 non-null    object  
 3   employer_provided  742 non-null    int64   
 4   avg_salary         742 non-null    float64 
 5   same_state         742 non-null    int64   
 6   age                742 non-null    int64   
 7   python             742 non-null    int64   
 8   aws                742 non-null    int64   
 9   spark              742 non-null    int64   
 10  excel              742 non-null    int64   
 11  job_simp           742 non-null    category
 12  seniority          742 non-null    object  
 13  desc_len           742 non-null    int64   
 14  num_comp           742 non-null    int64   
dtypes: category(1), float64(2), int64(10), object(2)
memory u

In [225]:
# devide the data to X and y

X = df_droped.drop('avg_salary', axis=1)
y = df_droped.avg_salary

## Spling the data

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [227]:
X_train.columns

Index(['rating', 'founded', 'type_of_ownership', 'employer_provided',
       'same_state', 'age', 'python', 'aws', 'spark', 'excel', 'job_simp',
       'seniority', 'desc_len', 'num_comp'],
      dtype='object')

## Bulding the Pipeline

In [229]:

ohe = OneHotEncoder()
imp = SimpleImputer()

In [230]:
ct = make_column_transformer(
    (ohe, one_hot_encode_columns),  
    (imp, scalling_columns), 
    remainder='passthrough') 

In [231]:
ct.fit_transform(X_train)[65]

array([0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 1.985e+03, 3.600e+01, 1.739e+03,
       3.500e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00])

In [232]:
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor()
pipe = Pipeline([('preprocessor', ct), ('classifier', reg)])

In [233]:
X_train.shape , X_test.shape , y_train.shape, y_test.shape

((593, 14), (149, 14), (593,), (149,))

In [234]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['type_of_ownership',
                                                   'job_simp', 'seniority']),
                                                 ('simpleimputer',
                                                  SimpleImputer(),
                                                  ['founded', 'age',
                                                   'desc_len'])])),
                ('classifier', RandomForestRegressor())])

In [235]:
y_pred = pipe.predict(X_test)

In [236]:
pipe.score(X_test, y_test)

0.7216316329557104

In [237]:
# kfold = KFold(shuffle=True)
# cross_val_score(pipe, X_train, y_train, cv=kfold)

In [238]:
mse = mean_squared_error(y_test, y_pred)
mse

383.5382206375839

In [239]:
mae = mean_absolute_error(y_test, y_pred)
mae

13.256946308724835

In [246]:
import joblib
filename = 'files/finalized_model.sav'
joblib.dump(pipe, filename)

['files/finalized_model.sav']

In [241]:
with open(filename, 'rb') as file:
    pipe = joblib.load(file)
    print(pipe)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['type_of_ownership',
                                                   'job_simp', 'seniority']),
                                                 ('simpleimputer',
                                                  SimpleImputer(),
                                                  ['founded', 'age',
                                                   'desc_len'])])),
                ('classifier', RandomForestRegressor())])
