# Glassdoor salary prediction

## import packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

In [3]:
data = pd.read_csv('salary_data_cleaned.csv')
data.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,avg_salary,company_txt,job_state,same_state,age,python_yn,R_yn,spark,aws,excel
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,72.0,Tecolote Research\n,NM,0,47,1,0,0,0,1
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,87.5,University of Maryland Medical System\n,MD,0,36,1,0,0,0,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,85.0,KnowBe4\n,FL,1,10,1,0,1,0,1
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,76.5,PNNL\n,WA,1,55,1,0,0,0,0
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,114.5,Affinity Solutions\n,NY,1,22,1,0,0,0,1


In [4]:
data.describe()

Unnamed: 0,Rating,Founded,hourly,employer_provided,min_salary,max_salary,avg_salary,same_state,age,python_yn,R_yn,spark,aws,excel
count,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0
mean,3.618868,1837.154987,0.032345,0.022911,74.068733,127.183288,100.626011,0.557951,46.591644,0.528302,0.002695,0.225067,0.237197,0.522911
std,0.80121,497.183763,0.177034,0.149721,31.869282,46.909006,38.855948,0.496965,53.778815,0.499535,0.051882,0.417908,0.425651,0.499812
min,-1.0,-1.0,0.0,0.0,10.0,16.0,13.5,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
25%,3.3,1939.0,0.0,0.0,52.0,96.0,73.5,0.0,11.0,0.0,0.0,0.0,0.0,0.0
50%,3.7,1988.0,0.0,0.0,69.5,124.0,97.5,1.0,24.0,1.0,0.0,0.0,0.0,1.0
75%,4.0,2007.0,0.0,0.0,91.0,155.0,122.5,1.0,59.0,1.0,0.0,0.0,0.0,1.0
max,5.0,2019.0,1.0,1.0,202.0,306.0,254.0,1.0,276.0,1.0,1.0,1.0,1.0,1.0


## checking for null values

In [5]:
data['Rating'].isnull().values.any()
data['age'].isnull().values.any()
data['Founded'].isnull().values.any()


False

## Describing the features and target variables

In [6]:
data.columns.values

array(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'Competitors', 'hourly', 'employer_provided', 'min_salary',
       'max_salary', 'avg_salary', 'company_txt', 'job_state',
       'same_state', 'age', 'python_yn', 'R_yn', 'spark', 'aws', 'excel'],
      dtype=object)

In [7]:
x = data[['Rating', 'age', 'Founded']]
y = data['avg_salary']

In [8]:
reg = LinearRegression()

In [9]:
x.shape

(742, 3)

In [10]:
reg.fit(x,y)

In [11]:
reg.coef_

array([ 1.03953334,  0.0163305 , -0.00197957])

In [12]:
reg.intercept_

99.73998292839859

In [13]:
reg.score(x,y)

0.0011171724942857786

## adjustable R^2 formula..(obs: my adjusted r2 is -ve)
$R^2_{adj} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [14]:
r2 = reg.score(x,y)
n = x.shape[0]
p = x.shape[1]
adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2

-0.0029433268045180316

## feature selection

#### These step will help us find the p-value looks Hence we can know which input has an impact on our model.

In [15]:
from sklearn.feature_selection import f_regression

f_regression(x,y)

In [16]:
p_values = f_regression(x,y)[1].round(3)
p_values

array([0.714, 0.593, 0.68 ])

## creating a summary table

In [17]:
reg_summary = pd.DataFrame(data = x.columns.values, columns = ['features'])
reg_summary['coefficient'] = reg.coef_
reg_summary['p-values'] = p_values
reg_summary

Unnamed: 0,features,coefficient,p-values
0,Rating,1.039533,0.714
1,age,0.016331,0.593
2,Founded,-0.00198,0.68


## Feature Scaling

#### feature scaling is necessary for normalizing our features/inputs. If there is discrepancy/wide gap, with scaling we can reduce the figures. If we fit the model with the unnormalized features will might get a wrong observation as which input has a great impact on the model.

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
scaler = StandardScaler()

In [20]:
#let fit the scaler with the features
scaler.fit(x).get_params()


{'copy': True, 'with_mean': True, 'with_std': True}

In [21]:
#let transform the scaled features
x_scaled = scaler.transform(x)
x_scaled

array([[ 0.22622561,  0.00759837,  0.27341329],
       [-0.27335594, -0.19708111,  0.29555283],
       [ 1.47517949, -0.68086897,  0.34788265],
       ...,
       [-1.27251905, -0.19708111,  0.29555283],
       [-0.52314672, -0.88554845, -3.69962784],
       [-0.02356517,  0.11924172,  0.26133717]])

In [38]:
print(f'rating, age, founded ,mean:',scaler.mean_)
print(f'rating, age, founded, variance:',scaler.var_)

rating, age, founded ,mean: [   3.61886792   46.5916442  1837.15498652]
rating, age, founded, variance: [6.41072573e-01 2.88826316e+03 2.46858551e+05]


## Regression with the scaled features

In [22]:
#seting the linear regression model
reg = LinearRegression()

In [23]:
# let fit the model with the scaled features and target
reg.fit(x_scaled,y).get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [24]:
reg.coef_


array([ 0.83232324,  0.87764298, -0.98354499])

In [25]:
#let get the intercept/bias
reg.intercept_

100.62601078167116

## summary table
#### weight can also be the coefficient. the bigger the weight, the bigger the impact. The intercept is the bais which is just a number that adjust our regreession with some constant.


##### sklearn doesn't support p-value since most ML engineers perform feature scaling before feeding it to the model. With feature scaling we don't have to worry if a feature contributes to the regression or not. because scaling will reduce redundance feature to weight approximately zero.

In [26]:
reg_summary = pd.DataFrame([['Bias'], ['Rating'], ['age'], [ 'Founded']], columns =['features'])

reg_summary['weight'] = reg.intercept_, reg.coef_[0], reg.coef_[1], reg.coef_[2]

In [27]:
reg_summary

Unnamed: 0,features,weight
0,Bias,100.626011
1,Rating,0.832323
2,age,0.877643
3,Founded,-0.983545


## making predictions with the standardized coefficient(weights)
#### remember to standardize the testing data to be used for prediction.

## I'm just using random data. the data names and arrangement should match the one used during fitting.

#### note: we used the scaler we used for training also for prediction. The concept is that the model uses the mean of the training data to predict that of the testing data. That is why the arrangement and spelling of the training and testing data must be same

#### Also notice we are not fitting the data again. we just make use of the fitting we did with the training data to predict the testing data

In [34]:
new_data = pd.DataFrame(data =[[3.8, 40, 2004], [4.2, 35, 1999]], columns = ["Rating", "age","Founded"])
new_data

Unnamed: 0,Rating,age,Founded
0,3.8,40,2004
1,4.2,35,1999


In [35]:
new_data_scaled = scaler.transform(new_data)
new_data_scaled


array([[ 0.22622561, -0.12265221,  0.33580653],
       [ 0.72580716, -0.21568834,  0.32574311]])

reg.predict(new_data_scaled)

### overfitting occurs when the model focused on the particular training set so much that it has missed the point(the noises are captured in an over fitting model) In the case of overfitting we can split the model into two(training and testing).Underfiting is when the model has not captured the underlying logic of the data(has low predicting power) and it is easy to spot because it has poor accuracy. 