# Glassdoor salary prediction

## import packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

In [4]:
# Read the cleaned Glassdoor salary dataset from the working directory.
data = pd.read_csv(filepath_or_buffer='salary_data_cleaned.csv')
# Preview the first five rows as a sanity check on the load.
data.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,avg_salary,company_txt,job_state,same_state,age,python_yn,R_yn,spark,aws,excel
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,72.0,Tecolote Research\n,NM,0,47,1,0,0,0,1
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,87.5,University of Maryland Medical System\n,MD,0,36,1,0,0,0,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,85.0,KnowBe4\n,FL,1,10,1,0,1,0,1
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,76.5,PNNL\n,WA,1,55,1,0,0,0,0
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,114.5,Affinity Solutions\n,NY,1,22,1,0,0,0,1


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Rating,Founded,hourly,employer_provided,min_salary,max_salary,avg_salary,same_state,age,python_yn,R_yn,spark,aws,excel
count,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0,742.0
mean,3.618868,1837.154987,0.032345,0.022911,74.068733,127.183288,100.626011,0.557951,46.591644,0.528302,0.002695,0.225067,0.237197,0.522911
std,0.80121,497.183763,0.177034,0.149721,31.869282,46.909006,38.855948,0.496965,53.778815,0.499535,0.051882,0.417908,0.425651,0.499812
min,-1.0,-1.0,0.0,0.0,10.0,16.0,13.5,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
25%,3.3,1939.0,0.0,0.0,52.0,96.0,73.5,0.0,11.0,0.0,0.0,0.0,0.0,0.0
50%,3.7,1988.0,0.0,0.0,69.5,124.0,97.5,1.0,24.0,1.0,0.0,0.0,0.0,1.0
75%,4.0,2007.0,0.0,0.0,91.0,155.0,122.5,1.0,59.0,1.0,0.0,0.0,0.0,1.0
max,5.0,2019.0,1.0,1.0,202.0,306.0,254.0,1.0,276.0,1.0,1.0,1.0,1.0,1.0


## checking for null values

In [23]:
# Check all three candidate features for missing values in a single expression.
# A notebook cell only displays its last expression, so three separate checks
# would show the result for just one column; this one result covers them all
# (True if any of the three columns contains a NaN).
# NOTE(review): data.describe() reports a minimum of -1 for Rating, Founded and
# age, which looks like a placeholder for missing values that isnull() cannot
# detect -- confirm and handle those rows before modelling.
data[['Rating', 'age', 'Founded']].isnull().values.any()


False

## Describing the features and target variables

In [7]:
# List every column name so the features and the target can be picked from them.
data.columns.values

array(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'Competitors', 'hourly', 'employer_provided', 'min_salary',
       'max_salary', 'avg_salary', 'company_txt', 'job_state',
       'same_state', 'age', 'python_yn', 'R_yn', 'spark', 'aws', 'excel'],
      dtype=object)

In [25]:
# Feature matrix: the three numeric predictors used by the regression below.
# NOTE(review): in the preview rows age equals 2020 - Founded, so these two
# columns appear to carry the same information, and describe() shows -1 as the
# minimum of all three columns (possibly a missing-value placeholder) -- verify.
x = data.loc[:, ['Rating', 'age', 'Founded']]
# Target vector: the avg_salary column.
y = data.loc[:, 'avg_salary']

In [26]:
# Linear regression estimator with scikit-learn's default settings.
reg = LinearRegression()

In [27]:
# Confirm the feature matrix dimensions as (rows, columns) before fitting.
x.shape

(742, 3)

In [28]:
# Fit on every row of x and y; no train/test split is made in the cells shown here.
reg.fit(x,y)

LinearRegression()

In [29]:
# Fitted coefficients, one per feature, in the column order of x (Rating, age, Founded).
reg.coef_

array([ 1.03953334,  0.0163305 , -0.00197957])

In [30]:
# Fitted intercept term of the regression.
reg.intercept_

99.73998292839859

In [31]:
# R^2 evaluated on the same x and y the model was fitted on (an in-sample score).
reg.score(x,y)

0.0011171724942857786

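## Adjusted R^2 formula (observation: the adjusted R^2 is negative)
$R^2_{adj} = 1 - (1-R^2)\times\frac{n-1}{n-p-1}$

Here $n$ is the number of observations and $p$ is the number of predictors. The adjusted value comes out negative because $R^2$ is very close to zero, so the penalty for using $p$ predictors outweighs the small amount of variance the model explains.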

In [33]:
# n = number of observations (rows), p = number of predictors (columns).
n, p = x.shape
# In-sample R^2 of the fitted model.
r2 = reg.score(x, y)
# Penalise R^2 for the number of predictors used.
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

-0.0029433268045180316

## feature selection

In [35]:
from sklearn.feature_selection import f_regression

In [36]:
# Returns a pair of arrays (F-statistics, p-values) with one entry per feature.
# NOTE: per the scikit-learn docs these are univariate tests, i.e. each feature
# is tested against y on its own rather than jointly with the others.
f_regression(x,y)

(array([0.13472422, 0.2859887 , 0.16986776]),
 array([0.71368816, 0.59296325, 0.68034872]))

In [38]:
# Keep only the p-values (second array returned by f_regression),
# rounded to three decimal places for readability.
p_values = np.round(f_regression(x, y)[1], decimals=3)
p_values

array([0.714, 0.593, 0.68 ])

## creating a summary table

In [41]:
# One row per feature: its name, its fitted coefficient and its p-value.
# NOTE: the coefficients come from the joint fit in `reg`, while the p-values
# come from f_regression's per-feature tests, so the two columns answer
# slightly different questions.
reg_summary = (
    pd.DataFrame(data=x.columns.values, columns=['features'])
    .assign(**{'coefficient': reg.coef_, 'p-values': p_values})
)
reg_summary

Unnamed: 0,features,coefficient,p-values
0,Rating,1.039533,0.714
1,age,0.016331,0.593
2,Founded,-0.00198,0.68
