# Hiring
A small linear regression project that predicts a candidate's salary from hiring data. The dataset contains missing values and an object-typed (text) column, so we first fill in the missing values and convert the text column to a numeric type to prepare the data for modeling. Because all features are on small, comparable numeric scales, there is no need to standardize them.

In [37]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

In [38]:
# Load the hiring data from the Excel workbook (path is relative to the working directory).
df = pd.read_excel('hiring.xlsx')

In [39]:
# Display the raw frame to inspect its columns and spot missing / text values.
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
...,...,...,...,...
91,two,8.0,10,65000
92,seven,9.0,6,70000
93,three,7.0,10,62000
94,ten,,7,72000


In [40]:
# Summarise column dtypes and non-null counts to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       72 non-null     object 
 1   test_score       84 non-null     float64
 2   interview_score  96 non-null     int64  
 3   salary           96 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 3.1+ KB


In [41]:
# Count the rows whose 'experience' entry is missing.
df['experience'].isna().sum()

24

In [42]:
# A missing 'experience' entry is assumed to mean zero years of experience.
# Assign the result back instead of using `inplace=True` on a column selection:
# that form is chained assignment, which pandas warns about and which does not
# modify `df` when Copy-on-Write is enabled.
df['experience'] = df['experience'].fillna(0)

# Impute missing test scores with the mean of the observed test scores.
df['test_score'] = df['test_score'].fillna(df['test_score'].mean())

In [43]:
# Features: the first three columns (experience, test_score, interview_score).
# Take an explicit copy so that later column assignments on X neither raise
# SettingWithCopyWarning nor leave it ambiguous whether `df` is modified.
X = df.iloc[:, :3].copy()

# Target: the last column (salary).
y = df.iloc[:, -1]

In [44]:
# Display the feature frame; 'experience' still holds spelled-out words at this point.
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6
...,...,...,...
91,two,8.0,10
92,seven,9.0,6
93,three,7.0,10
94,ten,7.5,7


In [45]:
# Display the target series (salary).
y

0     50000
1     45000
2     60000
3     65000
4     70000
      ...  
91    65000
92    70000
93    62000
94    72000
95    89000
Name: salary, Length: 96, dtype: int64

In [47]:
# Lookup table for converting spelled-out experience values to integers.
# Built once at definition time instead of on every call. The integer key 0
# is kept because missing experience values are filled with the number 0.
_WORD_TO_INT = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
    'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12,
    0: 0,
}


def convert_to_int(word):
    """Convert an experience value to an integer.

    Parameters
    ----------
    word : str or number
        Either a spelled-out number from 'zero' to 'twelve' (matched
        case-insensitively, ignoring surrounding whitespace), or a value
        that is already a non-negative integer, which is returned as a
        plain ``int``.

    Returns
    -------
    int
        The integer value.

    Raises
    ------
    KeyError
        If ``word`` is neither a known spelled-out number nor a
        non-negative integer.
    """
    import numbers  # local import keeps the cell self-contained

    if isinstance(word, str):
        key = word.strip().lower()
        try:
            return _WORD_TO_INT[key]
        except KeyError:
            raise KeyError(f"unrecognised experience value: {word!r}") from None

    # Already-converted values pass through, so applying the conversion a
    # second time to an integer column does not fail. bool is excluded so
    # that True/False keep going through the plain lookup below.
    if (isinstance(word, numbers.Integral)
            and not isinstance(word, bool)
            and word >= 0):
        return int(word)

    # Anything else falls back to a plain lookup and raises KeyError when absent.
    return _WORD_TO_INT[word]

In [48]:
# Display X again as the "before" view; 'experience' has not been converted yet.
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,five,6.0,7
3,two,10.0,10
4,seven,9.0,6
...,...,...,...
91,two,8.0,10
92,seven,9.0,6
93,three,7.0,10
94,ten,7.5,7


In [51]:
# Replace every spelled-out 'experience' entry with its integer value, element by element.
X['experience'] = X['experience'].map(convert_to_int)

In [52]:
# Display X after the conversion; 'experience' now holds integers.
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
...,...,...,...
91,2,8.0,10
92,7,9.0,6
93,3,7.0,10
94,10,7.5,7


In [58]:
# Split into training and test sets: 10% of the rows are held out for evaluation,
# and random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 101)

In [59]:
# Create the linear regression model
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

# Fit the model on the training data
lm.fit(x_train, y_train)

In [60]:
# Test the model: predict salaries for the held-out test rows
predictions = lm.predict(x_test)

In [61]:
from sklearn import metrics

In [62]:
# Error metrics on the held-out test set. The MSE is computed once and
# reused to derive the RMSE.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

MAE: 2165.339570799884
MSE: 8112909.576770614
RMSE: 2848.3169726648425


In [63]:
# Saving model to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the original left the handle open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lm, model_file)

In [72]:
# Loading model to compare the results.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you trust.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# One candidate: experience=3, test_score=9, interview_score=6 (same column
# order as the training features). Reusing the feature names stored on the
# fitted model, when present, avoids scikit-learn's "X does not have valid
# feature names" warning; when absent, pandas falls back to default column labels.
feature_names = getattr(model, 'feature_names_in_', None)
sample = pd.DataFrame([[3, 9, 6]], columns=feature_names)

# predict() returns a 1-element array; index it explicitly because calling
# float() on a non-0-d array is deprecated in recent NumPy versions.
predicted_salary = float(model.predict(sample)[0])
print("The possible salary of this personnel is {:.4f} $".format(predicted_salary))

The possible salary of this personnel is 58386.5844 $






*Written by Kasra Tehrani*


**https://www.linkedin.com/in/kasra-naderi-tehrani-a298b521b/ - https://github.com/kasra-python - kasra.n.tehrani@gmail.com**

*Good luck!*

