In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from word2number import w2n

In [2]:
# Read the hiring dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='hiring.csv')
df.head(n=5)

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [3]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       6 non-null      object 
 1   test_score       7 non-null      float64
 2   interview_score  8 non-null      int64  
 3   salary           8 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 388.0+ bytes


In [4]:
# Count the missing entries in each column.
df.isna().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [5]:
def safe_word_to_num(val):
    """Convert a spelled-out number (e.g. ``'five'``) to its numeric value.

    Parameters
    ----------
    val : object
        The value to convert, passed straight to ``w2n.word_to_num``.

    Returns
    -------
    object
        Whatever ``w2n.word_to_num(val)`` returns, or ``val`` itself, unchanged,
        when the conversion is rejected with ``ValueError`` or ``TypeError``.

    Notes
    -----
    Only conversion failures are absorbed. Any other exception (for example a
    ``NameError`` because ``w2n`` was never imported) propagates, so a broken
    environment cannot silently leave the data unconverted.
    """
    try:
        return w2n.word_to_num(val)
    except (ValueError, TypeError):
        # Presumably word2number rejects non-strings (such as NaN) and text
        # without number words via ValueError -- confirm against the installed
        # version. TypeError is included defensively for non-string input.
        return val  # Hand the value back untouched when it cannot be converted.

# Run every experience entry through the word-to-number helper, then show the frame.
converted_experience = df['experience'].apply(safe_word_to_num)
df['experience'] = converted_experience
df

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,5.0,6.0,7,60000
3,2.0,10.0,10,65000
4,7.0,9.0,6,70000
5,3.0,7.0,10,62000
6,10.0,,7,72000
7,11.0,7.0,8,80000


In [6]:
# Fill the gaps in each column with that column's own mean.
# NOTE(review): the means are taken over every row, i.e. before the train/test split.
for column in ('experience', 'test_score'):
    df[column] = df[column].fillna(df[column].mean())
df

Unnamed: 0,experience,test_score,interview_score,salary
0,6.333333,8.0,9,50000
1,6.333333,8.0,6,45000
2,5.0,6.0,7,60000
3,2.0,10.0,10,65000
4,7.0,9.0,6,70000
5,3.0,7.0,10,62000
6,10.0,7.857143,7,72000
7,11.0,7.0,8,80000


In [7]:
# Re-count missing entries per column after the fill step.
df.isna().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

In [8]:
# Target is the salary column; the predictors are every remaining column.
y = df['salary']
X = df.drop('salary', axis=1)
X

Unnamed: 0,experience,test_score,interview_score
0,6.333333,8.0,9
1,6.333333,8.0,6
2,5.0,6.0,7
3,2.0,10.0,10
4,7.0,9.0,6
5,3.0,7.0,10
6,10.0,7.857143,7
7,11.0,7.0,8


In [9]:
# Display the target values (the `salary` column held in `y`).
y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [10]:
# Hold out a test_size=0.2 share of the rows; random_state pins the shuffle.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
x_train, x_test, y_train, y_test = split_parts

In [11]:
# Print each piece of the split in turn: train/test features, then train/test targets.
for subset in (x_train, x_test, y_train, y_test):
    print(subset)

   experience  test_score  interview_score
1    6.333333         8.0                6
7   11.000000         7.0                8
3    2.000000        10.0               10
0    6.333333         8.0                9
5    3.000000         7.0               10
4    7.000000         9.0                6
   experience  test_score  interview_score
6        10.0    7.857143                7
2         5.0    6.000000                7
1    45000
7    80000
3    65000
0    50000
5    62000
4    70000
Name: salary, dtype: int64
6    72000
2    60000
Name: salary, dtype: int64


In [12]:
# Fit a linear regression on the training split and report the learned parameters.
model = LinearRegression().fit(x_train, y_train)
for label, value in (
    ('Model Coefficient: ', model.coef_),
    ('Model Intercept: ', model.intercept_),
):
    print(label, value)

Model Coefficient:  [3548.04326966 4160.89438696 4185.01439331]
Model Intercept:  -27249.40114179997


In [13]:
# Predict salaries for the held-out rows with the fitted model.
salary_pred = model.predict(x_test)
# Echo the predictions as the cell output.
salary_pred

array([70218.87391976, 44751.28228139])

In [14]:
# Score the predictions on the test split: MSE, R^2, and RMSE (square root of the MSE).
mse = mean_squared_error(y_test, salary_pred)
r2 = r2_score(y_test, salary_pred)
print('MSE: ', mse)
print('R2 Score: ', r2)
print('RMSE: ', np.sqrt(mse))

MSE:  117847901.08776331
R2 Score:  -2.273552807993425
RMSE:  10855.77731384369
