# **Simple Linear Regression**

In [283]:
# Step 1 : import library
import pandas as pd
# Lift pandas' display truncation so every row and column is shown (None = no limit)
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [284]:
# Step 2 : import data
salary = pd.read_csv('https://github.com/ybifoundation/Dataset/raw/main/Salary%20Data.csv')

# Display the first 5 rows
print(salary.head().to_markdown(index=False, numalign="left", stralign="left"))

# Print the column names and their data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
salary.info()

| Experience Years   | Salary   |
|:-------------------|:---------|
| 1.1                | 39343    |
| 1.2                | 42774    |
| 1.3                | 46205    |
| 1.5                | 37731    |
| 2                  | 43525    |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Experience Years  40 non-null     float64
 1   Salary            40 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 772.0 bytes
None


## Data Preprocessing

In [285]:
# list the column names so the feature and target columns can be selected by name
salary.columns

Index(['Experience Years', 'Salary'], dtype='object')

In [286]:
# independent variable or feature
# we use double '[[]]' so that X is a DataFrame (2-D); with only one feature, a single '[]' would return a Series (1-D)
X = salary[['Experience Years']]
X.head()

Unnamed: 0,Experience Years
0,1.1
1,1.2
2,1.3
3,1.5
4,2.0


In [287]:
# dependent (target) variable, kept as a 1-D pandas Series
y = salary.loc[:, 'Salary']
y.head()

0    39343
1    42774
2    46205
3    37731
4    43525
Name: Salary, dtype: int64

In [288]:
# train test split: 70% of the rows for training, the rest for testing;
# random_state is fixed so the split is reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2529,
)
# check shape of train and test sample
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((28, 1), (12, 1), (28,), (12,))

### Training data
#### Training data undergoes both fit and transform in the data preprocessing stage. In the fit operation, the mean and standard deviation (SD) of the provided training data are calculated; in the transform operation, the standardization equation $z = (x - \text{mean}) / \text{SD}$ is applied to each value, giving the normalized form (mean = 0, standard deviation = 1). During the fit operation the mean and SD of the dataset are stored in the initialized StandardScaler object. These stored values are later used for transforming the data.

In [289]:
# Toy dataset (4 samples x 2 features) used to demonstrate StandardScaler
data = [
    [0, 0],
    [0, 0],
    [2, 1],
    [2, 1],
]

In [290]:
# standardizing the features (setting mean = 0 and standard deviation = 1)
from sklearn.preprocessing import StandardScaler

# Scaler instance used for the toy-data demonstration
scaler_test = StandardScaler()

In [291]:
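# we can't transform without doing the fit operation first: an unfitted StandardScaler
# has no stored mean/SD, so transform() raises NotFittedError
# data_tramsform = scaler_test.transform(data)
# print(data_tramsform)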

In [292]:
# fit() learns the per-column statistics and returns the scaler itself,
# so data_fit and scaler_test refer to the same fitted object
data_fit = scaler_test.fit(data)
print(data_fit)
# mean of each column of the provided data
print(f'mean : {data_fit.mean_}')
# standard deviation of each column of the provided data
print(f'standard deviation : {data_fit.scale_}')

# the mean and SD calculated in the fit operation are utilized for transforming the data
data_tramsform = scaler_test.transform(data)
print(data_tramsform)

# a new sample is scaled with the stored statistics:
# (2 - 1) / 1 = 1
# (3 - 0.5) / 0.5 = 5
custom_scaled = scaler_test.transform([[2, 3]])
print(f'custom values [2,3] : {custom_scaled}')

StandardScaler()
mean : [1.  0.5]
standard deviation : [1.  0.5]
[[-1. -1.]
 [-1. -1.]
 [ 1.  1.]
 [ 1.  1.]]
custom values [2,3] : [[1. 5.]]


In [293]:
# fit and transform can be combined into a single fit_transform call,
# which learns the statistics and returns the scaled data in one step
scaler_test = StandardScaler()
scaler_test.fit_transform(data)

array([[-1., -1.],
       [-1., -1.],
       [ 1.,  1.],
       [ 1.,  1.]])

In [294]:
# peek at the first training rows before scaling (values are still raw years of experience)
X_train[:5]

Unnamed: 0,Experience Years
28,6.5
31,7.9
34,8.7
20,4.7
4,2.0


In [295]:
# Fit the scaler on the training features only and standardize them in one step.
# fit_transform returns a NumPy array, so X_train is no longer a DataFrame afterwards.
# NOTE: this cell overwrites X_train; re-running it would re-fit the scaler on the
# already-scaled values and lose the original mean/SD - restart and run all instead.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train[:5]

array([[ 0.46553075],
       [ 0.98543974],
       [ 1.28253059],
       [-0.20292366],
       [-1.20560528]])

### Test data
#### Test data only undergoes transform: the mean and SD were already computed from the training data during fit, and those same values are used to transform the test data. The scaler is not re-fitted on the test data.

In [296]:

# Scale the test features with the mean/SD learned from the training data (no re-fitting).
# NOTE: this cell overwrites X_test; re-running it would scale the already-scaled values again.
X_test = scaler.transform(X_test)
X_test[:5]

array([[ 0.57693982],
       [-0.64855994],
       [ 1.20825788],
       [-0.46287816],
       [-0.27719637]])

## Model training

In [297]:
# Select model: ordinary least squares linear regression (one feature -> simple linear regression)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [298]:
# Train the model: fit it to the standardized training features and the training salaries
model.fit(X_train,y_train)

In [299]:
# intercept: the predicted salary when the scaled feature is 0,
# i.e. at the mean experience of the training data (the feature was standardized)
model.intercept_

np.float64(75942.85714285714)

In [300]:
# slope: change in predicted salary per one unit of the scaled feature,
# i.e. per one standard deviation of experience (not per year)
model.coef_

array([25327.24682845])

In [301]:
# Predict salaries for the test features (X_test was scaled with the training scaler)
y_pred = model.predict(X_test)

In [302]:
# salary for 2 year experience
# The scaler was fitted on a DataFrame, so the new sample is passed as a one-row DataFrame
# with the same column name; a bare list triggers scikit-learn's
# "X does not have valid feature names" warning. The raw value must go through the same
# scaler before prediction because the model was trained on standardized features.
two_years = pd.DataFrame({'Experience Years': [2]})
print(model.predict(scaler.transform(two_years)))

[45408.19457574]




In [303]:
# predicted salaries for the test samples, in the same order as X_test / y_test
y_pred

array([ 90555.15441095,  59516.61952424, 106544.70268592,  64219.42784041,
        68922.23615658, 123474.81262412,  84911.78443155,  63278.86617718,
        65159.98950364,  61397.74285071,  37883.70126987,  50111.00289191])

In [304]:
# model evaluation: regression error metrics (lower is better)
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [305]:
# mean absolute error: average absolute difference between actual and predicted salary, in salary units
mean_absolute_error(y_test,y_pred)

np.float64(4005.9263101681754)

In [306]:
# mean absolute percentage error: returned as a fraction, not a percentage (e.g. 0.06 means about 6%)
mean_absolute_percentage_error(y_test,y_pred)

np.float64(0.06384602996141629)

In [307]:
# mean squared error: in squared salary units; take the square root (RMSE) to compare with MAE
mean_squared_error(y_test,y_pred)

np.float64(24141421.671440955)