PolynomialLinerRegression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('dataset\\Position_Salaries.csv')
data

Unnamed: 0,Position,Level,Salary
0,Business Analyst,1,45000
1,Junior Consultant,2,50000
2,Senior Consultant,3,60000
3,Manager,4,80000
4,Country Manager,5,110000
5,Region Manager,6,150000
6,Partner,7,200000
7,Senior Partner,8,300000
8,C-level,9,500000
9,CEO,10,1000000


In [3]:
sorted(data.Salary)

[45000, 50000, 60000, 80000, 110000, 150000, 200000, 300000, 500000, 1000000]

In [4]:
# Steps
#1. Arrange your data in ascending order
#2. Calculate Q1 (the 1st quarter) Q1,Q3 = np.percentile(sample.Salary,[25,75])
#3. Calculate Q3 (the 3rd quarter)
#4. Find IQR = Q3 -Q1
#5. Find lower range = Q1- (1.5*IQR)
#6. Find uppser range = Q3 + (1.5*IQR)

In [5]:
Q1,Q3 = np.percentile(data.Salary,[25,75])

In [6]:
Q1

65000.0

In [7]:
Q3

275000.0

In [8]:
IQR = Q3 -Q1
IQR

210000.0

In [9]:
low_range = Q1- (1.5*IQR)
upper_range = Q3 + (1.5*IQR)

In [10]:
low_range

-250000.0

In [11]:
upper_range

590000.0

In [12]:
data.drop(data[ (data.Salary > upper_range) | (data.Salary < low_range) ].index , inplace=True)
data

Unnamed: 0,Position,Level,Salary
0,Business Analyst,1,45000
1,Junior Consultant,2,50000
2,Senior Consultant,3,60000
3,Manager,4,80000
4,Country Manager,5,110000
5,Region Manager,6,150000
6,Partner,7,200000
7,Senior Partner,8,300000
8,C-level,9,500000


In [13]:
# Seperate your data as Features and Lable
features = data.iloc[:,[1]].values
features

array([[1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]], dtype=int64)

In [14]:
label = data.iloc[:,2].values
label

array([ 45000,  50000,  60000,  80000, 110000, 150000, 200000, 300000,
       500000], dtype=int64)

In [15]:
print(features.shape)
print(label.shape)

(9, 1)
(9,)


In [16]:
label=label.reshape(-1,1)
label.shape

(9, 1)

In [17]:
# To create  Polynomial Features (degree 3)
from sklearn.preprocessing import PolynomialFeatures

polyFeatures = PolynomialFeatures(degree=3)
features  = polyFeatures.fit_transform(features)

In [18]:
features

array([[  1.,   1.,   1.,   1.],
       [  1.,   2.,   4.,   8.],
       [  1.,   3.,   9.,  27.],
       [  1.,   4.,  16.,  64.],
       [  1.,   5.,  25., 125.],
       [  1.,   6.,  36., 216.],
       [  1.,   7.,  49., 343.],
       [  1.,   8.,  64., 512.],
       [  1.,   9.,  81., 729.]])

In [19]:
label

array([[ 45000],
       [ 50000],
       [ 60000],
       [ 80000],
       [110000],
       [150000],
       [200000],
       [300000],
       [500000]], dtype=int64)

In [20]:
# Create training and testing set
# Training set will be used to test our model (learning)
# Testing set will be used to test our model accuracy/genralization (testing)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features,
                                                    label,
                                                    test_size=0.2,
                                                    random_state=1)

In [21]:
#Now we will create our Model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train) 

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
#Check the generalization
print(lr.score(X_train, y_train)) #Known data
print(lr.score(X_test, y_test)) #Unknown data

0.9973273513268982
0.9254447046651799


In [23]:
#Technique to do experimental trail and error to find out best random_state
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

for i in range(1,101):
    X_train, X_test, y_train, y_test = train_test_split(features,label,test_size=0.2,random_state=i)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    train_score = lr.score(X_train, y_train)
    test_score = lr.score(X_test, y_test)
    
    if train_score < test_score:
        print('Train Score:{} Test Score:{} Random Seed : {}'.format(train_score,test_score,i))

In [24]:
#Applying K-NN Assuming k = 3
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=3,
                             metric='minkowski',
                             p = 2)
model.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform')

In [25]:
model.score(X_train,y_train)

0.7848810673431247

In [26]:
model.score(X_test,y_test)

0.9720987654320987

In [28]:
#Equation 

#Salary = b0 + b1*(R&D)+b2(Adm)+b4*(Marketing)+b5*(Newyk)+B6*(Florid)   ## b0=intercept and b1=coefficient
print("The Salary based on the lavel = {} + {}*Level".format(lr.intercept_,lr.coef_))

The Salary based on the lavel = [-2474.4566227] + [[     0.          58560.93109112 -16585.21536053   1793.24829195]]*Level


In [None]:
# #Technique to do experimental trail and error to find out best random_state

# from sklearn.neighbors import KNeighborsRegressor

# for i in range(1,101):
#     model = KNeighborsRegressor(n_neighbors=i,
#                              metric='minkowski',
#                              p = 2)
#     model.fit(X_train,y_train)
#     train_score = model.score(X_train, y_train)
#     test_score = model.score(X_test, y_test)
#     if train_score < test_score:
#         print('Train Score:{} Test Score:{} n_neighbors : {}'.format(train_score,test_score,i))