In [1]:
#Polynomial Regression
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
#Defining dataset
# Load the CSV into a DataFrame; the path is relative, so the file must sit in
# the notebook's working directory.
dataset = pd.read_csv('Position_Salaries.csv')

In [3]:
dataset

Unnamed: 0,Position,Level,Salary
0,Business Analyst,1,45000
1,Junior Consultant,2,50000
2,Senior Consultant,3,60000
3,Manager,4,80000
4,Country Manager,5,110000
5,Region Manager,6,150000
6,Partner,7,200000
7,Senior Partner,8,300000
8,C-level,9,500000
9,CEO,10,1000000


In [4]:
#Extracting independent and dependent variables
# iv: the first two columns (Position, Level) as an object array.
# dv: the third column (Salary).
iv = dataset.iloc[:,:2].values
dv = dataset.iloc[:,2].values

In [5]:
iv

array([['Business Analyst', 1],
       ['Junior Consultant', 2],
       ['Senior Consultant', 3],
       ['Manager', 4],
       ['Country Manager', 5],
       ['Region Manager', 6],
       ['Partner', 7],
       ['Senior Partner', 8],
       ['C-level', 9],
       ['CEO', 10]], dtype=object)

In [11]:
#Imputation is not required as there are no NULL values in the dataset
#Encoding to convert the text column (column 0 of iv) to numbers
#Step 1 - LabelEncoder: replace each distinct string in column 0 with an integer code
# NOTE: column 0 of iv is overwritten in place and iv itself is rebound below,
# so this cell is not safe to re-run without re-running the extraction cell first.
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
encoder = LabelEncoder()
iv[:,0] = encoder.fit_transform(iv[:,0])
#Step 2 - OneHotEncoder: expand the integer codes in column 0 into indicator columns
# NOTE(review): the `categorical_features` argument only exists in older
# scikit-learn releases (deprecated in 0.20, removed in 0.22 per the release
# notes - confirm against the installed version); newer releases need
# sklearn.compose.ColumnTransformer instead.
ohe = OneHotEncoder(categorical_features=[0])
iv = ohe.fit_transform(iv).toarray()

In [12]:
# Splitting the dataset into the Training set and Test set
# train_test_split is imported from sklearn.model_selection; the legacy
# sklearn.cross_validation module is no longer shipped with scikit-learn.
# test_size=0.2 holds out 20% of the rows; random_state pins the shuffle so the
# split is reproducible.
from sklearn.model_selection import train_test_split
iv_train, iv_test, dv_train, dv_test = train_test_split(iv, dv, test_size=0.2, random_state=10)




In [13]:
dv_test

array([500000,  60000], dtype=int64)

In [14]:
#Applying Linear regression
# Fit an ordinary least-squares model on the training split only.
# NOTE(review): after encoding, iv has 11 columns while the dataset has only
# 10 rows (fewer still after the hold-out), so the fit is likely
# under-determined - treat the coefficients and score with caution.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(iv_train, dv_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
#Finding Intercept and Coefficient
lin_reg.intercept_


-198429.67244701402

In [16]:
lin_reg.coef_

array([ 1.61512524e+05,  5.82076609e-11,  3.79258189e+05, -1.01156069e+05,
        8.45953757e+04, -4.92389210e+04, -1.74990366e+05, -1.43073218e+05,
        0.00000000e+00, -1.56907514e+05,  8.19171484e+04])

In [17]:
dv_pred = lin_reg.predict(iv_test)

In [18]:
dv_pred

array([538824.6628131 ,  47321.77263969])

In [19]:
lin_reg.score(iv_test,dv_test)

0.9827676457484423

In [20]:
## Cross-check the score with R2: r2_score on the stored predictions should
## match the value reported by lin_reg.score(iv_test, dv_test).

from sklearn.metrics import r2_score

r2_score(dv_test, dv_pred)

0.9827676457484423

In [21]:
#Calculate RMSE
def rmse(predictions, targets):
    """Return the root mean squared error between predictions and targets.

    Parameters
    ----------
    predictions : array-like
        Predicted values (list, tuple, NumPy array, ...).
    targets : array-like
        Reference values; must have a shape that broadcasts against
        ``predictions``.

    Returns
    -------
    numpy.floating
        ``sqrt(mean((predictions - targets) ** 2))``.
    """
    # Coerce to float arrays so plain Python sequences are accepted and
    # narrow integer dtypes cannot overflow when the differences are squared.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predictions - targets) ** 2))

In [22]:
# Report the hold-out RMSE. Arguments are passed in the order the helper
# declares them: rmse(predictions, targets).
print("Root Mean squared error: %.2f"
      % rmse(dv_pred, dv_test))

Root Mean squared error: 28879.85


In [23]:
#Add the polynomial Features in the feature set
from sklearn.preprocessing import PolynomialFeatures
# include_bias must be the boolean False. The string 'false' is truthy, so it
# does not switch the constant column off (and may be rejected outright by
# parameter validation in newer scikit-learn releases).
# NOTE(review): degree=6 over the 11 encoded columns generates a very large
# number of polynomial terms - confirm this degree is intended.
poly_iv = PolynomialFeatures(degree=6, include_bias=False)


In [24]:
# Expand the encoded feature matrix into polynomial terms.
# NOTE(review): despite the `_train_` in the name, this fits and transforms the
# full `iv` matrix rather than `iv_train` - confirm which was intended.
iv_train_poly = poly_iv.fit_transform(iv)

In [25]:
iv_train_poly
iv[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [26]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, iv, dv):
    """Plot training and validation RMSE as the training set grows.

    The data passed in is split 80/20 (fixed seed) into a training and a
    validation part. The model is then refitted on the first ``m`` training
    rows for every ``m`` from 1 up to the full training size, and the RMSE on
    those ``m`` rows and on the validation part is recorded and plotted.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        repeatedly and is left fitted on the full training part.
    iv : array-like
        Feature matrix, one row per sample.
    dv : array-like
        Target values aligned with the rows of ``iv``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Split the arguments locally so the function does not depend on
    # notebook-level variables and always has a validation part.
    iv_train, iv_val, dv_train, dv_val = train_test_split(
        iv, dv, test_size=0.2, random_state=10)

    # Include len(iv_train) itself so the last point uses every training row.
    train_sizes = list(range(1, len(iv_train) + 1))
    train_errors = []
    val_errors = []
    for m in train_sizes:
        model.fit(iv_train[:m], dv_train[:m])
        dv_train_predict = model.predict(iv_train[:m])
        dv_val_predict = model.predict(iv_val)
        train_errors.append(mean_squared_error(dv_train[:m], dv_train_predict))
        val_errors.append(mean_squared_error(dv_val, dv_val_predict))

    # Plot against the real training sizes so the x-axis label is accurate.
    plt.plot(train_sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(train_sizes, np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
    plt.show()