**Polynomial** **Regression**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

In [2]:
df= pd.read_csv('Amazon_Books.csv')

In [3]:
df.head()

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350.0,8.0,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052.0,22.0,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979.0,15.0,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424.0,6.0,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665.0,12.0,2019,Non Fiction


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         550 non-null    object 
 1   Author       550 non-null    object 
 2   User Rating  550 non-null    float64
 3   Reviews      547 non-null    float64
 4   Price        547 non-null    float64
 5   Year         550 non-null    int64  
 6   Genre        550 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 30.2+ KB


In [5]:
df.isnull().sum()

Name           0
Author         0
User Rating    0
Reviews        3
Price          3
Year           0
Genre          0
dtype: int64

In [6]:
# Handling missing values
numerical_cols = df.select_dtypes(include=["float64","int64"]).columns
categorical_cols = df.select_dtypes(include="object").columns

In [7]:
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

In [8]:
# Converting object columns to numerical using OneHotEncoder
encoder = OneHotEncoder()
encoded_cols = encoder.fit_transform(df[categorical_cols]).toarray()

In [9]:
feature_names = encoder.get_feature_names_out(categorical_cols)

In [10]:
df = pd.concat([df[numerical_cols], pd.DataFrame(encoded_cols, columns=feature_names)], axis=1)
df

Unnamed: 0,User Rating,Reviews,Price,Year,Name_10-Day Green Smoothie Cleanse,Name_11/22/63: A Novel,Name_12 Rules for Life: An Antidote to Chaos,Name_1984 (Signet Classics),"Name_5,000 Awesome Facts (About Everything!) (National Geographic Kids)",Name_A Dance with Dragons (A Song of Ice and Fire),...,Author_Tucker Carlson,Author_Veronica Roth,Author_W. Cleon Skousen,Author_Walter Isaacson,Author_William Davis,Author_William P. Young,Author_Wizards RPG Team,Author_Zhi Gang Sha,Genre_Fiction,Genre_Non Fiction
0,4.7,17350.0,8.0,2016,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4.6,2052.0,22.0,2011,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4.7,18979.0,15.0,2018,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4.7,21424.0,6.0,2017,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4.8,7665.0,12.0,2019,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,4.9,9413.0,8.0,2019,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
546,4.7,14331.0,8.0,2016,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
547,4.7,14331.0,8.0,2017,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
548,4.7,14331.0,8.0,2018,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Splitting the data
x= df.iloc[:, 2:-2].values
y = df.iloc[:, -2].values

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=40)

In [13]:
# Feature Scaling
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

In [14]:
print("x_train_scaled:\n", x_train_scaled)
print("\nx_test_scaled:\n", x_test_scaled)

x_train_scaled:
 [[-0.1127756   1.52537617  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 [ 2.33334312 -0.31945051  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 [-0.20013699 -1.54933496  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 ...
 [ 0.14930855  0.91043394  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 [-1.1611122  -0.01197939  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 [-0.46222114  0.60296283  0.         ... -0.05103104 -0.05103104
  -0.07226294]]

x_test_scaled:
 [[-0.46222114  0.60296283  1.         ... -0.05103104 -0.05103104
  -0.07226294]
 [-0.54958252 -0.01197939  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 [-0.81166667  0.60296283  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 ...
 [-0.28749837 -0.62692162  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 [-0.46222114  0.60296283  0.         ... -0.05103104 -0.05103104
  -0.07226294]
 [ 0.93556099 -1.54933496  0.         ... -0.05103104 -0.0510310

In [15]:
# Linear Regression and prediction
lin_reg = LinearRegression()
lin_reg.fit(x, y)

In [16]:
# Training the Polynomial Regression model on the whole dataset
poly_reg = PolynomialFeatures(degree = 2)
x_poly = poly_reg.fit_transform(x)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(x_poly, y)

In [17]:
# Visualising the Linear Regression results

In [18]:
# Visualising the Polynomial Regression results

In [19]:
# Visualising the Polynomial Regression results (for higher resolution and smoother curve)

In [20]:
# Predicting a new result with Linear Regression
input_data_601 = np.array([[6.5] + [0.0] * 600])
prediction_601 = lin_reg.predict(input_data_601)
# Print the prediction
print(prediction_601)

[0.37803676]


In [21]:
# Predicting a new result with Polynomial Regression
lin_reg_2.predict(poly_reg.fit_transform(input_data_601))

array([0.55489591])

In [33]:
# Predicting the Test set results
y_pred = lin_reg_2.predict(poly_reg.transform(x_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.to_numpy().reshape(len(y_test),1)),1))

[[ 2.62e-13  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [-3.52e-13  0.00e+00]
 [ 1.87e-13  0.00e+00]
 [-7.24e-14  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [-1.72e-14  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [-6.28e-14  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 2.95e-13  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [-6.22e-15  0.00e+00]
 [ 2.07e-13  0.00e+00]
 [ 1.11e-12  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.55e-13  0.00e+00]
 [ 8.47e-13  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 3.31e-13  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 4.30e-14  0.00e+00]
 [-4.46e-13  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 7.32e-14  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 6.36e-13  0.00e+00]
 [ 3.64e-13  0.00e+00]
 [ 3.13e-13  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 2.76e-14  0.00e+00]
 [ 2.89e-13  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 1.00e+00  1.00e+00]
 [ 8.92e-14  0.00e+00]
 [ 1.00e+00  1.00e+00]
 [-1.26e-13  0.00e+00]
 [ 1.77e-13

In [35]:
# Evaluating the Model Performance
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

1.0