<a href="https://colab.research.google.com/github/iamhariharanvj/Learning-ML/blob/main/Regression_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regression Models

### Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the Dataset

In [None]:
# Read the startups CSV into a DataFrame and preview its first rows.
# The path is relative, so it is resolved against the current working directory.
dataset_path = '50_Startups.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Independent variables: every column except the last one.
feature_frame = df.iloc[:, :-1]
# Dependent variable: the last column only.
target_series = df.iloc[:, -1]

X = np.array(feature_frame.values)
y = target_series.values

In [None]:
# Preview the first five rows of the feature matrix.
X[:5]

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

In [None]:
# Preview the first five target values.
y[:5]

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94])

### Doing One Hot Encoding if necessary

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positions (within the feature matrix) of the categorical columns to encode.
# Column 3 holds the 'State' strings in this dataset; adjust for other data.
columns = [3]

# One-hot encode the listed columns and pass every other column through
# unchanged. The encoded dummy columns come first in the resulting matrix.
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),columns)] , remainder='passthrough')

# Encode from the raw features in `df` rather than from `X` itself, so that
# re-running this cell does not try to re-encode an already-encoded matrix.
X = np.array(ct.fit_transform(df.iloc[:,:-1].values))

In [None]:
# Show only the first rows of the encoded matrix instead of dumping all of it.
X[:5]

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

### Splitting the dataset into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for evaluation. It must be defined before the
# split; otherwise this cell fails with a NameError on a fresh kernel.
test_size = 0.2

# random_state is fixed so that the same rows land in each subset on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = test_size,random_state=0)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate scalers for the features and the target, so that scaled predictions
# can later be mapped back with sc_y.inverse_transform.
sc_X = StandardScaler()
sc_y = StandardScaler()

# Features: learn the scaling on the training rows only, then reuse it for test.
tfm_X_train = sc_X.fit_transform(X_train)
tfm_X_test = sc_X.transform(X_test)

# Target: the scaler works on 2-D input, so reshape the 1-D target into a
# single column first. Again, fit on train and only transform test.
tfm_y_train = sc_y.fit_transform(y_train.reshape(-1, 1))
tfm_y_test = sc_y.transform(y_test.reshape(-1, 1))

### Using Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear model on the unscaled training data; fit() returns the
# estimator itself, so construction and fitting can be chained.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Predict on the unscaled test features. lin_reg was fit on unscaled
# X_train / y_train, so no inverse transform is applied here.
lin_ypred = lin_reg.predict(X_test)

### Using Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree of the polynomial feature expansion.
# NOTE(review): the recorded R² for this model is strongly negative, so this
# degree is probably too high for a dataset this small; consider lowering it.
degree = 5

# Expand the scaled training features into polynomial terms, then fit an
# ordinary linear model on the expanded features against the scaled target.
pf = PolynomialFeatures(degree=degree)
X_train_poly = pf.fit_transform(tfm_X_train)
poly_reg = LinearRegression().fit(X_train_poly, tfm_y_train)
poly_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Apply the already-fitted polynomial expansion to the scaled test features.
X_test_poly = pf.transform(tfm_X_test)
# Predict in scaled target space, then map back to the original target units.
poly_y_pred_scaled = poly_reg.predict(X_test_poly)
poly_y_pred = sc_y.inverse_transform(poly_y_pred_scaled)
poly_y_pred

array([[-143486.99838273],
       [ 116097.58592588],
       [ 122672.82827198],
       [  43426.15926669],
       [ -20668.40884393],
       [ 112642.91595758],
       [  64196.87281008],
       [  97791.50001361],
       [ 133112.41463401],
       [  16164.71883612]])

### Using Support Vector Regression

In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor, trained on the scaled features/target.
svr = SVR(kernel='rbf')

# tfm_y_train is a single-column 2-D array (the shape the scaler produced),
# but SVR.fit expects a 1-D target. Flatten it to avoid the conversion warning.
svr.fit(tfm_X_train,tfm_y_train.ravel())

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
# SVR.predict returns a 1-D array of scaled predictions, while
# StandardScaler.inverse_transform expects 2-D (n_samples, n_features) input.
# Reshape to a single column for the inverse transform, then flatten again so
# svm_y_pred remains 1-D.
svm_y_pred = sc_y.inverse_transform(svr.predict(tfm_X_test).reshape(-1, 1)).ravel()
svm_y_pred

array([104556.64324384, 123328.74535064, 128933.79323066,  88368.84350653,
       143345.54646792, 122377.92156135,  80981.87002344, 102957.37922236,
       119140.36087272, 137052.36540438])

### Using Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix random_state so the fitted tree (and therefore its score) is the same on
# every run, matching the seeded train/test split and random forest.
tree = DecisionTreeRegressor(random_state=0)

# Trees do not need feature scaling, so the unscaled data is used directly.
tree.fit(X_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
# Predict on the unscaled test features; the tree was fit on unscaled
# X_train / y_train, so the predictions are used as returned.
tree_y_pred = tree.predict(X_test)
tree_y_pred

array([101004.64, 141585.52, 141585.52,  78239.91, 182901.99, 107404.34,
        69758.98,  99937.59, 108733.99, 182901.99])

### Using Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 20 trees with a fixed seed, trained on the unscaled data.
# fit() returns the estimator itself, so construction and fitting are chained.
forest = RandomForestRegressor(n_estimators=20, random_state=0).fit(X_train, y_train)
forest

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [None]:
# Predict on the unscaled test features; the forest was fit on unscaled
# X_train / y_train, so the predictions are used as returned.
forest_y_pred = forest.predict(X_test)
forest_y_pred

array([103282.6675, 134605.9415, 136183.654 ,  82186.735 , 182177.4075,
       112090.024 ,  77021.9275,  99415.0115, 111796.943 , 164487.8265])

In [None]:
from sklearn.metrics import r2_score

# Each entry pairs the message to print with that model's test-set predictions
# (all expressed in the original, unscaled target units).
scored_predictions = [
    ('The score of Multiple Regression Model is ', lin_ypred),
    ('The score of Polynomial Regression Model is ', poly_y_pred),
    ('The score of Support Vector Regression Model is ', svm_y_pred),
    ('The score of Decison Tree Regression Model is ', tree_y_pred),
    ('The score of Random Forest Regression Model is ', forest_y_pred),
]

# Report R² of every model against the same held-out targets.
for message, predictions in scored_predictions:
    print(message, r2_score(y_test, predictions))


The score of Multiple Regression Model is  0.9347068473282515
The score of Polynomial Regression Model is  -9.291558377951263
The score of Support Vector Regression Model is  0.6574795005932488
The score of Decison Tree Regression Model is  0.9589635197429642
The score of Random Forest Regression Model is  0.9713422791312948
