# Regression with Scikit-Learn

### Regression on random generated dataset

#### Import Libraries

In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

#### Create Regression dataset

In [2]:
X, y = datasets.make_regression(
                        n_samples=1000,
                        n_features=10,
                        n_informative=8,
                        noise=1,
                        random_state=43)

In [3]:
X = pd.DataFrame(X)
X.columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
X

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10
0,-2.409355,1.715185,-2.318154,0.449544,0.306028,-0.122155,-0.926671,0.679602,1.175889,0.603021
1,-0.844992,-0.213130,-0.959884,1.692580,-0.575497,0.552339,0.710395,-0.996281,0.192422,0.859260
2,-0.953270,-0.060433,0.122299,-0.467788,0.003849,-1.396142,-1.877393,-0.078378,-1.056249,2.734287
3,-1.926364,-2.340477,-0.332578,-0.419339,0.088185,-1.997641,-1.199337,0.025059,0.427709,-0.847966
4,-0.406896,-0.089699,0.648932,2.713480,-1.497093,-1.314372,0.161335,1.392263,0.906815,0.407956
...,...,...,...,...,...,...,...,...,...,...
995,0.537434,0.952931,1.458873,1.051285,0.263045,-0.367154,-0.567839,0.235110,-0.046830,0.202983
996,0.312891,0.997110,0.874307,-1.921856,-1.030479,-1.639469,-0.413027,-0.324526,-1.549008,0.351511
997,-0.516363,1.378709,-1.281165,1.746962,-1.252374,-0.471356,0.919734,-2.360842,0.952831,1.037480
998,-1.920197,-0.433793,0.228492,0.730748,-1.543144,-0.156774,1.290942,0.530109,0.914998,0.832432


#### Split dataset into training dataset and testing dataset

In [4]:
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)

#### Regression models

In [5]:
lr = LinearRegression()
ada = AdaBoostRegressor()
rfr = RandomForestRegressor()
dtr = DecisionTreeRegressor()
svr = SVR()

#### Train Model on training dataset

In [6]:
models = [lr, ada, rfr, dtr, svr]
for model in models:
    model.fit(train_X,train_y)

#### Prediction on testing dataset

In [7]:
def predict(X, y, model):
    pred = model.predict(X)
    print(mean_squared_error(y, pred))

In [8]:
for model in models:
    print(model)
    predict(test_X, test_y, model)

LinearRegression()
1.0740034790446282
AdaBoostRegressor()
6248.396538894281
RandomForestRegressor()
4081.4249813914353
DecisionTreeRegressor()
9882.233958403684
SVR()
17777.597237156726


### Regression on California housing price

#### Load Dataset

In [9]:
data = datasets.fetch_california_housing()

In [10]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [11]:
y = data.target

In [12]:
X = pd.DataFrame(data.data, columns=data.feature_names)

In [13]:
X.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [14]:
X.shape

(20640, 8)

#### Split Dataset

In [15]:
train_X, test_X, train_y, test_y = train_test_split(X, y)

#### Regression Model

In [16]:
lr = LinearRegression()
ada = AdaBoostRegressor()
rfr = RandomForestRegressor()
dtr = DecisionTreeRegressor()
svr = SVR()

#### Train Model on training dataset

In [17]:
models = [lr, ada, rfr, dtr, svr]
for model in models:
    model.fit(train_X,train_y)

#### Prediction on testing dataset

In [18]:
for model in models:
    print(model)
    predict(test_X, test_y, model)

LinearRegression()
0.5168386529359613
AdaBoostRegressor()
0.7544915627572881
RandomForestRegressor()
0.25670733006353624
DecisionTreeRegressor()
0.5404085366168411
SVR()
1.392066195201188
