# Introduction to this python notebook

In [51]:
"""
What? An example if linear regression

References: https://www.analyticssteps.com/blogs/how-does-linear-and-logistic-regression-work-machine-learning 
"""

'\nWhat? An example if linear regression\nSupervised/Unsupervised? Supervised\nMain library used? sklearn\nDate: 15/11/20\n\nRefernces:\nhttps://www.analyticssteps.com/blogs/how-does-linear-and-logistic-regression-work-machine-learning \n'

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Loading the test data

In [2]:
from sklearn.datasets import load_boston
boston = load_boston()

In [3]:
# Examining the data set
# Essentially it prints out what we can access via "." notations
print(dir(boston))

['DESCR', 'data', 'feature_names', 'filename', 'target']


In [4]:
print("Shape of target matrix: ", boston.data.shape)
print("Shape of target vector: ", boston.target.shape)
print("What features are availables? ", boston.feature_names)

Shape of target matrix:  (506, 13)
Shape of target vector:  (506,)
What features are availables?  ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [6]:
# We then use pandas to create a matrix which maps data and features
bos = pd.DataFrame(boston.data)

bos.columns = boston.feature_names

print("Print panda matrix head \n", bos.head())
print("Panda matrix shape: ", bos.shape)
print("Accessing the data under the INDUS feature \n", bos["INDUS"])

Print panda matrix head 
       CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  
0     15.3  396.90   4.98  
1     17.8  396.90   9.14  
2     17.8  392.83   4.03  
3     18.7  394.63   2.94  
4     18.7  396.90   5.33  
Panda matrix shape:  (506, 13)
Accessing the data under the INDUS feature 
 0       2.31
1       7.07
2       7.07
3       2.18
4       2.18
       ...  
501    11.93
502    11.93
503    11.93
504    11.93
505    11.93
Name: INDUS, Length: 506, dtype: float64


In [8]:
# In this step, we are going to split the dataset into training and test sets.
# 80% training & 20% test
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = 0.2)

print("Xtrain shape", X_train.shape)
print("Xtest shape", X_test.shape)
print("TargetTrain shape", y_train.shape)
print("TargetTest shape", y_test.shape)

Xtrain shape (404, 13)
Xtest shape (102, 13)
TargetTrain shape (404,)
TargetTest shape (102,)


In [10]:
#Now, we are going to fit our dataset using linear regression from sklearnt
sklinreg = LinearRegression(normalize=True)
sklinreg.fit(X_train, y_train)

LinearRegression(normalize=True)

In [11]:
# Now that we create an object of our linear regression we may 
# be curious to see how many methods are availables
print(dir(sklinreg))

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_n_features', '_decision_function', '_estimator_type', '_get_param_names', '_get_tags', '_more_tags', '_preprocess_data', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_residues', '_set_intercept', '_validate_data', 'coef_', 'copy_X', 'fit', 'fit_intercept', 'get_params', 'intercept_', 'n_features_in_', 'n_jobs', 'normalize', 'predict', 'rank_', 'score', 'set_params', 'singular_']


In [13]:
# In the last step, we are printing test results on our dataset.
print("Train accuracy score:", sklinreg.score(X_train, y_train))
print("Test accuracy score:", sklinreg.score(X_test, y_test))

Train accuracy score: 0.7447166716363611
Test accuracy score: 0.719123329925571
