In [1]:
# import necessary libraries for data analysis and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Read the Boston housing data from the csv file into a pandas DataFrame; the first csv column
# holds the row labels, so it becomes the index instead of a data column.
df = pd.read_csv(filepath_or_buffer='Boston.csv', index_col=0)

# sanity check that the file opened: show the first five rows
print(df.head(5))

      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
1  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
2  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
3  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
4  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
5  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

    black  lstat  medv  
1  396.90   4.98  24.0  
2  396.90   9.14  21.6  
3  392.83   4.03  34.7  
4  394.63   2.94  33.4  
5  396.90   5.33  36.2  


In [3]:
# Define a function used to identify the best feature variables that can serve as the predictors
# for the given target variable to train a suitable multiple linear regression model.
def get_indep_feats(features, target, n=3, data=None):
    """Return the features most strongly correlated with the target variable.

    The Pearson correlation coefficient between each feature and the target is computed
    (and printed), then the features are ranked by the absolute value of that coefficient.

    Params:
        features -> list of strings, candidate feature (column) names
        target -> str, name of the target column
        n -> int, how many feature names to return (default 3)
        data -> DataFrame holding the columns; when None the notebook-level `df` is used

    Returns a list of at most `n` feature names, strongest correlation first. Fewer than `n`
    names are returned when fewer features are supplied.
    """
    frame = df if data is None else data

    # Map feature name -> correlation coefficient. Keying by the feature name (not by the
    # coefficient) keeps every feature even when two of them share the same coefficient.
    corr_by_feat = {}
    for feat in features:
        corr_coef = np.corrcoef(frame[feat], frame[target])[0,1]
        corr_by_feat[feat] = corr_coef
        print(f"Correlation coefficient between '{feat}' and '{target}':", corr_coef)

    def strength(feat):
        # A NaN coefficient (e.g. from a constant column) carries no information and cannot be
        # ordered reliably, so rank it below every real coefficient.
        magnitude = abs(corr_by_feat[feat])
        return -1.0 if np.isnan(magnitude) else magnitude

    # sorted() is stable, so features with equal strength keep the order they were given in
    ranked_feats = sorted(corr_by_feat, key=strength, reverse=True)

    return ranked_feats[:n]

In [4]:
# Fit a multiple linear regression model for the target variable NOX using three feature
# variables, then display the model summary and the fitted coefficients.

target = 'nox'

# every column except the target is a candidate feature variable
features = [col for col in df.columns if col != target]

# keep the three features most strongly correlated with the target
indep_feats = get_indep_feats(features, target)

# display each predictor/feature for NOX
print('\nThree independent features that can be used for NOX:', indep_feats, '\n')

# dependent (target) variable
y = df[target]

# independent (feature) variables, with a constant column added so the model estimates an intercept
X = sm.add_constant(df[indep_feats])

# ordinary least squares fit of the multiple linear regression model
model = sm.OLS(y, X).fit()

# detailed statistics for the fitted model, followed by its coefficients
print(model.summary())
print('\nModel Parameters:\n', model.params)

Correlation coefficient between 'crim' and 'nox': 0.42097171139245637
Correlation coefficient between 'zn' and 'nox': -0.516603707827984
Correlation coefficient between 'indus' and 'nox': 0.763651446920915
Correlation coefficient between 'chas' and 'nox': 0.09120280684249514
Correlation coefficient between 'rm' and 'nox': -0.3021881878495937
Correlation coefficient between 'age' and 'nox': 0.7314701037859587
Correlation coefficient between 'dis' and 'nox': -0.7692301132258279
Correlation coefficient between 'rad' and 'nox': 0.6114405634855777
Correlation coefficient between 'tax' and 'nox': 0.6680232004030229
Correlation coefficient between 'ptratio' and 'nox': 0.1889326771127675
Correlation coefficient between 'black' and 'nox': -0.3800506377924005
Correlation coefficient between 'lstat' and 'nox': 0.5908789208808463
Correlation coefficient between 'medv' and 'nox': -0.42732077237328264

Three independent features that can be used for NOX: ['dis', 'indus', 'age'] 

                   

In [5]:
# Recorded model parameters from test runs of the block above (target NOX), kept for discussion in the assignment 1 report.

# Model fitted using the three features with the largest absolute correlation coefficients:
#    Model Parameters:
#        const    0.475795
#        dis     -0.017022
#        indus    0.006423
#        age      0.001050

# Model fitted using the three features with the smallest absolute correlation coefficients:
#    Model Parameters:
#        const      0.731637
#        chas       0.059008
#        ptratio    0.005699
#        rm        -0.045541

# Model fitted using three features with mid-ranked absolute correlation coefficients:
#    Model Parameters:
#        const    0.347594
#        tax      0.000345
#        lstat    0.005309
#        crim    -0.000279

In [6]:
# Fit a multiple linear regression model for the target variable MEDV using three feature
# variables, then display the model summary and the fitted coefficients.

target = 'medv'

# every column except the target is a candidate feature variable
features = [col for col in df.columns if col != target]

# keep the three features most strongly correlated with the target
indep_feats = get_indep_feats(features, target)

# display each predictor/feature for MEDV
print('\nThree independent features that can be used for MEDV:', indep_feats, '\n')

# dependent (target) variable
y = df[target]

# independent (feature) variables, with a constant column added so the model estimates an intercept
X = sm.add_constant(df[indep_feats])

# ordinary least squares fit of the multiple linear regression model
model = sm.OLS(y, X).fit()

# detailed statistics for the fitted model, followed by its coefficients
print(model.summary())
print('\nModel Parameters:\n', model.params)

Correlation coefficient between 'crim' and 'medv': -0.38830460858681165
Correlation coefficient between 'zn' and 'medv': 0.36044534245054305
Correlation coefficient between 'indus' and 'medv': -0.4837251600283728
Correlation coefficient between 'chas' and 'medv': 0.17526017719029846
Correlation coefficient between 'nox' and 'medv': -0.42732077237328264
Correlation coefficient between 'rm' and 'medv': 0.6953599470715393
Correlation coefficient between 'age' and 'medv': -0.3769545650045963
Correlation coefficient between 'dis' and 'medv': 0.24992873408590394
Correlation coefficient between 'rad' and 'medv': -0.3816262306397781
Correlation coefficient between 'tax' and 'medv': -0.46853593356776724
Correlation coefficient between 'ptratio' and 'medv': -0.5077866855375622
Correlation coefficient between 'black' and 'medv': 0.33346081965706653
Correlation coefficient between 'lstat' and 'medv': -0.737662726174015

Three independent features that can be used for MEDV: ['lstat', 'rm', 'ptratio