In [76]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [77]:
# Load the feature table and the target table from their CSV files.
# Both files are read relative to the notebook's working directory.
dfX, dfY = map(pd.read_csv, ("bostonWeights.csv", "bostonTarget.csv"))

# Show both frames as plain text to confirm they loaded.
print(dfX, dfY)

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  
0       15.3  396.90   4.98  
1       17.8  396.90   9.14  
2       1

In [78]:
# Summary statistics (count, mean, std, min, quartiles, max) for the target.
# Per the original author's note, the target is MEDV and each value must be
# multiplied by 1000 to get the actual price.
dfY.describe()

Unnamed: 0,0
count,506.0
mean,22.532806
std,9.197104
min,5.0
25%,17.025
50%,21.2
75%,25.0
max,50.0


In [79]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# input feature column in dfX.
dfX.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [80]:
# Partition features and targets into training and test subsets.
# train_test_split returns a list of four objects, in this order:
#   xTrain, xTest, yTrain, yTest
# 30% of the rows are held out for testing; random_state pins the shuffle
# so the same split is produced on every run.
xTrain, xTest, yTrain, yTest = train_test_split(
    dfX,
    dfY,
    test_size=0.30,
    random_state=42,
)

In [81]:
# Sanity check: print the four pieces returned by the split, in the order
# train features, test features, train targets, test targets.
# NOTE(review): this prints whole frames as plain text; inspecting `.shape`
# or `.head()` of each piece would be a lighter-weight check.
print(xTrain, xTest, yTrain, yTest)

         CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS   RAD    TAX  \
5     0.02985   0.0   2.18   0.0  0.458  6.430  58.7  6.0622   3.0  222.0   
116   0.13158   0.0  10.01   0.0  0.547  6.176  72.5  2.7301   6.0  432.0   
45    0.17142   0.0   6.91   0.0  0.448  5.682  33.8  5.1004   3.0  233.0   
16    1.05393   0.0   8.14   0.0  0.538  5.935  29.3  4.4986   4.0  307.0   
468  15.57570   0.0  18.10   0.0  0.580  5.926  71.0  2.9084  24.0  666.0   
..        ...   ...    ...   ...    ...    ...   ...     ...   ...    ...   
106   0.17120   0.0   8.56   0.0  0.520  5.836  91.9  2.2110   5.0  384.0   
270   0.29916  20.0   6.96   0.0  0.464  5.856  42.1  4.4290   3.0  223.0   
348   0.01501  80.0   2.01   0.0  0.435  6.635  29.7  8.3440   4.0  280.0   
435  11.16040   0.0  18.10   0.0  0.740  6.629  94.6  2.1247  24.0  666.0   
102   0.22876   0.0   8.56   0.0  0.520  6.405  85.4  2.7147   5.0  384.0   

     PTRATIO       B  LSTAT  
5       18.7  394.12   5.21  
116     17.8  3

In [82]:
# Create an ordinary least-squares linear regression model and fit it on
# the training split. `fit` trains the estimator in place, so `lin` is the
# fitted model afterwards.
lin = LinearRegression()
lin.fit(xTrain, yTrain)

In [83]:
# Run the fitted model on the held-out test features; the predictions are
# kept so they can be compared against the true test targets (yTest).
yTestPredictions = lin.predict(xTest)

In [84]:
# Score the model on the held-out test set: mean squared error between the
# true targets and the predictions (in squared target units; lower is better).
# mean_squared_error is imported explicitly from sklearn.metrics instead of
# being reached through `skl.metrics`: `import sklearn as skl` alone does not
# guarantee that the `metrics` submodule has been loaded, so the attribute
# lookup only worked as a side effect of other sklearn imports.
print(mean_squared_error(yTest, yTestPredictions))

21.51744423117743
