In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Switch matplotlib into its hand-drawn "xkcd" sketch style; called outside a
# `with` block, so it stays in effect for the figures drawn later on.
plt.xkcd()

# Load up Boston Dataset

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# plain import fails on current releases. Keep the name available either way:
# use the bundled loader when it exists, otherwise rebuild the same bundle from
# the original StatLib file (the recipe scikit-learn published with the
# deprecation notice).
# NOTE: scikit-learn retired this dataset over ethical concerns about its "B"
# feature; it is kept here only so this walkthrough still runs.
try:
    from sklearn.datasets import load_boston
except ImportError:
    def load_boston():
        """Return the Boston housing data as a dict-like bundle.

        Fallback for scikit-learn releases that no longer ship the dataset.
        Requires network access to download the raw file.

        Returns:
            A ``Bunch`` with the keys this notebook reads: ``data`` (feature
            matrix), ``target`` (MEDV values), ``feature_names`` and ``DESCR``.
        """
        from sklearn.utils import Bunch

        source_url = 'http://lib.stat.cmu.edu/datasets/boston'
        raw = pd.read_csv(source_url, sep=r'\s+', skiprows=22, header=None)
        # Each record is wrapped over two physical lines: the first holds 11
        # features, the second holds the last 2 features plus the target.
        features = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])
        target = raw.values[1::2, 2]
        feature_names = np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ])
        description = (
            'Boston house prices dataset (Harrison & Rubinfeld, 1978).\n'
            'Reloaded from ' + source_url + ' because this scikit-learn '
            'release no longer bundles it.\n'
            'Target MEDV: median value of owner-occupied homes.'
        )
        return Bunch(
            data=features,
            target=target,
            feature_names=feature_names,
            DESCR=description,
        )

In [None]:
# Load the dataset bundle; the cells below index it by key
# ('feature_names', 'DESCR', 'data', 'target').
data = load_boston()

## Quick data exploration and explanation

In [None]:
# List the entries the dataset bundle exposes
data.keys()

In [None]:
# Names of the feature columns; reused below as the DataFrame column labels
data['feature_names']

In [None]:
# print() rather than a bare expression, so any line breaks in the description
# are rendered instead of being shown as escaped "\n"
print(data['DESCR'])

## For simplicity let's make a DataFrame

In [None]:
# Wrap the raw feature matrix in a DataFrame, labelling columns by feature name
df = pd.DataFrame(
    data=data['data'],
    columns=data['feature_names'],
)

In [None]:
# Peek at the first rows of the feature-only frame
df.head()

### We also need to add the target (MEDV) in there

In [None]:
# Attach the target as a 'MEDV' column so features and target live in one frame
df['MEDV'] = data['target']

In [None]:
# Confirm the new MEDV column now appears alongside the features
df.head()

## Let's keep this simple and use only one input variable (RM) and the output (MEDV)

In [None]:
# Two-column view: the single feature RM and the target MEDV
room_df = df.loc[:, ['RM', 'MEDV']]

### Linear Regression

Fit a line to the data.  Use that line to predict the values of unseen data

In [None]:
# Scatter of RM against MEDV to eyeball how linear the relationship looks
room_df.plot(kind='scatter', x='RM', y='MEDV')

### This is a bit of a strange hack

The fit won't accept a 1-D set of data, so the single feature is wrapped in a one-column DataFrame.  Normally this isn't an issue, as more than one feature will be used.

In [None]:
# Double brackets keep RM as a one-column DataFrame (2-D) instead of a Series
rm_df = df[['RM']]

## Split up our data into test and train

We need to be able to evaluate our model, so we will hold out some of the data for testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    rm_df,
    room_df['MEDV'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: training features and targets should cover the same rows
X_train.shape[0], y_train.count()

In [None]:
# Same check for the held-out portion
X_test.shape[0], y_test.count()

## Now time to fit the model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares fit on the training rows (single feature: RM)
model = (
    LinearRegression()
    .fit(X_train, y_train)
)

In [None]:
# Score on the held-out rows; for scikit-learn regressors score() reports R^2
# (the coefficient of determination).
model.score(X_test, y_test)

In [None]:
# Predictions for the held-out rows, compared against y_test in the plot below
y_pred = model.predict(X_test)

In [None]:
# Predicted vs. actual MEDV on the held-out rows, with labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(y_test, y_pred, '.', label='test samples')

# Reference diagonal: a perfect prediction would fall exactly on this line
x = np.linspace(0, 60, 100)
y = x
ax.plot(x, y, label='perfect prediction')

ax.set_xlabel('Actual MEDV')
ax.set_ylabel('Predicted MEDV')
ax.set_title('Linear regression on RM: predicted vs. actual')
ax.legend()
plt.show()

## Now let's try some more variables

In [None]:
# Reminder of every available column before picking a larger feature subset
df.head()

In [None]:
# Four hand-picked features plus the target, with the target kept last
sub_df = df.loc[
    :,
    ['CRIM', 'RM', 'PTRATIO', 'ZN', 'MEDV'],
]

In [None]:
# Preview the feature columns (everything except the last column); only the
# first rows are shown rather than displaying the whole frame.
sub_df.iloc[:, :-1].head()

In [None]:
# Select the features by dropping the target by name rather than by position,
# so the split does not silently depend on MEDV being the last column.
# Same 80/20 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    sub_df.drop(columns='MEDV'),
    sub_df['MEDV'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Non-null counts for the training split: one count per feature column for
# X_train, a single count for y_train.
X_train.count(), y_train.count()

In [None]:
# Same non-null counts for the held-out split
X_test.count(), y_test.count()

In [None]:
# Refit from scratch on the multi-feature training split; replaces the
# earlier `model`.
model = (
    LinearRegression()
    .fit(X_train, y_train)
)

In [None]:
# R^2 on the held-out rows for the multi-feature model (what score() reports
# for scikit-learn regressors).
model.score(X_test, y_test)

In [None]:
# Held-out predictions from the multi-feature model, used by the plot below
y_pred = model.predict(X_test)

In [None]:
# Predicted vs. actual MEDV on the held-out rows, with labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(y_test, y_pred, '.', label='test samples')

# Reference diagonal: a perfect prediction would fall exactly on this line
x = np.linspace(0, 60, 100)
y = x
ax.plot(x, y, label='perfect prediction')

ax.set_xlabel('Actual MEDV')
ax.set_ylabel('Predicted MEDV')
ax.set_title('Linear regression on CRIM, RM, PTRATIO, ZN: predicted vs. actual')
ax.legend()
plt.show()

## Ok fine! Add all of the features

In [None]:
# Use every feature: drop the target by name rather than slicing off "the last
# column", so the split does not silently depend on column order.
# Same 80/20 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='MEDV'),
    df['MEDV'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Refit once more, this time on the full feature set
model = (
    LinearRegression()
    .fit(X_train, y_train)
)

In [None]:
# R^2 on the held-out rows for the all-features model (what score() reports
# for scikit-learn regressors).
model.score(X_test, y_test)

In [None]:
# Held-out predictions from the all-features model, used by the plot below
y_pred = model.predict(X_test)

In [None]:
# Predicted vs. actual MEDV on the held-out rows, with labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(y_test, y_pred, '.', label='test samples')

# Reference diagonal: a perfect prediction would fall exactly on this line
x = np.linspace(0, 60, 100)
y = x
ax.plot(x, y, label='perfect prediction')

ax.set_xlabel('Actual MEDV')
ax.set_ylabel('Predicted MEDV')
ax.set_title('Linear regression on all features: predicted vs. actual')
ax.legend()
plt.show()