<h2> Import Libraries</h2>

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

## Load the Data
Kaggle hosts a dataset containing the prices at which houses in King County (which includes Seattle) were sold between May 2014 and May 2015.

You can download the dataset from [Kaggle](https://www.kaggle.com/harlfoxem/housesalesprediction) or load it from my [GitHub](https://raw.githubusercontent.com/mGalarnyk/Tutorial_Data/master/King_County/kingCountyHouseData.csv)

The code below loads the dataset.

In [2]:
url = 'https://raw.githubusercontent.com/mGalarnyk/Tutorial_Data/master/King_County/kingCountyHouseData.csv'

# Columns used in this walkthrough: five predictors followed by the target.
features = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors']
columns = features + ['price']

# Read the CSV, keep only those columns (in that order), and work with the
# first ten rows so the tables below stay small enough to read at a glance.
df = pd.read_csv(url).loc[:, columns].head(10)

In [3]:
# Show the ten-row sample that the rest of the notebook works with.
df

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,price
0,3,1.0,1180,5650,1.0,221900.0
1,3,2.25,2570,7242,2.0,538000.0
2,2,1.0,770,10000,1.0,180000.0
3,4,3.0,1960,5000,1.0,604000.0
4,3,2.0,1680,8080,1.0,510000.0
5,4,4.5,5420,101930,1.0,1225000.0
6,3,2.25,1715,6819,2.0,257500.0
7,3,1.5,1060,9711,1.0,291850.0
8,3,1.0,1780,7470,1.0,229500.0
9,3,2.5,1890,6560,2.0,323000.0


In [4]:
# Independent copy of the sample.
# NOTE(review): this binding is reassigned after the train/test split (from
# fullDF) before anything reads it, so this cell looks like a leftover.
fullDFsplit = df.copy(deep=True)

In [5]:

def highlight_color(s):
    '''
    Build the CSS table used to color the features and the target.

    Meant to be passed to ``Styler.apply(..., axis=None)``, which expects a
    DataFrame of CSS strings with the same index and columns as its input.

    Parameters
    ----------
    s : pandas.DataFrame
        The frame being styled. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``s``. Feature columns get a light pink
        background (#FFB6C1), 'price' gets a blanched almond background
        (#FFEBCD), and any other column gets an empty string (no styling).
    '''
    feature_css = 'background-color: #FFB6C1'
    target_css = 'background-color: #FFEBCD'

    css_by_column = dict.fromkeys(
        ['bedrooms','bathrooms','sqft_living','sqft_lot','floors'], feature_css)
    css_by_column['price'] = target_css

    # Start from strings rather than from a copy of the numeric data: writing
    # CSS text into int/float columns depends on dtype upcasting, and looking
    # columns up by name means a frame that lacks some of them (X only, y
    # only) is never enlarged with extra columns.
    return pd.DataFrame(
        {col: css_by_column.get(col, '') for col in s.columns},
        index=s.index,
        columns=s.columns,
    )


# Color every cell via highlight_color (axis=None hands it the whole frame),
# then outline each cell in black. The last expression renders the table.
temp = df.style.apply(highlight_color, axis=None)
temp.set_properties(**{'border-color': 'black', 'border': '1px solid black'})

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,price
0,3,1.0,1180,5650,1.0,221900.0
1,3,2.25,2570,7242,2.0,538000.0
2,2,1.0,770,10000,1.0,180000.0
3,4,3.0,1960,5000,1.0,604000.0
4,3,2.0,1680,8080,1.0,510000.0
5,4,4.5,5420,101930,1.0,1225000.0
6,3,2.25,1715,6819,2.0,257500.0
7,3,1.5,1060,9711,1.0,291850.0
8,3,1.0,1780,7470,1.0,229500.0
9,3,2.5,1890,6560,2.0,323000.0



<h2> Arrange Data into Features Matrix and Target Vector </h2>
What we are predicting is the continuous column "price", the price at which each house was sold. The remaining columns (bedrooms, bathrooms, sqft_living, sqft_lot, and floors) make up the features matrix.

In [6]:
# Features matrix: every predictor column, all rows.
X = df[[
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
]]

In [7]:
# Target: kept as a one-column DataFrame (not a Series) so it can be styled
# and displayed as a table like X.
y = df[['price']]

## Make Separate X and y

In [8]:
# Needed for X
def highlight_color(s):
    '''
    Build the CSS table that colors the feature columns light pink.

    Meant to be passed to ``Styler.apply(..., axis=None)``, which expects a
    DataFrame of CSS strings with the same index and columns as its input.

    Parameters
    ----------
    s : pandas.DataFrame
        The frame being styled. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``s``. Feature columns hold
        'background-color: #FFB6C1'; any other column holds an empty string
        (no styling).
    '''
    feature_css = 'background-color: #FFB6C1'
    feature_columns = {'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors'}

    # Build a fresh string frame instead of overwriting a copy of the numeric
    # data, which would rely on pandas upcasting int/float columns to object.
    return pd.DataFrame(
        {col: (feature_css if col in feature_columns else '') for col in s.columns},
        index=s.index,
        columns=s.columns,
    )


# Color the features matrix via highlight_color (axis=None hands it the whole
# frame), then outline each cell in black. The last expression renders it.
temp = X.style.apply(highlight_color, axis=None)
temp.set_properties(**{'border-color': 'black', 'border': '1px solid black'})

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors
0,3,1.0,1180,5650,1.0
1,3,2.25,2570,7242,2.0
2,2,1.0,770,10000,1.0
3,4,3.0,1960,5000,1.0
4,3,2.0,1680,8080,1.0
5,4,4.5,5420,101930,1.0
6,3,2.25,1715,6819,2.0
7,3,1.5,1060,9711,1.0
8,3,1.0,1780,7470,1.0
9,3,2.5,1890,6560,2.0


In [9]:
# Needed for y
def highlight_color(s):
    '''
    Build the CSS table that colors the target column blanched almond.

    Meant to be passed to ``Styler.apply(..., axis=None)``, which expects a
    DataFrame of CSS strings with the same index and columns as its input.

    Parameters
    ----------
    s : pandas.DataFrame
        The frame being styled. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``s``. The 'price' column holds
        'background-color: #FFEBCD'; any other column holds an empty string
        (no styling).
    '''
    target_css = 'background-color: #FFEBCD'

    # Build a fresh string frame instead of overwriting a copy of the numeric
    # data, which would rely on pandas upcasting float columns to object.
    return pd.DataFrame(
        {col: (target_css if col == 'price' else '') for col in s.columns},
        index=s.index,
        columns=s.columns,
    )


# Color the target column via highlight_color (axis=None hands it the whole
# frame), then outline each cell in black. The last expression renders it.
temp = y.style.apply(highlight_color, axis=None)
temp.set_properties(**{'border-color': 'black', 'border': '1px solid black'})

Unnamed: 0,price
0,221900.0
1,538000.0
2,180000.0
3,604000.0
4,510000.0
5,1225000.0
6,257500.0
7,291850.0
8,229500.0
9,323000.0


## Splitting Data into Training and Test Sets


In [21]:
# Put 75% of the rows in the training set and the rest in the test set.
# random_state pins the shuffle so the same rows land in each set on re-runs.
# NOTE(review): the previous comment here suggests random_state=0 was also
# tried; 2 is the value currently in use.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=2,
)

## Train Test Split Visualization

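A relatively new feature of pandas is conditional formatting of DataFrames through its Styler API: https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html. The cells below use it to color-code which rows ended up in the training set and which in the test set.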

In [22]:
# Re-wrap both feature splits as DataFrames restricted to the feature columns.
# The tuple (X_train, X_test) is built before either name is rebound, and the
# loop variable of a generator expression does not leak into the notebook.
X_train, X_test = (
    pd.DataFrame(part, columns=['bedrooms','bathrooms','sqft_living','sqft_lot','floors'])
    for part in (X_train, X_test)
)

In [23]:
# Tag every row with the split it belongs to; the label is what later drives
# the per-row coloring.
X_train = X_train.assign(split='train')
X_test = X_test.assign(split='test')

In [24]:
# Inspect the training features now that they carry the 'split' label.
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,split
0,3,1.0,1180,5650,1.0,train
7,3,1.5,1060,9711,1.0,train
2,2,1.0,770,10000,1.0,train
3,4,3.0,1960,5000,1.0,train
6,3,2.25,1715,6819,2.0,train
9,3,2.5,1890,6560,2.0,train
8,3,1.0,1780,7470,1.0,train


In [25]:
# Put the target back next to its features. Values are matched on the row
# index, so each price lands on the row it was split off from.
X_train = X_train.assign(price=y_train['price'])
X_test = X_test.assign(price=y_test['price'])

In [26]:
# Stack the training rows on top of the test rows. pd.concat's defaults
# (row-wise, original index labels kept) are exactly what is wanted here.
fullDF = pd.concat([X_train, X_test])

In [27]:
# First ten rows of the recombined frame (training rows come first).
fullDF.iloc[:10]

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,split,price
0,3,1.0,1180,5650,1.0,train,221900.0
7,3,1.5,1060,9711,1.0,train,291850.0
2,2,1.0,770,10000,1.0,train,180000.0
3,4,3.0,1960,5000,1.0,train,604000.0
6,3,2.25,1715,6819,2.0,train,257500.0
9,3,2.5,1890,6560,2.0,train,323000.0
8,3,1.0,1780,7470,1.0,train,229500.0
4,3,2.0,1680,8080,1.0,test,510000.0
1,3,2.25,2570,7242,2.0,test,538000.0
5,4,4.5,5420,101930,1.0,test,1225000.0


In [28]:
# Number of rows after recombining the two splits.
len(fullDF)

10

In [29]:
# Number of distinct index labels; matching the row count above means no row
# appears in both splits.
fullDF.index.nunique()

10

In [30]:
# Keep a copy that still carries the 'split' labels (used for coloring), then
# drop the label column from the frame that gets displayed.
# The guard makes this cell safe to re-run: without it a second run would
# overwrite fullDFsplit with a frame that no longer has 'split' and then fail
# on the drop.
if 'split' in fullDF.columns:
    fullDFsplit = fullDF.copy()
    fullDF = fullDF.drop(columns=['split'])

In [32]:
def highlight_color(s, fullDFsplit):
    '''
    Build a table of CSS strings that colors each row by its train/test split.

    Meant to be passed to ``Styler.apply(..., axis=None)``, which expects a
    DataFrame of CSS strings with the same index and columns as its input.

    Parameters
    ----------
    s : pandas.DataFrame
        The frame being styled. It is not modified.
    fullDFsplit : pandas.DataFrame
        Frame with a 'split' column holding 'train' or 'test'. Rows are
        matched to ``s`` by index label, so their order does not matter.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``s``. Feature cells are turquoise
        (#40E0D0) for training rows and cyan (#00FFFF) for test rows; 'price'
        cells are gold (#FFD700) for training rows and yellow (#FFFF00) for
        test rows. Anything else gets an empty string (no styling).
    '''
    feature_columns = {'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors'}
    feature_css = {'train': 'background-color: #40E0D0',
                   'test': 'background-color: #00FFFF'}
    target_css = {'train': 'background-color: #FFD700',
                  'test': 'background-color: #FFFF00'}

    # Align the labels to the styled frame explicitly instead of relying on
    # boolean-mask alignment inside .loc.
    split = fullDFsplit['split'].reindex(s.index)
    feature_styles = split.map(lambda label: feature_css.get(label, ''))
    target_styles = split.map(lambda label: target_css.get(label, ''))

    def column_styles(col):
        # Per-row CSS for one column; unknown columns stay unstyled.
        if col in feature_columns:
            return feature_styles
        if col == 'price':
            return target_styles
        return ''

    # Build a fresh string frame instead of writing CSS text into a copy of
    # the numeric data, which would rely on pandas upcasting those columns.
    return pd.DataFrame(
        {col: column_styles(col) for col in s.columns},
        index=s.index,
        columns=s.columns,
    )


# Put the rows back in their original order, then color each one by the split
# it landed in. Styler.apply forwards extra keyword arguments to the function,
# so the one-column frame of split labels is passed that way.
temp = (
    fullDF
    .sort_index()
    .loc[0:9, :]
    .style
    .apply(highlight_color, fullDFsplit=fullDFsplit[['split']], axis=None)
)
temp.set_properties(**{'border-color': 'black', 'border': '1px solid black'})

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,price
0,3,1.0,1180,5650,1.0,221900.0
1,3,2.25,2570,7242,2.0,538000.0
2,2,1.0,770,10000,1.0,180000.0
3,4,3.0,1960,5000,1.0,604000.0
4,3,2.0,1680,8080,1.0,510000.0
5,4,4.5,5420,101930,1.0,1225000.0
6,3,2.25,1715,6819,2.0,257500.0
7,3,1.5,1060,9711,1.0,291850.0
8,3,1.0,1780,7470,1.0,229500.0
9,3,2.5,1890,6560,2.0,323000.0


In [33]:
# Legend for the split: one row per object returned by train_test_split,
# held in a single column labelled 0.
temp = pd.DataFrame(['X_train', 'X_test', 'y_train', 'y_test'])
temp

Unnamed: 0,0
0,X_train
1,X_test
2,y_train
3,y_test


In [34]:
def highlight_mini(s):
    '''
    Build the CSS table for the four-row train/test legend.

    Takes a frame with a column labelled 0 and rows labelled 0-3 and returns
    a copy in which each of those cells is replaced by a background-color
    rule: row 0 (train features), row 1 (test features), row 2 (train
    target), row 3 (test target).
    '''
    css_by_row = {
        0: 'background-color: #e5a3ad',  # train features
        1: 'background-color: #ffcbd3',  # test features
        2: 'background-color: #e5d3b8',  # train target
        3: 'background-color: #fff1dc',  # test target
    }

    colorDF = s.copy()
    for row_label, css in css_by_row.items():
        colorDF.loc[row_label, [0]] = css
    return colorDF
# Style the legend: hide the row index, color each label, outline the cells.
# (A stray `df.style.hide_index()` used to sit here; its result was discarded,
# so it had no effect on the output.)
temp2 = temp.sort_index().style

# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
# favour of Styler.hide(axis='index'); fall back only on older pandas.
if hasattr(temp2, 'hide'):
    temp2 = temp2.hide(axis='index')
else:
    temp2 = temp2.hide_index()

# Registering highlight_mini once is enough; a second identical apply would
# only recompute the same styles.
temp2 = temp2.apply(highlight_mini, axis=None)
temp2.set_properties(**{'border-color': 'black',
                        'border': '1px solid black',
                        })
temp2

0
X_train
X_test
y_train
y_test


After that I was lazy and used powerpoint to combine the train and test