In [1]:
import time
from datetime import date
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

#import seaborn as sns
import mpl_toolkits
%matplotlib inline
from sklearn.model_selection import train_test_split
from pandas.tools.plotting import lag_plot
from sklearn.linear_model import LinearRegression, LassoLars, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score   # accuracy metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestRegressor

In [2]:
#extract the data from hm land registry using the selection criteria

col_names = ['tui', 'price', 'txn_date', 'post_code', 'type', 'is_new','DURATION(F/L)','PAON', 'SAON', 'STREET',
            'LOCALITY', 'town', 'district', 'county', 'cat', 'record_status']

#read all data files into a single dataframe
path = r'C:\Users\rajgu\OneDrive\Documents\step_function\house-price-prediction\data'
path_outcode = r'C:\Users\rajgu\OneDrive\Documents\step_function\house-price-prediction\outcodes.csv'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
t1 = time.time()
for file_ in allFiles:
    df = pd.read_csv(file_,names = col_names)
    list_.append(df)
ppd_df = pd.concat(list_)
print(time.time() - t1)

filtered_df = ppd_df[(ppd_df.county == 'GREATER LONDON') & (ppd_df.type == 'T') & (ppd_df.cat == 'A') & (ppd_df.price < 5000000)
                    & (ppd_df.price > 0)]

#get the outcode df
df_outcodes = pd.read_csv(path_outcode)
#ppd_df.head()

96.9557671546936


In [3]:
#utilities

def postcode_to_outcode(post_code):
    try:
        return post_code.split(' ')[0]
    except AttributeError:
        print(post_code)

#def distance_in_km(lat, lon, reflat = , reflat = 51.507602, reflon = -0.127816):
def outcode_to_distance(row, ref_lat=51.507602, ref_lon=-0.127816 ):
    lat = row['latitude']
    lon = row['longitude']
    return haversine(lon, lat, ref_lon, ref_lat)
    
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

def type_to_numerical(my_type):
    value = 0
    if(my_type == 'D'):
        value = 1
    elif(my_type == 'S'):
        value = 2
    elif(my_type == 'T'):
        value = 3
    elif (my_type == 'F'):
        value = 4
    return value
        
        
def price_in_000(price):
    return price/1000.0

def mth_yr(date):
    split = date.split('-')
    return (split[0] + split[1])

def mth_yr1(my_date):
    split = my_date.split('-')
    new_date = date(int(split[0]), int(split[1]), 1)
    return new_date

def date_diff_days(earlier_date, later_date):
    date_diff = abs(later_date - earlier_date)
    return date_diff.days

def date_diff_days1(my_date):
    ref_date = date(2003, 12, 1)
    date_diff = abs(my_date - ref_date)
    return date_diff.days

def log_series(series):
    return np.log(series)

def pct_chg(series):
    return tuple(pd.Series.pct_change(series).values.tolist())

def is_outlier(points, thresh=3.5):
    """
    Returns a boolean array with True if points are outliers and False 
    otherwise.

    Parameters:
    -----------
        points : An numobservations by numdimensions array of observations
        thresh : The modified z-score to use as a threshold. Observations with
            a modified z-score (based on the median absolute deviation) greater
            than this value will be classified as outliers.

    Returns:
    --------
        mask : A numobservations-length boolean array.

    References:
    ----------
        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
        Handle Outliers", The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. 
    """
    
    points = np.array(points)
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

In [4]:
#engineer the dataset to meet our requirements

filtered_df['out_code'] = filtered_df['post_code'].apply(postcode_to_outcode)
filtered_df['price_scaled'] = filtered_df['price'].apply(price_in_000)
filtered_df['txn_mth'] = filtered_df['txn_date'].apply(mth_yr1)
filtered_df['txn_days'] = filtered_df['txn_mth'].apply(date_diff_days1)
filtered_df['type_num'] = filtered_df['type'].apply(type_to_numerical)

df_outcodes = df_outcodes.rename(columns = {'postcode':'out_code'})
merged_df = pd.merge(filtered_df, df_outcodes, on='out_code')

merged_df['distance'] = merged_df.apply(outcode_to_distance, axis=1)
merged_df = merged_df[merged_df.distance < 100]

#dummy df for new builds
dummy_df = pd.get_dummies(merged_df['is_new'])
merged_df = pd.concat([merged_df, dummy_df], axis=1)
merged_df = merged_df.rename(columns = {'Y':'d_newbuild', 'N':'d_old'})

trunc_df = merged_df[['txn_days', 'distance', 'd_newbuild', 'd_old','price_scaled']]
sorted_df = trunc_df.sort_values('txn_days') 


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

In [5]:
#validation and overall accuracy comparison

X = np.array(sorted_df[['txn_days','distance']])
y = np.array(sorted_df['price_scaled'].apply(log_series))

#define the training, validation and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

print('Model training and validation')
print(X_train.shape, X_valid.shape)

#test the linear regression model
print("Linear Regression Model")
model = LinearRegression()
model.fit(X_train, y_train)
print(model.coef_, model.intercept_)
print(model.score(X_valid, y_valid))

#test the Ridge regression model
print("Ridge Regression")
model_ridge = Ridge(alpha = 0.5)
model_ridge.fit(X_train, y_train)
print(model_ridge.coef_, model_ridge.intercept_)
print(model_ridge.score(X_valid, y_valid))

#decision trees
print("Decision Tree Regression")
model_dt = DecisionTreeRegressor(max_depth=4)
model_dt.fit(X_train, y_train)
print(model_dt.score(X_valid, y_valid))

#random forest
print("Random Forest Regression")
regr = RandomForestRegressor(n_estimators=150, min_samples_split=4, min_samples_leaf=10)
regr.fit(X_train, y_train)
print(regr.feature_importances_)
print(regr.score(X_valid, y_valid))

Model training and validation
(279844, 2) (69961, 2)
Linear Regression Model
[ 0.00015657 -0.06480541] 6.33418504399
0.455937619182
Ridge Regression
[ 0.00015657 -0.0648054 ] 6.33418498725
0.45593761942
Decision Tree Regression
0.535295119302
Random Forest Regression
[ 0.2404924  0.7595076]
0.738208795518


In [6]:
print('Model testing')

print(X_test.shape)

print("Linear Regression Model")
print(model.score(X_test, y_test))

#test the Ridge regression model
print("Ridge Regression")
print(model_ridge.score(X_test, y_test))

#decision trees
print("Decision Tree Regression")
print(model_dt.score(X_test, y_test))

#random forest
print("Random Forest Regression")
print(regr.score(X_test, y_test))

Model testing
(87452, 2)
Linear Regression Model
0.460358359104
Ridge Regression
0.460358359196
Decision Tree Regression
0.543745696964
Random Forest Regression
0.744956534008
