In [6]:
# %pip install pandas
# %pip install qgrid
#%pip install scikit-learn
#%pip install tensorflow

Analyse spinney green house prices

In [7]:
def preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    houses_data = content.split('\n\n')
    
    # Clean up data
    houses_data = [house.replace('\n', ' ').replace('Â£', '£') for house in houses_data]
    
    return houses_data

import re
import pandas as pd


def parse_house_data(data):
    house_list = []
    for house in data:
        house_data = {}
        
        # Address
        address_parts = re.match(r'^(.*), (.*), (.*), (\w{2,4} \d{1,2}\w{0,2})', house)
        if address_parts:
            house_data['street'] = address_parts.group(1)
            house_data['area'] = address_parts.group(2)
            house_data['town'] = address_parts.group(3)
            house_data['postCode'] = address_parts.group(4)

        # Baths, beds, receptions
        for pattern, key in [(r'(\d+) baths', 'bathrooms'), (r'(\d+) beds', 'bedrooms'), (r'(\d+) receptions', 'receptions')]:
            match = re.search(pattern, house)
            house_data[key] = int(match.group(1)) if match else None

        # Lease, type
        match = re.search(r'(FREEHOLD|LEASEHOLD), (\w+ \w+|\w+)', house)
        if match:
            house_data['lease'] = match.group(1)
            house_data['type'] = match.group(2)

        # Last sold
        match = re.search(r'Last sold: (\w{3} \d{4}), £([\d,]+)', house)
        if match:
            house_data['lastSoldDate'] = match.group(1)
            house_data['lastSoldPrice'] = int(match.group(2).replace(',', ''))

        # Estimated price
        match = re.search(r'Estimated price: £([\d,]+) - £([\d,]+)', house)
        if match:
            house_data['estimatedPriceRange'] = [int(match.group(1).replace(',', '')), int(match.group(2).replace(',', ''))]
    
        house_list.append(house_data)

    return pd.DataFrame(house_list)


In [8]:
# Usage
file_path = 'spinney.txt'
data = preprocess_data(file_path)
print(data)
df = parse_house_data(data)
df.head(50)
# print(df)



# Save DataFrame to CSV
# df.to_csv('house_data.csv', index=False)

['25 Spinney Green, Eccleston, St. Helens, WA10 5AH 2 baths, 4 beds, 2 receptions FREEHOLD, DETACHED BUNGALOW Last sold: Oct 2020, £332,000 Estimated price: £403,000 - £446,000', '20 Spinney Green, Eccleston, St. Helens, WA10 5AH 2 baths, 4 beds, 2 receptions LEASEHOLD, DETACHED HOUSE Last sold: Oct 2020, £397,500 Estimated price: £471,000 - £520,000', '34 Spinney Green, Eccleston, St. Helens, WA10 5AH 2 baths, 4 beds, 3 receptions LEASEHOLD, DETACHED HOUSE Last sold: Sep 2019, £355,000 Estimated price: £435,000 - £481,000', '3 Spinney Green, Eccleston, St. Helens, WA10 5AH FREEHOLD, DETACHED BUNGALOW Last sold: May 2018, £245,000 Estimated price: £332,000 - £367,000', '15 Spinney Green, Eccleston, St. Helens, WA10 5AH 1 bath, 3 beds LEASEHOLD, DETACHED BUNGALOW Last sold: Apr 2018, £285,000 Estimated price: £334,000 - £408,000', '6 Spinney Green, Eccleston, St. Helens, WA10 5AH LEASEHOLD, DETACHED BUNGALOW Last sold: Jul 2017, £230,000 Estimated price: £301,000 - £368,000', '1 Spinney

Unnamed: 0,street,area,town,postCode,bathrooms,bedrooms,receptions,lease,type,lastSoldDate,lastSoldPrice,estimatedPriceRange
0,25 Spinney Green,Eccleston,St. Helens,WA10 5AH,2.0,4.0,2.0,FREEHOLD,DETACHED BUNGALOW,Oct 2020,332000.0,"[403000, 446000]"
1,20 Spinney Green,Eccleston,St. Helens,WA10 5AH,2.0,4.0,2.0,LEASEHOLD,DETACHED HOUSE,Oct 2020,397500.0,"[471000, 520000]"
2,34 Spinney Green,Eccleston,St. Helens,WA10 5AH,2.0,4.0,3.0,LEASEHOLD,DETACHED HOUSE,Sep 2019,355000.0,"[435000, 481000]"
3,3 Spinney Green,Eccleston,St. Helens,WA10 5AH,,,,FREEHOLD,DETACHED BUNGALOW,May 2018,245000.0,"[332000, 367000]"
4,15 Spinney Green,Eccleston,St. Helens,WA10 5AH,,3.0,,LEASEHOLD,DETACHED BUNGALOW,Apr 2018,285000.0,"[334000, 408000]"
5,6 Spinney Green,Eccleston,St. Helens,WA10 5AH,,,,LEASEHOLD,DETACHED BUNGALOW,Jul 2017,230000.0,"[301000, 368000]"
6,1 Spinney Green,Eccleston,St. Helens,WA10 5AH,,,,LEASEHOLD,DETACHED BUNGALOW,May 2017,240000.0,"[320000, 353000]"
7,7 Spinney Green,Eccleston,St. Helens,WA10 5AH,,4.0,2.0,FREEHOLD,DETACHED HOUSE,Apr 2016,308000.0,"[395000, 436000]"
8,19 Spinney Green,Eccleston,St. Helens,WA10 5AH,,3.0,,FREEHOLD,DETACHED BUNGALOW,Mar 2016,255000.0,"[309000, 378000]"
9,5 Spinney Green,Eccleston,St. Helens,WA10 5AH,,3.0,,LEASEHOLD,DETACHED BUNGALOW,Jun 2013,175000.0,"[284000, 347000]"


In [None]:
df.head(10)

In [None]:
import qgrid

# Display the DataFrame using qgrid
qgrid_widget = qgrid.show_grid(df)
qgrid_widget

In [None]:
import pandas as pd

# Replace 'your_file_path.csv' with the path to your CSV file
csv_file_path = 'house_data.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Display the first 5 rows of the DataFrame
print(df.head())

In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load and preprocess data
data = pd.read_csv('house_data.csv')

print(data.head(10))

# Separate features and target variable
X = data.drop('Avg', axis=1)
y = data['Avg']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess categorical and numerical features
#categorical_features = ['street', 'area', 'town', 'postCode', 'lease', 'type']
categorical_features = ['type','location']
numerical_features = ['houseNumber','bathrooms', 'bedrooms', 'receptions']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numerical_features),
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)])

# Preprocess the training and testing data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Normalize the target variable
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

# Build the neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_preprocessed.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
history = model.fit(X_train_preprocessed, y_train_scaled, epochs=200, validation_split=0.1, verbose=1)

# Evaluate the model
loss, mae = model.evaluate(X_test_preprocessed, y_test_scaled)
print(f'Mean Absolute Error: {mae}')


   houseNumber         street       area        town  postCode  bathrooms  \
0           50  Spinney Green  Eccleston  St. Helens  WA10 5AH          2   
1           17  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
2           12  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
3            4  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
4           13  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
5           44  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
6           23  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
7            5  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
8           19  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   
9            7  Spinney Green  Eccleston  St. Helens  WA10 5AH          1   

   bedrooms  receptions  location      lease               type  yearSold  \
0         4           2         1   FREEHOLD     DETACHED HOUSE    1995.0  

In [15]:
# Assuming you have new_data as a DataFrame with the same columns as the original data
new_data = pd.read_csv('predictSet.csv')
print(new_data.head())
# new_data = pd.DataFrame({...})

# Preprocess the new data using the preprocessor
new_data_preprocessed = preprocessor.transform(new_data)

# Make predictions using the model
predictions_scaled = model.predict(new_data_preprocessed)

# Inverse-transform the predictions to the original scale
predictions = scaler.inverse_transform(predictions_scaled)

# Now, predictions will contain the estimated prices for the new data
print (predictions)

   houseNumber         street       area        town  postCode  bathrooms  \
0           50  Spinney Green  Eccleston  St. Helens  WA10 5AH          2   

   bedrooms  receptions  location     lease            type  yearSold  \
0         4           2         1  FREEHOLD  DETACHED HOUSE      1995   

   MonthSold  lastSoldPrice  lowEstimate  highEstimate  Avg  
0         11          98999       430000        525000    0  
[[483876.94]]
