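# Dissertation

Summary tables of each model's `Accuracy 10% Pred` score per region, built from the tuned and not-tuned results workbooks and rendered as colour-coded HTML tables.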

In [62]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt
# Ignore all warnings from here on (the filter is not reset later in this notebook, so deprecation warnings are hidden too)
warnings.filterwarnings("ignore")
import pandas as pd

def reshape_region_data(df, region, tuned, trained_on,
                        models=("DT", "LR", "XGBoost", "LightGBM", "RNN"),
                        metric="Accuracy 10% Pred"):
    """Collapse one region's per-model results into a single wide summary row.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table containing at least the columns 'Trained on', 'Model'
        and the column named by `metric`.
    region : str
        Value stored in the 'Region' column of the output row.
    tuned : bool
        Stored as "Yes" (truthy) or "No" (falsy) in the 'Tuned' column.
    trained_on : str
        Only rows whose 'Trained on' value equals this are considered.
    models : iterable of str, optional
        Model names to extract, one output column each (in this order).
        Defaults to the five models used throughout the notebook.
    metric : str, optional
        Column whose value is reported for each model.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns 'Region', one column per model,
        'Trained on' and 'Tuned'. A model with no matching row gets None;
        when several rows match, the first one is used.
    """
    # Filter once by the training source; every model lookup reuses this subset.
    subset = df[df['Trained on'] == trained_on]

    row = {"Region": region}
    for model in models:
        scores = subset.loc[subset['Model'] == model, metric]
        row[model] = scores.iloc[0] if not scores.empty else None
    row["Trained on"] = trained_on
    row["Tuned"] = "Yes" if tuned else "No"

    return pd.DataFrame([row])

def create_summary_tables(tuned=False):
    """Build the per-region summary sheets and write them to the summary workbook.

    Reads one sheet per region from ``../results/results_tuned.xlsx`` (when
    `tuned` is true) or ``../results/results_not_tuned.xlsx`` and produces two
    tables: scores of models trained on 'UK' for every region, and scores of
    models trained on the region itself (every region except 'UK').

    Both tables are written to ``../results/summary_results.xlsx``. The
    not-tuned call opens the workbook in write mode, so it starts a fresh file;
    the tuned call appends to the existing workbook.

    Parameters
    ----------
    tuned : bool, optional
        Selects the input workbook, the sheet-name suffix and the write mode.
    """
    # Paths
    tuned_path = "tuned" if tuned else "not_tuned"
    data_path = f"../results/results_{tuned_path}.xlsx"

    # Regions to consider
    regions = ["UK", "LONDON", "ESHER", "WOODSTOCK", "SOUTH SHIELDS"]

    # Open the workbook once; a list of sheet names yields a dict of frames.
    sheets = pd.read_excel(data_path, sheet_name=regions)

    # Collect the one-row frames and concatenate once, instead of growing a
    # DataFrame inside the loop starting from an empty frame.
    uk_rows = []
    regional_rows = []
    for region in regions:
        df = sheets[region]

        # Data trained on UK
        uk_rows.append(reshape_region_data(df, region, tuned, 'UK'))

        # Data trained on respective region
        if region != "UK":
            regional_rows.append(reshape_region_data(df, region, tuned, region))

    results_df_uk = pd.concat(uk_rows)
    results_df_regions = pd.concat(regional_rows)

    # Append mode is only supported by the openpyxl engine, so name it
    # explicitly rather than relying on the default writer. 'replace' makes a
    # re-run of the tuned step overwrite its sheets instead of raising
    # (if_sheet_exists needs pandas >= 1.3 and is only valid in append mode).
    writer_options = {"mode": "a", "if_sheet_exists": "replace"} if tuned else {"mode": "w"}
    label = 'Tuned' if tuned else 'Not Tuned'

    # Save to Excel
    with pd.ExcelWriter("../results/summary_results.xlsx", engine="openpyxl", **writer_options) as writer:
        results_df_uk.to_excel(writer, sheet_name=f"Trained on UK {label}", index=False)
        results_df_regions.to_excel(writer, sheet_name=f"Trained on Regions {label}", index=False)

# Run the functions
# Order matters: the not-tuned call opens summary_results.xlsx in write mode,
# and the tuned call then opens the same workbook in append mode.
create_summary_tables(tuned=False)
create_summary_tables(tuned=True)


In [63]:
import numpy as np

def combine_and_sort_data(tuned=False):
    """Combine the UK-trained and region-trained rows of every region into one sorted table.

    Reads one sheet per region from ``../results/results_tuned.xlsx`` (when
    `tuned` is true) or ``../results/results_not_tuned.xlsx``. Each region
    contributes its 'trained on UK' row and, for every region except 'UK',
    its 'trained on the region itself' row.

    Parameters
    ----------
    tuned : bool, optional
        Selects the input workbook and the value of the 'Tuned' column.

    Returns
    -------
    pandas.DataFrame
        The combined rows, sorted in descending order by the model column
        whose mean over all rows is highest.
    """
    # Paths
    tuned_path = "tuned" if tuned else "not_tuned"
    data_path = f"../results/results_{tuned_path}.xlsx"

    # Regions
    regions = ["UK", "LONDON", "ESHER", "WOODSTOCK", "SOUTH SHIELDS"]

    # Open the workbook once; a list of sheet names yields a dict of frames.
    sheets = pd.read_excel(data_path, sheet_name=regions)

    # Collect the one-row frames and concatenate once, instead of growing a
    # DataFrame inside the loop starting from an empty frame.
    rows = []
    for region in regions:
        df = sheets[region]

        # For each region, add both the models trained on UK and trained on the region itself
        rows.append(reshape_region_data(df, region, tuned, 'UK'))
        if region != "UK":
            rows.append(reshape_region_data(df, region, tuned, region))

    combined_results = pd.concat(rows)

    # Pick the model with the highest mean accuracy over all combined rows
    model_columns = ["DT", "LR", "XGBoost", "LightGBM", "RNN"]
    max_model = combined_results[model_columns].mean().idxmax()

    # Sort by that model's accuracy
    return combined_results.sort_values(by=max_model, ascending=False)

def save_combined_results():
    """Append the combined not-tuned and tuned tables to the summary workbook.

    Writes the sheets 'Combined Not Tuned' and 'Combined Tuned' into
    ``../results/summary_results.xlsx``. The workbook is opened in append
    mode, so it is expected to exist already.
    """
    combined_not_tuned = combine_and_sort_data(tuned=False)
    combined_tuned = combine_and_sort_data(tuned=True)

    # Append mode is only supported by the openpyxl engine, so name it
    # explicitly. 'replace' lets this cell be re-run: existing combined sheets
    # are overwritten instead of raising (if_sheet_exists needs pandas >= 1.3).
    with pd.ExcelWriter("../results/summary_results.xlsx", engine="openpyxl",
                        mode='a', if_sheet_exists="replace") as writer:
        combined_not_tuned.to_excel(writer, sheet_name="Combined Not Tuned", index=False)
        combined_tuned.to_excel(writer, sheet_name="Combined Tuned", index=False)

# Run the function
# Adds the two "Combined ..." sheets to summary_results.xlsx (opened in append mode).
save_combined_results()

In [117]:
def load_summary_data():
    """Read the two combined summary sheets back from the summary workbook.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(not_tuned, tuned)``, loaded from the 'Combined Not Tuned' and
        'Combined Tuned' sheets of ``../results/summary_results.xlsx``.
    """
    workbook = "../results/summary_results.xlsx"
    sheet_names = ("Combined Not Tuned", "Combined Tuned")

    # One read per sheet, in the order the caller unpacks them.
    not_tuned_frame, tuned_frame = (
        pd.read_excel(workbook, sheet_name=name) for name in sheet_names
    )
    return not_tuned_frame, tuned_frame

# Call the function
# Binds both combined tables as notebook-level names; note that a later cell
# rebinds df_not_tuned / df_tuned from a ModelAnalysis instance.
df_not_tuned, df_tuned = load_summary_data()

# Display the dataframes (for verification)
# print("Not Tuned Dataframe:")
# print(df_not_tuned)

# print("\nTuned Dataframe:")
# print(df_tuned)

In [73]:
# Show the not-tuned summary table.
# NOTE(review): the saved output below has an 'Unnamed: 0' column, which a sheet
# written with index=False should not contain — the output looks stale; re-run to confirm.
df_not_tuned

Unnamed: 0,Region,DT,LR,XGBoost,LightGBM,RNN,Trained on,Tuned
0,ESHER,35.75419,24.581006,20.111732,16.759777,1.796407,ESHER,No
1,ESHER,34.078212,4.469274,13.407821,17.877095,2.234637,UK,No
2,LONDON,29.423224,15.675224,17.111946,11.660753,15.159741,LONDON,No
3,SOUTH SHIELDS,28.253968,21.746032,25.873016,23.333333,4.045307,SOUTH SHIELDS,No
4,WOODSTOCK,28.070175,5.263158,8.77193,15.789474,7.017544,UK,No
5,UK,27.25914,13.79329,19.730304,21.434742,11.670945,UK,No
6,LONDON,26.758506,8.609894,15.292562,16.927573,2.974327,UK,No
7,WOODSTOCK,26.315789,7.017544,24.561404,19.298246,2.222222,WOODSTOCK,No
8,SOUTH SHIELDS,24.444444,5.238095,13.968254,18.095238,5.555556,UK,No


In [139]:
from matplotlib.colors import LinearSegmentedColormap
from IPython.core.display import display, HTML


class DataFrameStyler:
    """Render a model-accuracy summary frame as a colour-coded HTML table with a legend.

    The frame is expected to have 'Region' and 'Trained on' columns plus the
    numeric model columns passed to the display/save methods. The instance
    works on its own copy of the frame, so the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table to style.
    gradient_colors : list of str, optional
        Colours for the value gradient, from low to high.
    trained_on_colors : dict, optional
        Maps a 'Trained on' value to a background colour. The key 'Others' is
        used for values without their own entry, and both 'England' and
        'Others' are read when the legend is built.
    """

    # Header-cell widths that save_to_html applies when none are supplied.
    DEFAULT_COL_WIDTHS = {
        "Region": "10%",
        "DT": "12%",
        "LR": "12%",
        "XGBoost": "12%",
        "LightGBM": "12%",
        "RNN": "12%",
        "Trained on": "10%"
    }

    def __init__(self, df, gradient_colors=None, trained_on_colors=None):
        self.df = df.copy()
        self.gradient_colors = gradient_colors or ["#e06666", "#93c47d"]
        self.trained_on_colors = trained_on_colors or {
            'England': '#cfe2f3',
            'Others': '#fce5cd'   # Default color
        }

    def setup_dataframe(self):
        """Relabel 'UK' as 'England', title-case the label columns and drop 'Tuned'."""
        self.df = self.df.replace('UK', 'England')
        self.df['Region'] = self.df['Region'].str.title()
        self.df['Trained on'] = self.df['Trained on'].str.title()
        if 'Tuned' in self.df.columns:
            self.df = self.df.drop(columns='Tuned')

    def ensure_numeric_columns(self, cols):
        """Convert object-typed columns in `cols` to float, stripping any '%' signs.

        Only string entries are stripped. Values that are already numeric are
        kept as they are, rather than being turned into NaN by a string-only
        accessor on a mixed column.
        """
        for col in cols:
            column = self.df[col]
            if column.dtype == 'object':
                stripped = column.map(
                    lambda value: value.replace('%', '') if isinstance(value, str) else value
                )
                self.df[col] = stripped.astype(float)

    def apply_styling(self, num_cols):
        """Return the styled table as an HTML string.

        Applies a background gradient and a two-decimal percentage format to
        `num_cols`, colours the 'Region' and 'Trained on' cells by training
        source, centres all cells and hides the index.
        """
        palette = self.trained_on_colors

        def highlight_trained_on(row):
            # Rows whose training source has no colour of its own use 'Others'.
            key = row['Trained on'] if row['Trained on'] in palette else 'Others'
            css = f'background-color: {palette[key]}'
            return [css if col in ('Region', 'Trained on') else '' for col in row.index]

        align_center = {
            "selector": "th, td",
            "props": [("text-align", "center")]
        }

        cmap = LinearSegmentedColormap.from_list("custom", self.gradient_colors)

        styled = (self.df.style.background_gradient(cmap=cmap, subset=num_cols)
                  .apply(highlight_trained_on, axis=1)
                  .set_table_styles([align_center])
                  .format("{:.2f}%", subset=num_cols))

        # hide_index() and render() were deprecated and later removed from
        # pandas; prefer their replacements and fall back on older versions.
        if hasattr(styled, "hide"):
            styled = styled.hide(axis="index")
        else:
            styled = styled.hide_index()

        return styled.to_html() if hasattr(styled, "to_html") else styled.render()

    def display_styled_df(self, numerical_columns, display_output=True):
        """Build the styled table plus legend, then display it or return its HTML.

        Parameters
        ----------
        numerical_columns : list of str
            Columns to coerce to float, shade and format as percentages.
        display_output : bool, optional
            When true, the HTML is displayed and None is returned; otherwise
            the combined HTML string is returned.
        """
        self.setup_dataframe()
        self.ensure_numeric_columns(numerical_columns)
        
        styled_html = self.apply_styling(numerical_columns)
        
        legend_html = """
        <table style="border:0px; margin-left:20px;">
            <tr><td style="background-color: {0}; width: 30px;"></td><td style="border:0px;">Trained on England</td></tr>
            <tr><td style="background-color: {1}; width: 30px;"></td><td style="border:0px;">Trained on Others</td></tr>
            <tr><td colspan="2" style="border:0px;"><br></td></tr>
            <tr><td style="background-color: {2}; width: 30px;"></td><td style="border:0px;">Lower Value</td></tr>
            <tr><td style="background-color: {3}; width: 30px;"></td><td style="border:0px;">Higher Value</td></tr>
        </table>
        """.format(self.trained_on_colors['England'], self.trained_on_colors['Others'], self.gradient_colors[0], self.gradient_colors[1])
        
        # 'flex-direction' is the flexbox property; 'direction:row' is not valid CSS.
        combined_html = f'<div style="display:flex; flex-direction:row;">{styled_html}{legend_html}</div>'
        
        if display_output:
            display(HTML(combined_html))
        else:
            return combined_html

    def save_to_html(self, filename, numerical_columns, col_widths=None):
        """Write the styled table and legend to `filename` as UTF-8 HTML.

        Parameters
        ----------
        filename : str
            Output path.
        numerical_columns : list of str
            Passed through to `display_styled_df`.
        col_widths : dict, optional
            Maps a column label to a CSS width for its header cell. Defaults
            to `DEFAULT_COL_WIDTHS`.
        """
        import re  # local import: only this method needs it

        combined_html = self.display_styled_df(numerical_columns, display_output=False)

        widths = self.DEFAULT_COL_WIDTHS if col_widths is None else col_widths

        for col, width in widths.items():
            # Match the header cell of this column whatever its position
            # (col0, col1, ...) and whatever other attributes the tag carries.
            header_cell = re.compile(
                r'<th(?P<attrs>[^>]*\bclass="col_heading level0 col\d+"[^>]*)>'
                r'(?P<label>\s*' + re.escape(col) + r'\s*)</th>'
            )
            combined_html = header_cell.sub(
                lambda m, w=width: f'<th style="width:{w}"{m.group("attrs")}>{m.group("label")}</th>',
                combined_html,
            )

        with open(filename, 'w', encoding="utf-8") as f:
            f.write(combined_html)

In [10]:
from housedatautils import ModelAnalysis, RegionAccuracyPlotter

In [8]:
# Reload the summary frames through the housedatautils helper. This rebinds the
# df_not_tuned / df_tuned names assigned earlier in the notebook.
# NOTE(review): presumably load_summary_data() populates the two attributes read
# below — confirm against housedatautils. This cell's execution count (In [8])
# precedes that of the import cell (In [10]), so check it runs on a fresh kernel.
analysis = ModelAnalysis()
analysis.load_summary_data()
df_not_tuned, df_tuned = analysis.df_not_tuned, analysis.df_tuned

In [11]:
# Usage:
# Render the styled accuracy table for the not-tuned results, then for the
# tuned results. Each DataFrameStyler works on its own copy of the frame.
styler = DataFrameStyler(df_not_tuned)
styler.display_styled_df(['DT', 'LR', 'XGBoost', 'LightGBM', 'RNN'])

styler = DataFrameStyler(df_tuned)
styler.display_styled_df(['DT', 'LR', 'XGBoost', 'LightGBM', 'RNN'])

Region,DT,LR,XGBoost,LightGBM,RNN,Trained on
Esher,35.75%,24.58%,20.11%,16.76%,1.80%,Esher
Esher,34.08%,4.47%,13.41%,17.88%,2.23%,England
London,29.42%,15.68%,17.11%,11.66%,15.16%,London
South Shields,28.25%,21.75%,25.87%,23.33%,4.05%,South Shields
Woodstock,28.07%,5.26%,8.77%,15.79%,7.02%,England
England,27.26%,13.79%,19.73%,21.43%,11.67%,England
London,26.76%,8.61%,15.29%,16.93%,2.97%,England
Woodstock,26.32%,7.02%,24.56%,19.30%,2.22%,Woodstock
South Shields,24.44%,5.24%,13.97%,18.10%,5.56%,England

0,1
,Trained on England
,Trained on Others
,
,Lower Value
,Higher Value


Region,DT,LR,XGBoost,LightGBM,RNN,Trained on
London,29.78%,15.68%,18.09%,11.66%,15.11%,London
Esher,29.61%,24.58%,19.55%,16.76%,4.19%,Esher
South Shields,28.73%,21.75%,24.13%,23.33%,10.03%,South Shields
England,28.27%,13.79%,22.62%,21.43%,11.78%,England
London,28.16%,8.61%,17.02%,16.93%,2.80%,England
Esher,27.93%,4.47%,15.64%,17.88%,2.23%,England
Woodstock,26.32%,7.02%,31.58%,19.30%,8.89%,Woodstock
South Shields,26.19%,5.24%,13.81%,18.10%,5.71%,England
Woodstock,21.05%,5.26%,10.53%,15.79%,7.02%,England

0,1
,Trained on England
,Trained on Others
,
,Lower Value
,Higher Value
