In [2]:
import os

import numpy as np
import pandas as pd
from scipy.stats import iqr

from ppc_db import sql_extractdata

# Connection settings that every query cell forwards to sql_extractdata as
# keyword arguments (**db_args).
# Each value can be overridden through a PPC_DB_* environment variable so real
# credentials never have to be saved in the notebook; the fallbacks are the
# original local-development values, so an unconfigured machine behaves as before.
db_args = {
    'user': os.environ.get('PPC_DB_USER', 'root'),
    'password': os.environ.get('PPC_DB_PASSWORD', 'password'),
    'host': os.environ.get('PPC_DB_HOST', 'localhost'),
    # Environment values are strings; the port is kept as an int like the original.
    'port': int(os.environ.get('PPC_DB_PORT', 3306)),
    'database': os.environ.get('PPC_DB_NAME', 'CSV_DB 7'),
}

In [22]:
# Columns pulled for every 2023 sale; the same list labels the DataFrame columns.
cols = ['7_county', 'price_gbp']

# Build the column list before the f-string: reusing double quotes inside a
# double-quoted f-string only parses on Python 3.12+ (PEP 701). The resulting
# SQL text is unchanged.
select_list = ', '.join(str(x) for x in cols)
query = f"SELECT {select_list} FROM Oct24pricepaidcomplete WHERE YEAR(date)=2023"
# NOTE(review): assumes sql_extractdata returns rows in the same column order
# as the SELECT list - confirm against ppc_db.
result = pd.DataFrame(sql_extractdata(query, **db_args), columns=cols)

# County lookup table, read as strings and keyed by county. Later cells rely
# on the 'region' column this join adds.
county_to_region = pd.read_csv('county_to_region.csv', index_col='7_county', dtype=str)

# Left join on the county column: rows whose county is missing from the CSV
# keep NaN in the joined columns.
df = result.join(county_to_region, on='7_county')

In [23]:
import plotly.express as px
# Per-region histograms of price_gbp drawn on top of each other, with the
# x-axis clipped to the 0 - 1,000,000 range.
fig = px.histogram(
    data_frame=df,
    x="price_gbp",
    color='region',
    nbins=2000,
).update_layout(
    barmode='overlay',
    xaxis_range=[0, 1e6],
)
fig.show()

In [24]:
import plotly.graph_objects as go

# Prices grouped by region; both statistics below come from this one groupby.
df_by_region = df.groupby('region')['price_gbp']

# Aggregate once per statistic (previously each was recomputed for x and for y).
median_by_region = df_by_region.median()
iqr_by_region = df_by_region.agg(iqr)

# Grouped bars: median and inter-quartile range side by side for every region.
# The earlier throw-away px.bar figure was overwritten immediately and is gone.
fig = go.Figure(
    data=[
        go.Bar(
            x=median_by_region.index,
            y=median_by_region.values,
            name="Median",
            marker=dict(color="paleturquoise"),
        ),
        go.Bar(
            x=iqr_by_region.index,
            y=iqr_by_region.values,
            name="IQR",
            marker=dict(color="crimson"),
        ),
    ]
)

fig.update_layout(
    legend=dict(orientation="v"),
    # Both traces share this axis, so the title names the unit rather than one
    # of the two statistics. The former secondary "Skew" axis was removed
    # because no trace was ever assigned to it.
    yaxis=dict(
        title=dict(text="Price, GBP"),
        side="left",
    ),
    barmode='group'
)

fig.show()

In [37]:
# Transaction counts per (year, county), restricted to months 1-3 of each year.
cols = ['YEAR(date)', '7_county', 'COUNT(`tx_id`)']

# Column list is joined outside the f-string so the cell also parses on
# Python < 3.12 (nested same-type quotes need PEP 701). SQL text is unchanged.
select_list = ', '.join(str(x) for x in cols)
query = f"SELECT {select_list} FROM Oct24pricepaidcomplete WHERE MONTH(date) < 4 GROUP BY YEAR(date), 7_county"
result = pd.DataFrame(sql_extractdata(query, **db_args), columns=cols)

# Attach the region for each county from the lookup CSV.
county_to_region = pd.read_csv('county_to_region.csv', index_col='7_county', dtype=str)
df = result.join(county_to_region, on='7_county')

# Pick the count column *before* summing: summing the whole frame would also
# "sum" (concatenate) the county-name strings only to drop them afterwards.
# Result: one row per year, one column per region.
df_midyr = df.groupby(['YEAR(date)', 'region'])['COUNT(`tx_id`)'].sum().unstack()
# Keep years after 2007 and divide every region by its first retained row, so
# that row reads 1.0 and later years are relative to it.
df_midyr = df_midyr[df_midyr.index > 2007].apply(lambda x: x / x.iloc[0])

In [39]:
# Show the months-1-to-3 table: rows are years, columns are regions, and the
# values are counts relative to each region's first retained year (= 1.0).
df_midyr

region,East Midlands,East of England,London,North East,North West,South East,South West,Wales,West Midlands,Yorkshire and the Humber
YEAR(date),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2009,0.51857,0.538153,0.420263,0.468092,0.46018,0.550236,0.597429,0.546994,0.509623,0.496815
2010,0.704535,0.78575,0.781234,0.556348,0.601126,0.795565,0.810061,0.701097,0.659758,0.616974
2011,0.663092,0.754807,0.721982,0.57909,0.593463,0.74093,0.758077,0.706465,0.65322,0.591381
2012,0.798544,0.929501,0.87557,0.648789,0.670562,0.920289,0.969033,0.775406,0.794288,0.70649
2013,0.767627,0.865311,0.829883,0.623671,0.66328,0.852407,0.877641,0.773736,0.761658,0.700407
2014,1.170871,1.267474,1.191089,0.944558,1.019518,1.250832,1.306149,1.13812,1.121823,1.042061
2015,1.097319,1.174645,1.068686,0.920457,1.007325,1.161696,1.21526,1.116889,1.096965,0.985195
2016,1.612512,1.67109,1.541379,1.210342,1.453872,1.64237,1.821017,1.554389,1.581359,1.391175
2017,1.335386,1.322952,1.08806,1.173795,1.284093,1.27179,1.471157,1.421875,1.361769,1.27509


In [34]:
# Transaction counts per (year, county) over all months (no month filter).
cols = ['YEAR(date)', '7_county', 'COUNT(`tx_id`)']

# Column list is joined outside the f-string so the cell also parses on
# Python < 3.12 (nested same-type quotes need PEP 701). SQL text is unchanged.
select_list = ', '.join(str(x) for x in cols)
query = f"SELECT {select_list} FROM Oct24pricepaidcomplete GROUP BY YEAR(date), 7_county"
result = pd.DataFrame(sql_extractdata(query, **db_args), columns=cols)

# Attach the region for each county from the lookup CSV.
county_to_region = pd.read_csv('county_to_region.csv', index_col='7_county', dtype=str)
df = result.join(county_to_region, on='7_county')

# Pick the count column *before* summing: summing the whole frame would also
# "sum" (concatenate) the county-name strings only to drop them afterwards.
# Result: one row per year, one column per region.
df_fullyear = df.groupby(['YEAR(date)', 'region'])['COUNT(`tx_id`)'].sum().unstack()
# Keep years after 2007 and divide every region by its first retained row, so
# that row reads 1.0 and later years are relative to it.
df_fullyear = df_fullyear[df_fullyear.index > 2007].apply(lambda x: x / x.iloc[0])

In [35]:
# Keep only years before 2024.
# NOTE(review): presumably because the Oct24 extract covers 2024 only in part - confirm.
before_2024 = df_fullyear.index < 2024
df_fullyear = df_fullyear.loc[before_2024]

In [36]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One small line chart per region of the full-year series, laid out two per row.
# Titles and traces both come from df_fullyear so they cannot drift apart, and
# the grid size follows the number of regions instead of assuming exactly ten.
regions = list(df_fullyear.columns)
n_cols = 2
n_rows = max(1, -(-len(regions) // n_cols))  # ceiling division, at least one row

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=regions)

for position, region in enumerate(regions):
    # Fill the grid left-to-right, top-to-bottom; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(
        go.Scatter(x=df_fullyear[region].index, y=df_fullyear[region].values),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# 200 px per row reproduces the original 1000 px height for a 5-row grid.
fig.update_layout(height=200 * n_rows, width=800, showlegend=False,
                  title_text="Number of Transactions by Year, Full Year")

fig.show()

In [40]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One small line chart per region of the months-1-to-3 series, two per row.
# The grid size follows the number of regions instead of assuming exactly ten.
regions = list(df_midyr.columns)
n_cols = 2
n_rows = max(1, -(-len(regions) // n_cols))  # ceiling division, at least one row

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=regions)

for position, region in enumerate(regions):
    # Fill the grid left-to-right, top-to-bottom; plotly rows/cols are 1-based.
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(
        go.Scatter(x=df_midyr[region].index, y=df_midyr[region].values),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# 200 px per row reproduces the original 1000 px height for a 5-row grid.
fig.update_layout(height=200 * n_rows, width=800, showlegend=False,
                  title_text="Number of Transactions by Year, 1Q")

fig.show()

In [18]:
import datetime

# Transaction counts per (month, year, county) for months 1-3.
cols = ['MONTH(date)','YEAR(date)', '7_county', 'COUNT(`tx_id`)']

# Column list is joined outside the f-string so the cell also parses on
# Python < 3.12 (nested same-type quotes need PEP 701). SQL text is unchanged.
select_list = ', '.join(str(x) for x in cols)
query = f"SELECT {select_list} FROM Oct24pricepaidcomplete WHERE MONTH(date) < 4 GROUP BY MONTH(date),YEAR(date), 7_county"
result = pd.DataFrame(sql_extractdata(query, **db_args), columns=cols)

# Attach the region for each county from the lookup CSV.
county_to_region = pd.read_csv('county_to_region.csv', index_col='7_county', dtype=str)
df = result.join(county_to_region, on='7_county')

# Hand-picked comparison years. .copy() gives an independent frame so the
# column assignment below does not write into a slice of the joined table.
years_of_interest = [2007, 2008, 2009, 2010, 2014, 2023, 2024]
df = df[df['YEAR(date)'].isin(years_of_interest)].copy()

# First-of-month timestamp assembled column-wise from year/month (day fixed
# to 1) instead of constructing a datetime row by row.
df['date'] = pd.to_datetime(
    df[['YEAR(date)', 'MONTH(date)']]
    .rename(columns={'YEAR(date)': 'year', 'MONTH(date)': 'month'})
    .assign(day=1)
)

# Sum only the count column (not the county strings or the year/month numbers)
# and pivot regions into columns: rows are month-start dates.
df = df.groupby(['date', 'region'])['COUNT(`tx_id`)'].sum().unstack()


In [11]:
# London counts with month number on the x-axis and one coloured line per year.
# Needs `df` to be date-indexed with a 'London' column, since it reads
# .index.month and .index.year.
month_number = df.index.month
calendar_year = df.index.year
london_counts = df['London']

fig = px.line(x=month_number, y=london_counts, color=calendar_year)
fig.show()

In [20]:
# Price buckets: 14 inclusive [lo, hi] bands of 250,000 covering 0 - 3,499,999.
# Defined once so the DataFrame labels and the SQL CASE expressions cannot
# drift apart.
bucket_width = 250000
bucket_bounds = [(lo, lo + bucket_width - 1) for lo in range(0, 3500000, bucket_width)]

# DataFrame column labels, e.g. "0-249999".
range_list = [f"{lo}-{hi}" for lo, hi in bucket_bounds]
cols = ['Year', '7_county', *range_list]

# One conditional COUNT per bucket. The alias now closes its parenthesis
# (it used to read `bucket(0-249999`); the alias is cosmetic because the
# result columns are relabelled with `cols` below.
# NOTE(review): BETWEEN is inclusive on whole numbers - a fractional price such
# as 249999.50 would fall between buckets; confirm price_gbp is integral.
count_strings = [
    f"count(case when `price_gbp` between {lo} and {hi} then 1 end) `bucket({lo}-{hi})`"
    for lo, hi in bucket_bounds
]

sql_cols = ['YEAR(date)', '7_county'] + count_strings

# Column list is joined outside the f-string so the cell also parses on
# Python < 3.12 (nested same-type quotes need PEP 701).
select_list = ', '.join(str(x) for x in sql_cols)
query = f"SELECT {select_list} FROM Oct24pricepaidcomplete WHERE MONTH(date) < 4 GROUP BY YEAR(date), 7_county"
result = pd.DataFrame(sql_extractdata(query, **db_args), columns=cols)

# Attach the region for each county from the lookup CSV.
county_to_region = pd.read_csv('county_to_region.csv', index_col='7_county', dtype=str)
df = result.join(county_to_region, on='7_county')


In [13]:
# Remove the county column now that the region has been attached.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='7_county', errors='ignore')

In [14]:
# Per-year bucket counts for London and for the South East.
# Both regions are summed per year the same way, so a region that maps to
# several counties still yields exactly one row per year, sorted by year -
# the later "divide by the first row" normalisation depends on that.
# The county column is dropped here too (ignored if already gone) so its
# strings are never summed, whether or not the separate drop cell has run.
df_Lon = (
    df.loc[df.region == 'London']
    .drop(columns=['region', '7_county'], errors='ignore')
    .groupby('Year')
    .sum()
)
df_SE = (
    df.loc[df.region == 'South East']
    .drop(columns=['region', '7_county'], errors='ignore')
    .groupby('Year')
    .sum()
)

In [15]:
# London: keep years after 2018, divide each column by its first retained row,
# then transpose so the columns of df_Lon run along the x-axis and each year
# becomes its own line.
lon_recent = df_Lon.loc[df_Lon.index > 2018]
lon_indexed = lon_recent.apply(lambda bucket: bucket / bucket.iloc[0])

fig = px.line(lon_indexed.T)
fig.show()

In [16]:
# South East: keep years after 2018, divide each column by its first retained
# row, then transpose so the columns of df_SE run along the x-axis and each
# year becomes its own line.
se_recent = df_SE.loc[df_SE.index > 2018]
se_indexed = se_recent.apply(lambda bucket: bucket / bucket.iloc[0])

fig = px.line(se_indexed.T)
fig.show()