# Coursework Assignment: Building a Regression Model

```
University of London
BSc in Computer Science
CM3005, Data Science
Hudson Leonardo MENDES
hlm12@student.london.ac.uk
```


# I. Introduction


## Domain-specific area


## Dataset


## Objectives


# II. Implementation


## Preprocessing


In [None]:
import pathlib

data_folderpath = pathlib.Path("./data")

ppd_folderpath = data_folderpath / "uk-ppd"
inflation_filepath = data_folderpath / "uk-ons/ons-inflation-1989-2022.csv"
interest_filepath = data_folderpath / "uk-boe/boe-interest-1975-2022.csv"


In [None]:
import pandas as pd

pd.set_option("display.float_format", lambda x: "{:,.3f}".format(x))


In [None]:
# https://www.gov.uk/guidance/about-the-price-paid-data
ppd_property_type = {
    "D": "detached",
    "S": "semi-detached",
    "T": "terraced",
    "F": "flat/maisonettes",
    # "O": "other" # => intentionally ommitted
}

ppd_duration = {"F": "freehold", "L": "leasehold"}

ppd_old_or_new = {"Y": "new", "N": "old"}

ppd_df = pd.concat(
    [
        pd.read_csv(
            ppd_filepath,
            compression="zip",
            names=[
                "id",
                "price",
                "date",
                "postcode",
                "property_type",
                "old_or_new",
                "duration",
                "paon",
                "saon",
                "street",
                "locality",
                "town_city",
                "district",
                "county",
                "ppd_category_type",
                "record_status",
            ],
        )
        for ppd_filepath in ppd_folderpath.glob("*.zip")
    ]
)
ppd_df["postgroup"] = ppd_df["postcode"].map(lambda x: str(x).split(" ")[0])
ppd_df["date"] = pd.to_datetime(ppd_df["date"])
ppd_df["property_type"] = ppd_df["property_type"].map(ppd_property_type.get)
ppd_df["duration"] = ppd_df["duration"].map(ppd_duration.get)
ppd_df["old_or_new"] = ppd_df["old_or_new"].map(ppd_old_or_new.get)
ppd_df = ppd_df[
    [
        "date",
        "postgroup",
        "property_type",
        "old_or_new",
        "duration",
        "price",
    ]
]
ppd_df = ppd_df.astype(
    {
        "postgroup": "category",
        "property_type": "category",
        "old_or_new": "category",
        "duration": "category",
        "price": "double",
    }
)
ppd_df = ppd_df.dropna()
ppd_df.sample(n=5)


In [None]:
import re
import string
from datetime import date

inflation_date_pattern = re.compile(r"([\d]{4})(?:\s+([\w]{3}))?")
inflation_month_names = [
    "JAN",
    "FEB",
    "MAR",
    "APR",
    "MAY",
    "JUN",
    "JUL",
    "AUG",
    "SEP",
    "OCT",
    "NOV",
    "DEC",
]
inflation_month_index = {mn: ix + 1 for (ix, mn) in enumerate(inflation_month_names)}
inflation_month_index["Q1"] = 1
inflation_month_index["Q2"] = 4
inflation_month_index["Q3"] = 7
inflation_month_index["Q3"] = 10

inflation_acceptable_numeric_chars = string.digits + ".,"


def extract_inflation_date(x: str) -> date:
    match = next(inflation_date_pattern.finditer(x), None)
    if match:
        group_count = len(match.groups())
        if group_count >= 1:
            year = int(match.group(1))
            month = 1
            month_name = match.group(2)
            if group_count > 1 and month_name:
                month_name = month_name.strip().upper()
                month = inflation_month_index.get(month_name)
            return date(year, month, 1)


def extract_inflation_rate(x: str) -> float:
    x = str(x)
    if all([c in inflation_acceptable_numeric_chars for c in x]):
        return float(x)
    return None


inflation_df = pd.read_csv(inflation_filepath)
inflation_df["date"] = inflation_df["Title"].map(extract_inflation_date)
inflation_df["date"] = pd.to_datetime(inflation_df["date"])
inflation_df["rate"] = inflation_df["CPIH ANNUAL RATE 00: ALL ITEMS 2015=100"].map(
    extract_inflation_rate
)
inflation_df["rate"] = inflation_df["rate"].astype("float", errors="ignore")
inflation_df = inflation_df[["date", "rate"]]
inflation_df = inflation_df.dropna()
inflation_df = inflation_df.set_index("date").sort_index()
inflation_df.sample(n=5)


In [None]:
interest_df = pd.read_csv(interest_filepath)
interest_df["date"] = pd.to_datetime(interest_df["Date Changed"])
interest_df["rate"] = interest_df["Rate"].astype("float")
interest_df = interest_df[["date", "rate"]]
interest_df = interest_df.set_index("date").sort_index()
interest_df.sample(n=5)


In [None]:
from tqdm import tqdm, trange
from typing import Callable
from datetime import date, timedelta

tqdm.pandas()


def build_rate_extractor(df: pd.DataFrame) -> Callable[[date], float]:
    min_date = df.index.min()
    max_date = df.index.max()
    cur_date = min_date
    rate_index = {}
    first_rate = df.rate[0]
    prev_rate = first_rate
    last_rate = df.rate[-1]
    with trange((max_date - min_date).days, desc="rate_index") as pbar:
        while cur_date <= max_date:
            rates = df[df.index == cur_date].rate
            if rates.any():
                new_rate = rates[0] / 100.0
                rate_index[cur_date] = new_rate
                prev_rate = new_rate
            else:
                rate_index[cur_date] = prev_rate
            cur_date += timedelta(days=1)
            pbar.update()

    def get_rate_for_date(d: date) -> float:
        if d < min_date:
            return first_rate
        elif d > max_date:
            return last_rate
        else:
            return rate_index[d]

    return get_rate_for_date


df = ppd_df.copy()
df["inflation_rate"] = df.date.progress_map(build_rate_extractor(df=inflation_df))
df["interest_rate"] = df.date.progress_map(build_rate_extractor(df=interest_df))
df["date_year"] = df.date.progress_map(lambda d: d.year)
df["date_month"] = df.date.progress_map(lambda d: d.month)
df["date_day"] = df.date.progress_map(lambda d: d.day)
df["date_day_of_week"] = df.date.progress_map(lambda d: d.weekday())
df = df.sort_values(by="date").reset_index()
df = df[
    ["date_year", "date_month", "date_day", "date_day_of_week"]
    + list(ppd_df.columns[1:-1])
    + ["inflation_rate", "interest_rate", "price"]
]
df.sample(n=5)


In [None]:
df.to_csv(data_folderpath / "snapshot-Xy-1NF.zip", index=False)


## Statistical Summary


In [None]:
try:
    assert df is not None
except NameError:
    import pathlib
    import pandas as pd
    import numpy as np

    print("[SNAPSHOT] Reloading...")
    pd.set_option("display.float_format", lambda x: "{:,.3f}".format(x))
    data_folderpath = pathlib.Path("./data")
    df = pd.read_csv(data_folderpath / "snapshot-Xy-1NF.zip").astype(
        {
            "postgroup": "category",
            "property_type": "category",
            "old_or_new": "category",
            "duration": "category",
            "price": "double",
        }
    )
    print(f" - reloaded from snapshot, {df.shape[0]}")
df.sample(n=5)


In [None]:
df.info()


In [None]:
df.describe()


In [None]:
# TODO: Central Tendency


In [None]:
# TODO: Measures of Spread


In [None]:
# TODO: Type of distribution


## Data visualisation


In [None]:
try:
    assert df is not None
except NameError:
    import pathlib
    import pandas as pd

    print("[SNAPSHOT] Reloading...")
    pd.set_option("display.float_format", lambda x: "{:,.3f}".format(x))
    data_folderpath = pathlib.Path("./data")
    df = pd.read_csv(data_folderpath / "snapshot-Xy-1NF.zip").astype(
        {
            "postgroup": "category",
            "property_type": "category",
            "old_or_new": "category",
            "duration": "category",
            "price": "double",
        }
    )
    print(f" - reloaded from snapshot, {df.shape[0]}")
df.sample(n=5)


In [None]:
%matplotlib inline

In [None]:
import scipy
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter


In [None]:
from datetime import date

max_price = float(df.price.max())
max_rate = max(df.interest_rate.max(), df.inflation_rate.max())
min_intersecting_date = date(df.date_year.min(), 1, 1)
max_intersecting_date = date(df.date_year.max(), 12, 30)


In [None]:
_, axes = plt.subplots(ncols=2, nrows=1, figsize=(15, 5))


def plot_rate_distributions(ax, df: pd.DataFrame, label: str, color: str):
    df = df.copy()
    x = np.linspace(0.0, df["rate"].max(), 100)
    df["bin"] = pd.cut(df["rate"], bins=x)
    y = list(df.groupby("bin").count()["rate"])
    ax.fill_between(x[:-1], 0.0, y, color=color, alpha=0.5)
    ax.xaxis.set_major_formatter(FormatStrFormatter("%2.2f%%"))
    intervals = [0.05, 0.95]
    for interval, quantile in zip(intervals, df.rate.quantile(intervals)):
        percentile = f"P{int(interval*100.)}={round(quantile, 2)}"
        bbox = dict(boxstyle="round, pad=0.3", fc="lightgray", lw=2)
        ax.axvline(x=quantile, color="blue")
        ax.annotate(
            percentile,
            xy=(quantile, max(y)),
            bbox=bbox,
            ha="center",
            va="center",
        )
    ax.axvline(x=quantile, color="blue")
    ax.legend([label], loc="lower center", bbox_to_anchor=(0.5, -0.2))


plot_rate_distributions(
    ax=axes[0],
    df=interest_df,
    label="interest",
    color="green",
)

plot_rate_distributions(
    ax=axes[1],
    df=inflation_df,
    label="inflation",
    color="red",
)


In [None]:
from datetime import date
from tqdm import tqdm

_, axes = plt.subplots(nrows=2, figsize=(15, 10), sharex=True)

series = df.copy()
series["date"] = df.apply(lambda r: date(r.date_year, r.date_month, r.date_day), axis=1)
series = series.groupby("date").mean(numeric_only=True).dropna()

x = series.index

axes[0].grid(visible=True)
axes[0].plot(x, series.interest_rate * 100.0, "g.-", alpha=0.7)
axes[0].plot(x, series.inflation_rate * 100.0, "r.-", alpha=0.7)
axes[0].set_xlim(left=min_intersecting_date, right=max_intersecting_date)
axes[0].set_ylabel("rates (%)")
axes[0].yaxis.set_major_formatter(FormatStrFormatter("%2.2f%%"))
axes[0].legend(["interest", "inflation"])

axes[1].grid(visible=True)
axes[1].yaxis.set_major_formatter(StrMethodFormatter("{x:,}"))
axes[1].set_ylim(0.0, df.price.quantile(0.95) * 1.2)
axes[1].set_ylabel("property price (£)")
for ix, property_type in tqdm(list(enumerate(ppd_property_type.values()))):
    sub_series = df[df.property_type == property_type].copy()
    sub_series["date_ym"] = sub_series.apply(
        lambda r: date(r.date_year, r.date_month, 1), axis=1
    )
    sub_series = sub_series[["date_ym", "price"]]
    sub_series = sub_series.groupby("date_ym").mean(numeric_only=True)
    sub_series = sub_series.fillna(method="ffill")
    axes[1].plot(sub_series.index, sub_series.price, "s", alpha=0.7)
    axes[1].legend(ppd_property_type.values())


## Machine learning model


In [1]:
try:
    assert df is not None
except NameError:
    import pathlib
    import pandas as pd
    import numpy as np

    print("[SNAPSHOT] Reloading...")
    pd.set_option("display.float_format", lambda x: "{:,.3f}".format(x))
    data_folderpath = pathlib.Path("./data")
    df = pd.read_csv(data_folderpath / "snapshot-Xy-1NF.zip").astype(
        {
            "postgroup": "category",
            "property_type": "category",
            "old_or_new": "category",
            "duration": "category",
            "price": "double",
        }
    )
    print(f" - reloaded from snapshot, {df.shape[0]}")
df.sample(n=5)


[SNAPSHOT] Reloading...
 - reloaded from snapshot, 4336841


Unnamed: 0,date_year,date_month,date_day,date_day_of_week,postgroup,property_type,old_or_new,duration,inflation_rate,interest_rate,price
778875,2018,10,29,0,DY2,flat/maisonettes,new,leasehold,0.022,0.007,89950.0
4223940,2022,7,29,4,BD20,terraced,old,freehold,0.088,0.013,197500.0
2857487,2021,2,18,3,RG41,semi-detached,old,freehold,0.007,0.001,650000.0
694807,2018,9,28,4,DA11,terraced,new,freehold,0.022,0.007,380000.0
3093028,2021,4,22,3,CR2,flat/maisonettes,old,leasehold,0.016,0.001,216000.0


In [2]:
X, y = df[df.columns[:-1]], df[df.columns[-1]]


In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_transformer, make_column_selector


def make_sine_cycle_encoder(period: int = 1) -> float:
    assert period != 0
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


# https://scikit-learn.org/stable/modules/sgd.html#tips-on-practical-use
def make_df_column_transformer():
    categorical_selector = make_column_selector(dtype_include="category")
    float_seletor = make_column_selector(dtype_include="float64")
    one_hot = OneHotEncoder(sparse_output=True, handle_unknown="ignore")
    numerical_scaler = StandardScaler(with_mean=True, with_std=True)
    cycle_sine_12 = make_sine_cycle_encoder(period=12)
    cycle_sine_31 = make_sine_cycle_encoder(period=31)
    cycle_sine_6 = make_sine_cycle_encoder(period=6)
    return make_column_transformer(
        (one_hot, categorical_selector),
        (numerical_scaler, float_seletor),
        (numerical_scaler, ["date_year"]),
        (cycle_sine_12, ["date_month"]),
        (cycle_sine_31, ["date_day"]),
        (cycle_sine_6, ["date_day_of_week"]),
        remainder="drop",
        verbose=True,
    )


preprocessing_df_column_transformer = make_df_column_transformer()
X_encoded = preprocessing_df_column_transformer.fit_transform(X)
pd.DataFrame.sparse.from_spmatrix(X_encoded)


[ColumnTransformer] . (1 of 6) Processing onehotencoder, total=   4.2s
[ColumnTransformer]  (2 of 6) Processing standardscaler-1, total=   0.1s
[ColumnTransformer]  (3 of 6) Processing standardscaler-2, total=   0.1s
[ColumnTransformer]  (4 of 6) Processing functiontransformer-1, total=   0.1s
[ColumnTransformer]  (5 of 6) Processing functiontransformer-2, total=   0.1s
[ColumnTransformer]  (6 of 6) Processing functiontransformer-3, total=   0.1s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2306,2307,2308,2309,2310,2311,2312,2313,2314,2315
0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,1.000,0.000,0.089,0.039,-1.361,0.500,0.201,0.000
1,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,1.000,0.000,0.089,0.039,-1.361,0.500,0.201,0.000
2,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,0.000,1.000,0.089,0.039,-1.361,0.500,0.201,0.000
3,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,1.000,0.000,0.089,0.039,-1.361,0.500,0.201,0.000
4,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,1.000,0.000,0.089,0.039,-1.361,0.500,0.201,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4336836,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,0.000,1.000,3.504,4.581,1.617,-0.866,-0.938,0.866
4336837,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,1.000,0.000,3.504,4.581,1.617,-0.866,-0.849,0.866
4336838,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,1.000,0.000,3.504,4.581,1.617,-0.866,-0.849,0.866
4336839,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,1.000,0.000,3.504,4.581,1.617,-0.866,-0.571,-0.866


In [4]:
from typing import Tuple
from sklearn.model_selection import train_test_split


def produce_split_summary(
    X_split: pd.DataFrame, y_split: pd.DataFrame, name: str, total: int
) -> Tuple[str, int, int, str]:
    return (
        name,
        X_split.shape[0],
        y_split.shape[0],
        "{:.1f}%".format(100.0 * X_split.shape[0] / total),
    )


r = 42
train_size = 0.99
X1, X2, y1, y2 = train_test_split(X_encoded, y, train_size=train_size, random_state=r)
pd.DataFrame(
    [
        produce_split_summary(X, y, "full", total=X.shape[0]),
        produce_split_summary(X1, y1, "train", total=X.shape[0]),
        produce_split_summary(X2, y2, "test", total=X.shape[0]),
    ],
    columns=["split", "|X|", "|y|", "%"],
)


Unnamed: 0,split,|X|,|y|,%
0,full,4336841,4336841,100.0%
1,train,4293472,4293472,99.0%
2,test,43369,43369,1.0%


In [5]:
from typing import Tuple
from sklearn.neural_network import MLPRegressor


def make_model(hidden_layer_sizes: Tuple[int, ...], max_iter: int):
    r = 42
    return MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        validation_fraction=0.01,
        random_state=r,
        verbose=True,
        max_iter=max_iter,
    )


model = make_model(hidden_layer_sizes=(2,), max_iter=1)
model.fit(X1, y1)
pd.DataFrame(
    [
        ("train", model.score(X1, y1)),
        ("test", model.score(X2, y2)),
    ],
    columns=["spit", "score"],
)


Iteration 1, loss = 129946195645.34506226




Unnamed: 0,spit,score
0,train,-0.624
1,test,-0.502


In [6]:
import itertools
import numpy as np
from typing import Set, List, Tuple
from sklearn.model_selection import GridSearchCV


grid = GridSearchCV(
    model,
    param_grid={
        "hidden_layer_sizes": [
            (2, 4),
            (3, 4),
            (4, 4),
            (8, 4),
            (16, 4),
        ],
        "max_iter": [10],
    },
    cv=5,
    verbose=3,
)
grid


In [7]:
grid.fit(X1, y1)
grid.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Iteration 1, loss = 116537180839.55850220
Iteration 2, loss = 80761463472.32351685
Iteration 3, loss = 70791052660.83818054
Iteration 4, loss = 68091691832.29521942
Iteration 5, loss = 66269109792.86598206
Iteration 6, loss = 64521804256.27986908
Iteration 7, loss = 62535351791.22803497
Iteration 8, loss = 60415182141.98910522
Iteration 9, loss = 58493688581.13388062
Iteration 10, loss = 57002419489.90628815




[CV 1/5] END hidden_layer_sizes=(2, 4), max_iter=10;, score=0.171 total time= 4.1min
Iteration 1, loss = 125005543486.08050537
Iteration 2, loss = 89233705548.53434753
Iteration 3, loss = 79265643326.22076416
Iteration 4, loss = 76582553050.11213684
Iteration 5, loss = 74767481606.29716492
Iteration 6, loss = 73028843746.78851318
Iteration 7, loss = 71136353179.88880920
Iteration 8, loss = 69085334782.11497498
Iteration 9, loss = 67161275285.62870026
Iteration 10, loss = 65657288485.71463013




[CV 2/5] END hidden_layer_sizes=(2, 4), max_iter=10;, score=0.252 total time= 4.2min
Iteration 1, loss = 120518233842.34460449
Iteration 2, loss = 83376781659.88890076
Iteration 3, loss = 76663916746.66841125
Iteration 4, loss = 74120578816.20677185
Iteration 5, loss = 72175283635.11212158
Iteration 6, loss = 70046028560.13327026
Iteration 7, loss = 67725309326.87785339
Iteration 8, loss = 65537748301.86626434
Iteration 9, loss = 63890187196.48462677
Iteration 10, loss = 62687024557.79617310




[CV 3/5] END hidden_layer_sizes=(2, 4), max_iter=10;, score=0.248 total time= 3.7min
Iteration 1, loss = 121972747380.89706421
Iteration 2, loss = 86170308864.48503113
Iteration 3, loss = 76223503924.85212708
Iteration 4, loss = 73523211081.89280701
Iteration 5, loss = 71695915565.73443604
Iteration 6, loss = 69967450919.52796936
Iteration 7, loss = 68047239680.02334595
Iteration 8, loss = 65983079531.76647949
Iteration 9, loss = 64080494327.50776672
Iteration 10, loss = 62542308862.95951080




[CV 4/5] END hidden_layer_sizes=(2, 4), max_iter=10;, score=0.215 total time= 4.0min
Iteration 1, loss = 124658197079.83718872
Iteration 2, loss = 88868809941.84579468
Iteration 3, loss = 78885075741.59284973
Iteration 4, loss = 76195958127.33645630
Iteration 5, loss = 74364762196.95271301
Iteration 6, loss = 72631171237.25038147
Iteration 7, loss = 70718272277.78771973
Iteration 8, loss = 68662827436.80850983
Iteration 9, loss = 66759917275.97394562
Iteration 10, loss = 65211745557.88530731




[CV 5/5] END hidden_layer_sizes=(2, 4), max_iter=10;, score=0.244 total time= 4.8min
Iteration 1, loss = 124463702206.09841919
Iteration 2, loss = 124458260175.30221558
Iteration 3, loss = 124452821123.57171631
Iteration 4, loss = 124447377793.32217407
Iteration 5, loss = 124441939211.02917480
Iteration 6, loss = 124436495851.37773132
Iteration 7, loss = 124431057705.27668762
Iteration 8, loss = 124425617383.04949951
Iteration 9, loss = 124420179124.62178040
Iteration 10, loss = 124414742724.58718872




[CV 1/5] END hidden_layer_sizes=(3, 4), max_iter=10;, score=-0.493 total time= 5.2min
Iteration 1, loss = 132920870249.06993103
Iteration 2, loss = 132915428098.22882080
Iteration 3, loss = 132909989261.43649292
Iteration 4, loss = 132904550487.79533386
Iteration 5, loss = 132899110552.75694275
Iteration 6, loss = 132893669312.91693115
Iteration 7, loss = 132888230449.66456604
Iteration 8, loss = 132882791436.22843933
Iteration 9, loss = 132877352864.45970154
Iteration 10, loss = 132871917713.52175903




[CV 2/5] END hidden_layer_sizes=(3, 4), max_iter=10;, score=-0.732 total time= 5.3min
Iteration 1, loss = 131461621485.24711609
Iteration 2, loss = 131456176127.81387329
Iteration 3, loss = 131450735396.56501770
Iteration 4, loss = 131445290564.91346741
Iteration 5, loss = 131439854090.45745850
Iteration 6, loss = 131434411807.97700500
Iteration 7, loss = 131428972488.05035400
Iteration 8, loss = 131423533402.34310913
Iteration 9, loss = 131418094476.56262207
Iteration 10, loss = 131412654656.59336853




[CV 3/5] END hidden_layer_sizes=(3, 4), max_iter=10;, score=-0.674 total time= 5.0min
Iteration 1, loss = 129888405818.29980469
Iteration 2, loss = 129882962545.93354797
Iteration 3, loss = 129877518492.77578735
Iteration 4, loss = 129872079210.16864014
Iteration 5, loss = 129866640789.00119019
Iteration 6, loss = 129861195104.50701904
Iteration 7, loss = 129855757576.08647156
Iteration 8, loss = 129850318843.74612427
Iteration 9, loss = 129844879998.44837952
Iteration 10, loss = 129839439976.73170471




[CV 4/5] END hidden_layer_sizes=(3, 4), max_iter=10;, score=-0.622 total time= 5.0min
Iteration 1, loss = 132575864441.62911987
Iteration 2, loss = 132570422561.27354431
Iteration 3, loss = 132564979209.41978455
Iteration 4, loss = 132559538787.72904968
Iteration 5, loss = 132554099352.30273438
Iteration 6, loss = 132548652667.76820374
Iteration 7, loss = 132543216649.36468506
Iteration 8, loss = 132537778915.17199707
Iteration 9, loss = 132532338027.40228271
Iteration 10, loss = 132526898129.63342285




[CV 5/5] END hidden_layer_sizes=(3, 4), max_iter=10;, score=-0.716 total time= 4.6min
Iteration 1, loss = 110743642704.62802124
Iteration 2, loss = 74116297715.65849304
Iteration 3, loss = 68906782502.18496704
Iteration 4, loss = 66421603532.18337250
Iteration 5, loss = 64238656834.79212952
Iteration 6, loss = 61720864806.35443115
Iteration 7, loss = 59139491973.70907593
Iteration 8, loss = 57131849110.26185608
Iteration 9, loss = 55761450853.98509979
Iteration 10, loss = 54757222337.98211670




[CV 1/5] END hidden_layer_sizes=(4, 4), max_iter=10;, score=0.191 total time= 5.1min
Iteration 1, loss = 119219700329.54650879
Iteration 2, loss = 82579010413.31042480
Iteration 3, loss = 77379751218.52574158
Iteration 4, loss = 74911791685.62098694
Iteration 5, loss = 72747308650.82095337
Iteration 6, loss = 70331298168.88389587
Iteration 7, loss = 67779537798.90168762
Iteration 8, loss = 65741905034.10162354
Iteration 9, loss = 64349626688.91780090
Iteration 10, loss = 63338445415.79711151




[CV 2/5] END hidden_layer_sizes=(4, 4), max_iter=10;, score=0.283 total time= 4.6min
Iteration 1, loss = 117765424383.18458557
Iteration 2, loss = 81135448235.96640015
Iteration 3, loss = 75870469067.11126709
Iteration 4, loss = 73418336134.59747314
Iteration 5, loss = 71260107433.36206055
Iteration 6, loss = 68794383911.71038818
Iteration 7, loss = 66257382505.33696747
Iteration 8, loss = 64242961603.10024261
Iteration 9, loss = 62847969752.24678802
Iteration 10, loss = 61829158460.25091553




[CV 3/5] END hidden_layer_sizes=(4, 4), max_iter=10;, score=0.259 total time= 4.8min
Iteration 1, loss = 116185750374.59788513
Iteration 2, loss = 79555080967.76846313
Iteration 3, loss = 74286987746.56005859
Iteration 4, loss = 71841681673.44865417
Iteration 5, loss = 69667529441.50236511
Iteration 6, loss = 67172679645.66815948
Iteration 7, loss = 64628744456.39997864
Iteration 8, loss = 62624266829.08331299
Iteration 9, loss = 61236484160.47196198
Iteration 10, loss = 60235633926.56857300




[CV 4/5] END hidden_layer_sizes=(4, 4), max_iter=10;, score=0.241 total time= 4.9min
Iteration 1, loss = 118864895656.46527100
Iteration 2, loss = 82219946906.96893311
Iteration 3, loss = 76975572883.48075867
Iteration 4, loss = 74521417883.04370117
Iteration 5, loss = 72357166550.47789001
Iteration 6, loss = 69889677547.77111816
Iteration 7, loss = 67359273445.29254913
Iteration 8, loss = 65330705022.95080566
Iteration 9, loss = 63911413360.50464630
Iteration 10, loss = 62888322653.67573547




[CV 5/5] END hidden_layer_sizes=(4, 4), max_iter=10;, score=0.273 total time= 5.1min
Iteration 1, loss = 96189702225.78398132
Iteration 2, loss = 69034532208.59458923
Iteration 3, loss = 65058952828.18436432
Iteration 4, loss = 61055974707.79560089
Iteration 5, loss = 57357424741.07167053
Iteration 6, loss = 55052209851.67828369
Iteration 7, loss = 53534305860.08687592
Iteration 8, loss = 52267192574.14588928
Iteration 9, loss = 51149396203.24585724
Iteration 10, loss = 50160304454.67521667




[CV 1/5] END hidden_layer_sizes=(8, 4), max_iter=10;, score=0.237 total time= 3.4min
Iteration 1, loss = 104617073648.94381714
Iteration 2, loss = 77487864771.79450989
Iteration 3, loss = 73556226953.38354492
Iteration 4, loss = 69720127521.24243164
Iteration 5, loss = 66054266501.49440765
Iteration 6, loss = 63696020792.25295258
Iteration 7, loss = 62146981798.38996124
Iteration 8, loss = 60881593270.62611389
Iteration 9, loss = 59764231692.39842224
Iteration 10, loss = 58778091184.04358673




[CV 2/5] END hidden_layer_sizes=(8, 4), max_iter=10;, score=0.349 total time= 3.6min
Iteration 1, loss = 103895534665.36614990
Iteration 2, loss = 76030783767.17703247
Iteration 3, loss = 71988977990.36811829
Iteration 4, loss = 68039050762.74380493
Iteration 5, loss = 64364698936.86511993
Iteration 6, loss = 62004226320.94968414
Iteration 7, loss = 60439025235.57301331
Iteration 8, loss = 59201159267.25621796
Iteration 9, loss = 58139612814.93318176
Iteration 10, loss = 57198054618.85047913




[CV 3/5] END hidden_layer_sizes=(8, 4), max_iter=10;, score=0.320 total time= 3.8min
Iteration 1, loss = 101935453352.57774353
Iteration 2, loss = 74444987528.47241211
Iteration 3, loss = 70434401875.80992126
Iteration 4, loss = 66528563391.86125183
Iteration 5, loss = 62870082105.30548096
Iteration 6, loss = 60524194518.77929688
Iteration 7, loss = 58986221730.28992462
Iteration 8, loss = 57723366386.48702240
Iteration 9, loss = 56628420912.54097748
Iteration 10, loss = 55639630930.73387909




[CV 4/5] END hidden_layer_sizes=(8, 4), max_iter=10;, score=0.300 total time= 3.2min
Iteration 1, loss = 104964365183.41217041
Iteration 2, loss = 77137442374.20715332
Iteration 3, loss = 73077286975.47445679
Iteration 4, loss = 69157497630.60223389
Iteration 5, loss = 65488236800.45494843
Iteration 6, loss = 63131929385.33524323
Iteration 7, loss = 61580699579.54413605
Iteration 8, loss = 60318421145.94735718
Iteration 9, loss = 59235037742.27500153
Iteration 10, loss = 58255579258.02764130




[CV 5/5] END hidden_layer_sizes=(8, 4), max_iter=10;, score=0.338 total time= 3.3min
Iteration 1, loss = 89565841996.40547180
Iteration 2, loss = 66760448560.47205353
Iteration 3, loss = 61421580644.89160156
Iteration 4, loss = 56915835106.87313080
Iteration 5, loss = 54540059030.42611694
Iteration 6, loss = 53123790982.31690216
Iteration 7, loss = 52111782577.83787537
Iteration 8, loss = 51306575083.45102692
Iteration 9, loss = 50616694805.40952301
Iteration 10, loss = 50023428150.77447510




[CV 1/5] END hidden_layer_sizes=(16, 4), max_iter=10;, score=0.236 total time= 6.2min
Iteration 1, loss = 98039038614.86181641
Iteration 2, loss = 75640122603.46699524
Iteration 3, loss = 70939471472.11451721
Iteration 4, loss = 66139741524.91505432
Iteration 5, loss = 63347772647.24191284
Iteration 6, loss = 61453023557.42084503
Iteration 7, loss = 59956842351.54866791
Iteration 8, loss = 58685760496.39670563
Iteration 9, loss = 57542547167.09015656
Iteration 10, loss = 56545824028.95244598




[CV 2/5] END hidden_layer_sizes=(16, 4), max_iter=10;, score=0.381 total time= 8.3min
Iteration 1, loss = 96627216984.85125732
Iteration 2, loss = 74072354236.07437134
Iteration 3, loss = 69355338820.26487732
Iteration 4, loss = 64566336243.85940552
Iteration 5, loss = 61675741623.24427795
Iteration 6, loss = 59744069094.01660156
Iteration 7, loss = 58251692828.15032959
Iteration 8, loss = 56949693531.68425751
Iteration 9, loss = 55806918695.91770172
Iteration 10, loss = 54793049970.46424866




[CV 3/5] END hidden_layer_sizes=(16, 4), max_iter=10;, score=0.352 total time= 8.9min
Iteration 1, loss = 94999581493.46548462
Iteration 2, loss = 72233693191.96241760
Iteration 3, loss = 66966556841.18807983
Iteration 4, loss = 62301910656.25228119
Iteration 5, loss = 59849715573.36222076
Iteration 6, loss = 58390533864.74530029
Iteration 7, loss = 57358555219.80556488
Iteration 8, loss = 56536572455.99349976
Iteration 9, loss = 55838890733.20046997
Iteration 10, loss = 55237197851.50032043




[CV 4/5] END hidden_layer_sizes=(16, 4), max_iter=10;, score=0.303 total time= 8.1min
Iteration 1, loss = 100428753726.66613770
Iteration 2, loss = 76061905839.78451538
Iteration 3, loss = 71806184634.91215515
Iteration 4, loss = 67172246965.26683807
Iteration 5, loss = 63861274387.23213196
Iteration 6, loss = 61892991455.87061310
Iteration 7, loss = 60410152855.67568207
Iteration 8, loss = 59171409364.49687195
Iteration 9, loss = 58080530253.21395111
Iteration 10, loss = 57095185745.59258270




[CV 5/5] END hidden_layer_sizes=(16, 4), max_iter=10;, score=0.354 total time= 8.5min
Iteration 1, loss = 91237902208.67666626
Iteration 2, loss = 71149984305.91304016
Iteration 3, loss = 65090701497.04032898
Iteration 4, loss = 60993124706.30133057
Iteration 5, loss = 58670734145.44852448
Iteration 6, loss = 56887276129.87571716
Iteration 7, loss = 55397542715.28436279
Iteration 8, loss = 54123282988.17343903
Iteration 9, loss = 53003795017.81695557
Iteration 10, loss = 52040105936.75706482




{'hidden_layer_sizes': (16, 4), 'max_iter': 10}

In [None]:
model = make_model(
    hidden_layer_sizes=grid.best_params_["hidden_layer_sizes"],
    max_iter=200,
)
model


In [None]:
model.fit(X1, y1)


In [None]:
model.score(X2, y2)


# III. Conclusions


## Performance of results


## Closing remarks/statements
