In [2]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path
# (presumably where the `playground` package lives -- confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [4]:
import logging

import polars as pl
from matplotlib import pyplot as plt

from playground.logs import setup_logging

In [5]:
# Run the project's logging setup (imported from `playground.logs`) before
# creating any logger. The order is kept deliberately: what `setup_logging`
# does to loggers that already exist is not visible from this notebook.
setup_logging()
# Notebook-level logger, keyed by the current module name.
logger = logging.getLogger(__name__)

# Load data

In [6]:
# Location of the training CSV. It is resolved against the current working
# directory, so the kernel must be started from the folder that holds the file.
TRAIN_FILE_PATH = Path.cwd() / "train.csv"
# Fail fast and report the path that was checked, instead of raising a bare
# AssertionError with no context.
assert TRAIN_FILE_PATH.exists(), f"Training data not found: {TRAIN_FILE_PATH}"
assert TRAIN_FILE_PATH.is_file(), f"Training data path is not a regular file: {TRAIN_FILE_PATH}"

In [9]:
# Read the training CSV into a polars DataFrame; column dtypes are inferred by
# polars while reading.
train_df = pl.read_csv(TRAIN_FILE_PATH)
# Bare expression so the notebook renders the frame as the cell output.
train_df

id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
i64,f64,f64,f64,f64,f64,f64,f64,i64,f64
0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19
5,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43
6,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1
7,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94
8,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14
9,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94


In [8]:
# Location of the test CSV. It is resolved against the current working
# directory, so the kernel must be started from the folder that holds the file.
TEST_FILE_PATH = Path.cwd() / "test.csv"
# Fail fast and report the path that was checked, instead of raising a bare
# AssertionError with no context.
assert TEST_FILE_PATH.exists(), f"Test data not found: {TEST_FILE_PATH}"
assert TEST_FILE_PATH.is_file(), f"Test data path is not a regular file: {TEST_FILE_PATH}"

In [10]:
# Read the test CSV into a polars DataFrame; column dtypes are inferred by
# polars while reading.
test_df = pl.read_csv(TEST_FILE_PATH)
# Bare expression so the notebook renders the frame as the cell output.
test_df

id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays
i64,f64,f64,f64,f64,f64,f64,f64,i64
5407,166.1,75.4,163.8,173.8,4.6,1007.2,746.6,56
5408,304.0,0.0,0.0,190.0,0.0,998.0,801.0,7
5409,225.0,0.0,0.0,185.0,0.0,1113.0,833.0,28
5410,251.4,0.0,118.3,188.5,6.4,1028.4,757.7,100
5411,144.0,15.0,195.0,176.0,6.0,1021.0,709.0,28
5412,261.9,111.0,87.5,195.5,5.9,832.6,790.4,28
5413,150.0,0.0,0.0,174.0,2.5,1069.0,676.0,28
5414,272.8,181.9,0.0,185.7,0.0,1012.4,714.3,7
5415,318.8,212.5,0.0,155.7,5.5,852.1,859.2,28
5416,102.0,147.2,0.0,192.0,0.0,917.8,829.5,90


# Feature transformation

In [11]:
# Column roles used throughout the notebook.
ID_COLUMN = "id"  # row identifier; deliberately not listed as a feature
TARGET_COLUMN = "Strength"  # value to predict

# Model inputs. The order matters: the per-column statistics computed later
# are paired with these names positionally.
FEATURE_COLUMNS = [
    # mix components
    "CementComponent",
    "BlastFurnaceSlag",
    "FlyAshComponent",
    "WaterComponent",
    "SuperplasticizerComponent",
    "CoarseAggregateComponent",
    "FineAggregateComponent",
    # age of the sample
    "AgeInDays",
]


In [16]:
# Per-feature mean, computed on the training table only, keyed by column name.
# `select` yields a one-row frame whose values come back in FEATURE_COLUMNS
# order, so they can be paired with the names positionally.
means = {
    name: value
    for name, value in zip(
        FEATURE_COLUMNS,
        train_df.select([pl.col(name).mean() for name in FEATURE_COLUMNS]).row(0),
    )
}
means


{'CementComponent': 299.16818938413167,
 'BlastFurnaceSlag': 58.610578879230644,
 'FlyAshComponent': 31.87279452561494,
 'WaterComponent': 185.0762345108192,
 'SuperplasticizerComponent': 4.10844090993157,
 'CoarseAggregateComponent': 992.0007175883115,
 'FineAggregateComponent': 771.2199741076375,
 'AgeInDays': 51.75161827260958}

In [17]:
# Per-feature standard deviation (polars `std()` with its default settings),
# computed on the training table only, keyed by column name.
# `select` yields a one-row frame whose values come back in FEATURE_COLUMNS
# order, so they can be paired with the names positionally.
stds = {
    name: value
    for name, value in zip(
        FEATURE_COLUMNS,
        train_df.select([pl.col(name).std() for name in FEATURE_COLUMNS]).row(0),
    )
}
stds

{'CementComponent': 105.53768168118059,
 'BlastFurnaceSlag': 83.41780074487346,
 'FlyAshComponent': 54.60500297587532,
 'WaterComponent': 18.517583489378303,
 'SuperplasticizerComponent': 5.692296206281762,
 'CoarseAggregateComponent': 77.14801045239005,
 'FineAggregateComponent': 78.72525270217871,
 'AgeInDays': 70.00697526599377}

In [18]:
def transform(df: pl.DataFrame) -> pl.DataFrame:
    """Standardise every feature column of ``df``.

    Each column named in ``FEATURE_COLUMNS`` is replaced by
    ``(value - means[column]) / stds[column]``, using the module-level
    ``means`` and ``stds`` dictionaries. Columns not listed there are passed
    through unchanged.

    Args:
        df: Frame containing all columns named in ``FEATURE_COLUMNS``.

    Returns:
        A new frame with the same columns, the feature columns rescaled.

    Note:
        There is no guard for a zero standard deviation.
    """
    standardized = []
    for name in FEATURE_COLUMNS:
        centered = pl.col(name) - means[name]
        standardized.append(centered / stds[name])
    return df.with_columns(standardized)

In [19]:
# Standardised copy of the training table; `train_df` itself is left as loaded.
train_transformed_df = train_df.pipe(transform)
train_transformed_df

id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,2.139822,-0.702615,-0.583697,0.049886,-0.721755,1.72395,-2.009774,-0.696382,10.38
1,-1.479739,1.323332,2.03511,0.319899,0.683654,-0.324062,-1.628702,-0.339275,23.52
2,-0.096347,0.912148,-0.583697,0.033685,-0.721755,1.075845,0.305874,-0.339275,36.96
3,0.045783,0.208462,-0.583697,2.318,-0.721755,-0.777735,-1.285737,4.474531,39.05
4,-1.347085,2.126518,-0.583697,0.373902,-0.721755,-0.733664,0.12677,0.546351,74.19
5,0.481646,-0.702615,-0.583697,0.967932,-0.721755,0.816603,0.048015,-0.639245,37.43
6,-1.548908,1.736913,-0.583697,0.033685,-0.721755,1.091399,-0.151412,-0.339275,35.1
7,0.315829,1.005654,-0.583697,2.318,-0.721755,-0.777735,-2.25112,-0.339275,45.94
8,0.216338,-0.702615,-0.583697,0.967932,-0.721755,-0.233327,0.365576,1.831937,42.14
9,-1.574492,1.694955,-0.583697,0.373902,-0.721755,-0.837361,0.86351,-0.696382,6.94


In [20]:
# Standardised copy of the test table, produced by the same `transform`
# function that is applied to the training table.
test_transformed_df = test_df.pipe(transform)
test_transformed_df

id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays
i64,f64,f64,f64,f64,f64,f64,f64,f64
5407,-1.26086,0.201269,2.416028,-0.608947,0.086355,0.197015,-0.312733,0.060685
5408,0.045783,-0.702615,-0.583697,0.265897,-0.721755,0.077763,0.378278,-0.639245
5409,-0.702765,-0.702615,-0.583697,-0.004117,-0.721755,1.568404,0.784755,-0.339275
5410,-0.452617,-0.702615,1.582771,0.184893,0.402572,0.471811,-0.171736,0.689194
5411,-1.470263,-0.522797,2.987404,-0.490141,0.332302,0.375892,-0.790343,-0.339275
5412,-0.353127,0.628036,1.01872,0.562912,0.314734,-2.066168,0.243632,-0.339275
5413,-1.413412,-0.702615,-0.583697,-0.598147,-0.282565,0.998072,-1.209523,-0.339275
5414,-0.249846,1.477975,-0.583697,0.033685,-0.721755,0.264417,-0.723021,-0.639245
5415,0.186017,1.844803,-0.583697,-1.586397,0.244464,-1.813407,1.117558,-0.339275
5416,-1.868226,1.061997,-0.583697,0.373902,-0.721755,-0.961797,0.740296,0.546351
