In [None]:
from contextlib import closing

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
import common

In [None]:
from sklearn.datasets import load_diabetes

# Fetch the scikit-learn diabetes data as pandas objects: a feature frame (X)
# and a target series (y). scaled=True asks for the scaled feature variables.
X, y = load_diabetes(
    scaled=True,
    as_frame=True,
    return_X_y=True,
)

# Assign explicit short labels to the ten feature columns.
X.columns = [
    'age', 'sex', 'bmi', 'bp', 'tc',
    'ldl', 'hdl', 'tch', 'ltg', 'glu',
]

# Show the (rows, columns) shape first, then preview the leading rows.
display(X.shape)
X.head()

In [None]:
# Per-column summary statistics; the quartiles listed here are the pandas
# defaults, spelled out so the reported rows are explicit.
X.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Put the features and the target side by side in a single frame
# (column-wise concatenation, aligned on the shared index).
data = pd.concat(objs=[X, y], axis="columns")

In [None]:
# Pairwise scatter plots of every feature against every other feature;
# the trailing semicolon hides the returned grid object's repr.
sns.pairplot(data=X);

In [None]:
# Histogram of the target with a kernel density estimate overlaid.
target_grid = sns.displot(data=y, kde=True, height=4, aspect=2)
# displot returns a single-axes FacetGrid here; title that axes directly.
target_grid.ax.set_title("Target distribution");

In [None]:
# Column-wise missing-value check: True means the column holds at least one
# missing entry.
X.isnull().any(axis=0)

In [None]:
# Diverging palette between hues 20 and 220, sampled at 200 colours, so that
# negative and positive correlations get visually distinct colours.
cmap_corr = sns.diverging_palette(h_neg=20, h_pos=220, n=200)

# Pairwise correlations of all features together with the target.
corr_matrix = pd.concat([X, y], axis=1).corr()

# Fix the colour scale to [-1, 1] and print each coefficient with two decimals.
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap=cmap_corr,
    vmin=-1,
    vmax=1,
);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed comes from the shared
# `common` module so the split is the same on every run.
data_train, data_test = train_test_split(
    data,
    random_state=common.RANDOM_STATE,
    test_size=0.2,
)

# Report the shape of each partition (train first, then test).
display(*(part.shape for part in (data_train, data_test)))

In [None]:
import sqlite3
import os

# Make sure the directory that will hold the database file exists.
# os.path.dirname() returns "" when DB_PATH is a bare file name, and
# os.makedirs("") raises FileNotFoundError, so only create a directory when
# there is one to create. exist_ok=True avoids the check-then-create race.
db_dir = os.path.dirname(common.DB_PATH)
if db_dir:
    os.makedirs(db_dir, exist_ok=True)

print(f"Saving train and test data to a database: {common.DB_PATH}")
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does NOT close the connection. closing() guarantees the
# handle is released, while the inner `with con:` keeps the commit/rollback
# behaviour around both writes.
with closing(sqlite3.connect(common.DB_PATH)) as con:
    with con:
        # if_exists="replace" drops any previous table of the same name, so
        # re-running this cell overwrites the stored splits.
        data_train.to_sql(name='train', con=con, if_exists="replace")
        data_test.to_sql(name='test', con=con, if_exists="replace")

In [None]:
print(f"Reading train data from the database: {common.DB_PATH}")
# sqlite3's connection context manager does not close the connection, so wrap
# both the connection and the cursor in closing() to release them promptly.
with closing(sqlite3.connect(common.DB_PATH)) as con:
    with closing(con.cursor()) as cur:
        res = cur.execute("SELECT * FROM train LIMIT 3")
        # Materialise the rows while the connection is still open.
        preview_rows = res.fetchall()

# Sanity check: show the first three stored rows of the train table.
display(preview_rows)