# Regression template

This follows https://auto.gluon.ai/stable/tutorials/tabular/tabular-quick-start.html

Work through the notebook cells and adapt them to make it work for your own project.


### Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from psmiles import PolymerSmiles as PS
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### Data and fingerprint

- Replace the following code so that it loads your own data

In [None]:
# Download the example dataset and keep only the two columns used below:
# the polymer SMILES string and the target value.
df_init = pd.read_json(
    "https://raw.githubusercontent.com/kuennethgroup/materials_datasets/refs/heads/main/polymer_tendency_to_crystalize/polymers_tend_to_crystalize.json"
).loc[:, ["smiles", "value"]]

# Compute one fingerprint per polymer with the PSMILES package and stack the
# results row-wise into a single array.
fps = np.vstack([PS(smiles).fingerprint() for smiles in df_init["smiles"]])

### Scale the fingerprints

In [None]:
# Min-max scale every fingerprint column (MinMaxScaler's default range is [0, 1])
# and wrap the result in a DataFrame so it can be concatenated with the labels.
# The fitted scaler is kept in `scaler_fps` so that new data can be transformed
# the same way later on.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test-set statistics leak into the features.
# Consider fitting on the training rows only.
scaler_fps = MinMaxScaler()
fps_scaled = pd.DataFrame(scaler_fps.fit_transform(fps))

### Prepare final data frame

In [None]:
# Concat fingerprints and the original table column-wise.
# `fps_scaled` was built from a plain array and therefore has a 0..n-1 index;
# pd.concat aligns rows on index labels, so the index of `df_init` is reset to
# guarantee a positional match (differing labels would silently yield NaN rows).
df = (
    pd.concat((fps_scaled, df_init.reset_index(drop=True)), axis=1)
    # drop smiles column because it should not be used for training
    .drop(columns="smiles")
    # Make sure they're all float
    .astype(np.float32)
)

# Remove columns that are zero everywhere, if any. The explicit copy makes the
# result an independent frame, so the column assignment below cannot trigger
# pandas' SettingWithCopyWarning.
df = df.loc[:, (df != 0).any(axis=0)].copy()

# Normalize the tendency to crystalize to the [0, 1] range.
# `scaler_value` is kept so predictions can be mapped back to the original
# units with `scaler_value.inverse_transform`.
# NOTE(review): like the fingerprint scaler, this is fitted on all rows before
# the train/test split.
scaler_value = MinMaxScaler()
df["value"] = scaler_value.fit_transform(df[["value"]])

# Show only the first rows instead of dumping the whole frame
df.head()

### Split into train and test sets

In [None]:
# Hold out 20 % of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.20)

# Render both partitions, training set first
display(df_train, df_test)

### Train your AutoGluon ML model

In [None]:
# Configure a regression predictor whose target is the "value" column ...
predictor = TabularPredictor(label="value", problem_type="regression")

# ... and train it on the training split. The search is capped at 60 seconds
# and uses the "best_quality" preset. `fit` returns the predictor itself, so
# the name is simply rebound to the fitted object.
predictor = predictor.fit(
    train_data=df_train,
    presets="best_quality",
    time_limit=60,
)

### Evaluate the model and plot the predictions with matplotlib

In [None]:
# Predict on the held-out test rows; the label column is dropped so that only
# the feature columns are passed to the model.
y_pred = predictor.predict(df_test.drop(columns=["value"]))

# Score the predictions against the true labels. Both metrics refer to the
# min-max scaled target, not to the original units of "value".
r2 = r2_score(df_test["value"], y_pred)
rmse = root_mean_squared_error(df_test["value"], y_pred)

print(f"R2: {r2}")
print(f"RMSE: {rmse}")

In [None]:
# Parity plot of predicted vs. true target values for the test set.
# `y_pred`, `r2` and `rmse` are reused from the evaluation cell above, so the
# model is not queried a second time.
fig, ax = plt.subplots()

ax.plot(y_pred, df_test["value"], "o")
# Identity line (perfect prediction); it spans [0, 1] because the target was
# min-max scaled to that range.
ax.plot([0, 1], [0, 1], "k--")
# Annotate the metrics in axes coordinates so they stay in the upper-left corner
ax.text(0.1, 0.9, f"R2 = {r2:.3f}", transform=ax.transAxes)
ax.text(0.1, 0.85, f"RMSE = {rmse:.3f}", transform=ax.transAxes)
ax.set_title("Testdata set")
ax.set_ylabel("truth")
# Trailing semicolon suppresses the Text(...) repr in the notebook output
ax.set_xlabel("pred");