In [8]:
import pandas as pd

df = pd.read_csv("../data/raw/part01/Data.csv")
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [9]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     str    
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     str    
dtypes: float64(2), str(2)
memory usage: 452.0 bytes


In [10]:
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [11]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
df[["Age", "Salary"]] = imputer.fit_transform(df[["Age", "Salary"]])

df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

X = df.drop("Purchased", axis=1)
y = df["Purchased"]

ct = ColumnTransformer(
    transformers=[
        ("country", OneHotEncoder(drop="first"), ["Country"])
    ],
    remainder="passthrough"
)

X_encoded = ct.fit_transform(X)
X_encoded

array([[0.00000000e+00, 0.00000000e+00, 4.40000000e+01, 7.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 3.00000000e+01, 5.40000000e+04],
       [0.00000000e+00, 1.00000000e+00, 3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 0.00000000e+00, 3.50000000e+01, 5.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 5.00000000e+01, 8.30000000e+04],
       [0.00000000e+00, 0.00000000e+00, 3.70000000e+01, 6.70000000e+04]])

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [14]:
X_encoded.shape, y_encoded.shape

((10, 4), (10,))

In [15]:
from sklearn.model_selection import train_test_split

X = df.drop("Purchased", axis=1)
y = df["Purchased"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape

((8, 3), (2, 3))

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

numeric_features = ["Age", "Salary"]
categorical_features = ["Country"]

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(drop="first")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

X_train_prepared.shape, X_test_prepared

((8, 4),
 array([[-0.02899236, -1.12391193,  0.        ,  1.        ],
        [-0.26093123,  0.18833119,  0.        ,  0.        ]]))

In [18]:
X_test_prepared.shape

(2, 4)

In [19]:
from sklearn.linear_model import LinearRegression, LogisticRegression

clf = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", LogisticRegression()),
    ]
)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5

In [22]:
clf.named_steps["model"].coef_, clf.named_steps["model"].intercept_


(array([[-0.11365409, -0.24988368, -0.3341416 , -0.15974442]]),
 array([0.16318801]))

In [23]:
import numpy as np

pre = clf.named_steps["preprocess"]
model = clf.named_steps["model"]

# Feature names (order matters!)
feat = pre.get_feature_names_out()

w = model.coef_.ravel()
b = float(model.intercept_[0])

feat, w, b


(array(['num__Age', 'num__Salary', 'cat__Country_Germany',
        'cat__Country_Spain'], dtype=object),
 array([-0.11365409, -0.24988368, -0.3341416 , -0.15974442]),
 0.1631880094644399)

### auto-generate a tiny “inference-only” Python function like code
**Step 1** — Extract the constants from the trained `clf`

In [24]:
import numpy as np

pre = clf.named_steps["preprocess"]
model = clf.named_steps["model"]

# Confirm feature order
feature_names = pre.get_feature_names_out().tolist()

# Logistic regression params
w = model.coef_.ravel().tolist()
b = float(model.intercept_[0])

# Numeric scaler stats (the ones you MUST embed)
num_pipe = pre.named_transformers_["num"]
scaler = num_pipe.named_steps["scaler"]
age_mean, salary_mean = scaler.mean_.tolist()
age_std,  salary_std  = scaler.scale_.tolist()

feature_names, w, b, (age_mean, age_std, salary_mean, salary_std)


(['num__Age', 'num__Salary', 'cat__Country_Germany', 'cat__Country_Spain'],
 [-0.11365409190612158,
  -0.2498836760388902,
  -0.3341415989902063,
  -0.1597444172497891],
 0.1631880094644399,
 (39.0, 7.664854858377946, 64847.22222222222, 11430.808691675275))

**Step 2** — Auto-generate an “embedded-style” Python predictor (no sklearn)

In [25]:
# ---- Autogenerated constants (from your trained pipeline) ----
W = np.array(w, dtype=float)
B = float(b)

AGE_MEAN = float(age_mean)
AGE_STD  = float(age_std)
SALARY_MEAN = float(salary_mean)
SALARY_STD  = float(salary_std)

def sigmoid(x: float) -> float:
    # numerically stable-ish sigmoid
    if x >= 0:
        z = np.exp(-x)
        return 1.0 / (1.0 + z)
    else:
        z = np.exp(x)
        return z / (1.0 + z)

def predict_from_csv_line(line: str) -> str:
    """
    Input format: 'Spain,27.0,48000.0'
    Output: 'Yes' or 'No'
    """
    country, age_s, salary_s = [p.strip() for p in line.split(",")]
    age = float(age_s)
    salary = float(salary_s)

    # One-hot with drop='first' (Germany, Spain). France => 0,0
    is_germany = 1.0 if country == "Germany" else 0.0
    is_spain   = 1.0 if country == "Spain" else 0.0

    # StandardScaler
    age_scaled    = (age - AGE_MEAN) / AGE_STD
    salary_scaled = (salary - SALARY_MEAN) / SALARY_STD

    # Feature vector in the SAME order as feature_names
    x = np.array([age_scaled, salary_scaled, is_germany, is_spain], dtype=float)

    score = B + float(np.dot(W, x))
    p = sigmoid(score)

    return "Yes" if p >= 0.5 else "No"

# Quick test
predict_from_csv_line("Spain,27.0,48000.0")


'Yes'

**Step 3** — Auto-print a C function skeleton from Python (optional)

In [26]:
def print_c_constants():
    print(f"// AGE_MEAN={AGE_MEAN}, AGE_STD={AGE_STD}")
    print(f"// SALARY_MEAN={SALARY_MEAN}, SALARY_STD={SALARY_STD}")
    print(f"static const double W0 = {W[0]:.10f};")
    print(f"static const double W1 = {W[1]:.10f};")
    print(f"static const double W2 = {W[2]:.10f};")
    print(f"static const double W3 = {W[3]:.10f};")
    print(f"static const double B  = {B:.10f};")

print_c_constants()


// AGE_MEAN=39.0, AGE_STD=7.664854858377946
// SALARY_MEAN=64847.22222222222, SALARY_STD=11430.808691675275
static const double W0 = -0.1136540919;
static const double W1 = -0.2498836760;
static const double W2 = -0.3341415990;
static const double W3 = -0.1597444172;
static const double B  = 0.1631880095;


C code template

```c
#include <math.h>
#include <string.h>

// TODO: Fill these from Python scaler values
static const double AGE_MEAN = /* e.g. 38.7778 */;
static const double AGE_STD  = /* e.g. 6.8650 */;
static const double SALARY_MEAN = /* e.g. 63777.7778 */;
static const double SALARY_STD  = /* e.g. 14000.1234 */;

// Logistic Regression params (from your output)
static const double W0 = -0.11365409;   // Age_scaled
static const double W1 = -0.24988368;   // Salary_scaled
static const double W2 = -0.33414160;   // isGermany
static const double W3 = -0.15974442;   // isSpain
static const double B  =  0.16318801;

static double sigmoid(double x) {
    // stable-ish sigmoid (good enough for many MCUs)
    if (x >= 0) {
        double z = exp(-x);
        return 1.0 / (1.0 + z);
    } else {
        double z = exp(x);
        return z / (1.0 + z);
    }
}

// Returns 1 for "Yes", 0 for "No"
int predict_purchased(const char* country, double age, double salary) {
    // One-hot with drop="first": Germany and Spain columns
    double isGermany = (strcmp(country, "Germany") == 0) ? 1.0 : 0.0;
    double isSpain   = (strcmp(country, "Spain")   == 0) ? 1.0 : 0.0;

    // Scale numeric features (same as StandardScaler)
    double age_scaled    = (age    - AGE_MEAN)    / AGE_STD;
    double salary_scaled = (salary - SALARY_MEAN) / SALARY_STD;

    // Linear score
    double score = B
        + W0 * age_scaled
        + W1 * salary_scaled
        + W2 * isGermany
        + W3 * isSpain;

    // Probability
    double p = sigmoid(score);

    // Threshold at 0.5
    return (p >= 0.5) ? 1 : 0;
}
```