In [None]:
import pandas as pd
import numpy as pn
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression

# Column names used to index the measurement DataFrames (German CSV headers).
COL_WEIGHT = "Gewicht"  # weight
COL_HG = "Brustumfang"  # chest girth
COL_SHOULDER = "Schulterhöhe"  # shoulder height
COL_RUMP = "Kreuzbeinhöhe"  # sacrum (rump) height
COL_LENGTH = "Länge"  # length
COL_MARKE = "Ohrmarkennummer"  # ear tag number

# Load `alte_messung.csv` and show it as the cell output.
alte_messung = pd.read_csv("Biometriedaten/alte_messung.csv")
alte_messung


In [None]:
# Read `erste_messung.csv` and discard the row with index label 3 in one step.
# NOTE(review): the reason for excluding row 3 is not recorded in this notebook — confirm.
erste_messung = pd.read_csv("Biometriedaten/erste_messung.csv").drop(index=3)
erste_messung


In [None]:
# Read `zweite_messung.csv` and discard the row with index label 4 in one step.
# NOTE(review): the reason for excluding row 4 is not recorded in this notebook — confirm.
zweite_messung = pd.read_csv("Biometriedaten/zweite_messung.csv").drop(index=4)
zweite_messung

In [None]:
# Stack the three measurement series row-wise into one frame.
# The original index labels are kept, so labels may repeat across the series.
combined = pd.concat(
    (alte_messung, erste_messung, zweite_messung),
    axis=0,
)
combined

In [None]:
# Baseline model: linear regression of weight on length, shoulder height and rump
# height, fitted on every row of `combined`.
target = combined
model = Pipeline(
    steps=[
        # degree=1 without a bias column passes the input features through unchanged.
        ("poly_features", PolynomialFeatures(degree=1, include_bias=False)),
        ("lin_reg", LinearRegression(fit_intercept=True, positive=False)),
    ]
)
model.fit(target[[COL_LENGTH, COL_SHOULDER, COL_RUMP]], target[COL_WEIGHT])

# Show the fitted intercept followed by one coefficient per input feature.
print(model.named_steps["lin_reg"].intercept_)
print(model.named_steps["lin_reg"].coef_)

In [None]:


# Benchmark rows: `zweite_messung` without the rows labelled 1 and 7.
# NOTE(review): `zweite_messung` is also part of `combined`, the data the model was
# fitted on, so this MAPE is an in-sample error rather than a held-out estimate.
truth = zweite_messung.drop(index=[1, 7])

result = model.predict(truth[[COL_LENGTH, COL_SHOULDER, COL_RUMP]])
print(result)
print(truth[COL_WEIGHT])

# Mean absolute percentage error of the predicted weights (shown as the cell output).
mape = mean_absolute_percentage_error(y_true=truth[COL_WEIGHT], y_pred=result)
mape

In [None]:
# Ear tag number of each row of `referenz_measurements` below
# (key = row position in the reference array, value = ear tag number).
ref_ohrenmarke = {
    0:10610470885,
    1:10610436177,
    2:10610436171,
    3:10610536153,
    4:10610469975,
    5:10610436175,
}

# Reorder `truth` so that its k-th row belongs to the same ear tag as reference row k.
# Each tag is resolved to its index label in `truth`; a tag that is missing from
# `truth` raises IndexError here.
permutation = [int(truth[truth[COL_MARKE] == marke].index[0]) for marke in ref_ohrenmarke.values()]
permutated_truth = truth.loc[permutation]

# length, shoulder, rump, bodyheight, weight, distance, angle, 
referenz_measurements = np.array(
[[78.95958041958042, 63.49408421842033, 65.15045163281391, 35.51987899755109, 46.77396860806468, 1.263, 16.3],
[72.14321678321679, 67.38661820608935, 66.41000055092864, 36.91614736507504, 42.83801138724803, 1.252, 26.3],
[70.24307692307693, 66.27756993434227, 63.05837368038849, 35.22179430796474, 35.66266929444433, 1.234, 24.3],
[81.80307692307692, 73.57663464540036, 72.50720681625211, 44.27431212673802, 61.3547022317708, 1.386, 25],
[77.26321678321679, 64.97177728851628, 70.26728264745492, 42.77138943758125, 53.67930545833971, 1.354, 21.6],
[68.36615384615384, 59.98241574476878, 68.97977810648409, 37.58898053338844, 43.72970663304724, 1.307, 23.9]])

# `model` was fitted on three features in the order length, shoulder, rump, which are
# exactly the first three reference columns. The earlier slice `0:3:2` selected only
# length and rump, which makes `predict` fail with a feature-count mismatch when the
# notebook is run top to bottom. Wrapping the slice in a DataFrame keeps the feature
# names the model saw during fitting.
ref_features = pd.DataFrame(
    referenz_measurements[:, 0:3],
    columns=[COL_LENGTH, COL_SHOULDER, COL_RUMP],
)
ref_prediction = model.predict(ref_features)
ref_mape=mean_absolute_percentage_error(permutated_truth[COL_WEIGHT], ref_prediction)
print(ref_mape)

def create_plot(x_data, y_data, labels, name):
    """Show a scatter plot of ``y_data`` against ``x_data`` with a text label per point.

    Args:
        x_data: x coordinates; accessed by position (``x_data[i]``).
        y_data: y coordinates; accessed by position (``y_data[i]``).
        labels: iterable of labels; the i-th label is drawn next to the i-th point.
        name: title of the figure.
    """
    plt.scatter(x_data, y_data)
    plt.title(name)
    # Anchor each label with its bottom-right corner on the point it describes.
    text_style = dict(fontsize=12, ha='right', va='bottom')
    for position, text in enumerate(labels):
        plt.text(x_data[position], y_data[position], text, **text_style)
    plt.show()

# Columns of the reference array (layout: length, shoulder, rump, bodyheight, ...).
bodyLengths  = referenz_measurements[:, 0]
shoulderHeights = referenz_measurements[:, 1]
rumpHeights = referenz_measurements[:, 2]
bodyHeights = referenz_measurements[:, 3]

# Every point is labelled with its row position in the reference array.
point_labels = range(len(bodyHeights))
weights = permutated_truth[COL_WEIGHT].to_numpy()

# Weight (x axis) against one body dimension (y axis).
create_plot(weights, bodyHeights, point_labels, "body height to body weight")
create_plot(weights, permutated_truth[COL_LENGTH].to_numpy(), point_labels, "body length to body weight")
create_plot(weights, permutated_truth[COL_SHOULDER].to_numpy(), point_labels, "shoulder heights to shoulder height measured".replace("shoulder height measured", "body weight"))
create_plot(weights, permutated_truth[COL_RUMP].to_numpy(), point_labels, "rump heights to body weight")

# Value from `permutated_truth` (x axis) against the same dimension from the reference
# array (y axis). The shoulder and rump comparisons previously put weight on the x axis
# (the rump one also plotted the truth column on y, duplicating the plot above), so
# they did not show what their titles say.
create_plot(permutated_truth[COL_LENGTH].to_numpy(), bodyLengths, point_labels, "body length to body length measured")
create_plot(permutated_truth[COL_SHOULDER].to_numpy(), shoulderHeights, point_labels, "shoulder heights to shoulder height measured")
create_plot(permutated_truth[COL_RUMP].to_numpy(), rumpHeights, point_labels, "rump heights to rump height measured")

# Pearson correlation between the reference body height and the weight.
# The result is unpacked as (statistic, p-value), which works both for the plain tuple
# returned by older SciPy versions and for the result object of newer ones.
body_height_r = pearsonr(bodyHeights, permutated_truth[COL_WEIGHT])
correlation, pvalue = body_height_r
print(correlation, pvalue)

In [None]:
# Grid search over the linear-model settings.
# `performance` maps (intercept, positive, poly, features) to a pair of MAPE values:
# (error on the benchmark rows `truth`, error on the reference measurements).

# Column position of each feature inside `referenz_measurements`
# (layout: length, shoulder, rump, ...). Reference columns are selected through this
# mapping so that they always arrive in the same order as the features a candidate was
# fitted on. The earlier fixed slice `0:3` fed (length, shoulder, rump) to models
# fitted on (length, rump, shoulder); reference errors recorded for three-feature
# configurations before this change were therefore computed with shoulder and rump
# swapped.
REF_COLUMN_INDEX = {COL_LENGTH: 0, COL_SHOULDER: 1, COL_RUMP: 2}

performance = {}
truth = zweite_messung.drop([1, 7])
target = combined

for intercept in [True, False]:
    for positive in [True, False]:
        for poly in [True, False]:
            for features in [
                [COL_LENGTH, COL_RUMP, COL_SHOULDER],
                [COL_LENGTH, COL_RUMP],
            ]:
                # A separate name keeps the `model` fitted in an earlier cell intact.
                candidate = Pipeline([
                    ('poly_features', PolynomialFeatures(degree=2 if poly else 1, include_bias=False)),
                    ('lin_reg', LinearRegression(fit_intercept=intercept, positive=positive)),
                ])
                candidate.fit(target[features], target[COL_WEIGHT])

                result = candidate.predict(truth[features])
                mape = mean_absolute_percentage_error(truth[COL_WEIGHT], result)

                # Pick the reference columns by feature name and keep the names, so the
                # input matches what the candidate saw during fitting.
                ref_columns = [REF_COLUMN_INDEX[feature] for feature in features]
                ref_inputs = pd.DataFrame(referenz_measurements[:, ref_columns], columns=features)
                ref_prediction = candidate.predict(ref_inputs)

                ref_mape = mean_absolute_percentage_error(permutated_truth[COL_WEIGHT], ref_prediction)

                performance[(intercept, positive, poly, tuple(features))] = (mape, ref_mape)

for key, value in performance.items():
    print(key)
    real, ref = value
    print("bench: {:.3f} ref: {:.3f}".format(real, ref))

# Configuration with the lowest error on the reference measurements. `min` returns
# the first such key on ties and needs no arbitrary starting threshold.
m_ref_key = min(performance, key=lambda config: performance[config][1])
m_ref = performance[m_ref_key][1]

print(m_ref)
print(m_ref_key)

# Results of hyperparameter tuning
We should not use the shoulder height, as these measurements are not good enough to be useful.

These are the best results:

- (True, True, True, ('Länge', 'Kreuzbeinhöhe')) bench: 0.059 ref: 0.152
- (True, True, False, ('Länge', 'Kreuzbeinhöhe')) bench: 0.067 ref: 0.154
- (True, False, True, ('Länge', 'Kreuzbeinhöhe')) bench: 0.061 ref: 0.154
- (True, False, False, ('Länge', 'Kreuzbeinhöhe')) bench: 0.067 ref: 0.154


It seems that we should fit an intercept, while the other settings have only a minimal impact on real-world performance. Obviously, the model fits better on finer measurements.
Therefore, I will choose the least complex model.

In [None]:
# Robustness check: how much does the benchmark MAPE change when the input
# measurements are perturbed by a small relative error?
NOISE_SEED = 42            # fixed seed so the perturbation is identical on every run
MAX_RELATIVE_NOISE = 0.02  # each value is scaled by a factor in [1, 1 + MAX_RELATIVE_NOISE)

features = [COL_LENGTH, COL_RUMP]
model = Pipeline([
    ('lin_reg', LinearRegression(fit_intercept=True, positive=False)),
])
model.fit(target[features], target[COL_WEIGHT])

f_values = truth[features].to_numpy()
print(f_values)

# NOTE(review): the factors are always >= 1, so the noise only ever increases the
# measurements; confirm that a one-sided perturbation is intended.
rng = np.random.default_rng(NOISE_SEED)
rnd = rng.random(f_values.shape) * MAX_RELATIVE_NOISE + 1
f_values = f_values * rnd
print(f_values)

# Keep the column names so the input matches the features the model was fitted on.
result = model.predict(pd.DataFrame(f_values, columns=features))
mape = mean_absolute_percentage_error(truth[COL_WEIGHT], result)
print(mape)

In [None]:
from sklearn.svm import SVR

# Support vector regression with a polynomial kernel on length and rump height,
# evaluated on the reference measurements.
features = [COL_LENGTH, COL_RUMP]
svr = SVR(kernel='poly')
# The target is passed as a 1-D Series; a single-column DataFrame is an (n, 1) column
# vector, which scikit-learn has to flatten and reports with a conversion warning.
svr.fit(combined[features], combined[COL_WEIGHT])

# Reference columns 0 and 2 hold length and rump; keep the feature names used in `fit`.
ref_inputs = pd.DataFrame(referenz_measurements[:, [0, 2]], columns=features)
predicted = svr.predict(ref_inputs)
print(predicted)

mean_absolute_percentage_error(permutated_truth[COL_WEIGHT], predicted)