In [1]:
import ast

import pandas as pd
import numpy as np
import statsmodels as statsmodels
np.int = int
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score

In [2]:
scans=pd.read_excel("dummy_data.xlsx")
print(scans.shape)

(550, 138)


In [3]:
scans['in_range13'] = scans['13_18mm']/scans['6_26mm']
scans['in_range15'] = scans['15_18mm']/scans['6_26mm']

In [4]:
carbon = scans.copy()

In [5]:
#clear missing data
carbon['Age at Egg Collection'] = carbon['Age at Egg Collection'].replace(-100, np.nan)
carbon['Age at Egg Collection'] = round(carbon['Age at Egg Collection'],0)
carbon['BMI'] = np.where((carbon['BMI'] < 18) | (carbon['BMI'] > 50), np.nan, carbon['BMI'])
carbon['trigger_map'] = carbon['trigger_map'].replace('NONE', np.nan)
carbon['protocol_map'] = carbon['protocol_map'].replace('NONE', np.nan)

In [6]:
X = carbon[['Age at Egg Collection', 'in_range13', 'trigger_map', '6_26mm','live_birth']].copy()

X = X.dropna()
y = X['live_birth'].copy()

In [7]:
X.shape
y.shape

(550,)

In [8]:


# Assuming X is your dataframe
numerical_features = ['Age at Egg Collection', '6_26mm', 'in_range13']
categorical_features = ['trigger_map']

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Use ColumnTransformer to apply transformers to the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the Logistic Regression model
clf = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])


In [115]:

lr_model = clf.fit(X, y)
print(lr_model.score(X,y))



0.6885096007314843


In [117]:

y_pred = lr_model.predict(X)
print(precision_score(y, y_pred))
print(recall_score(y, y_pred))


0.4170403587443946
0.03070320237702212


In [116]:
print(lr_model.named_steps['classifier'].coef_)
print(lr_model.named_steps['classifier'].intercept_)

[[-0.34057423  0.10598223 -0.08386985 -0.12797105  0.12786128]]
[-0.95281978]


In [49]:
nox_values = np.linspace(np.min(X['in_range13']), np.max(X['in_range13']))

pdp_values = []
for n in nox_values:
  X_pdp = X.copy()
  X_pdp['in_range13'] = n

  pdp_values.append(np.mean(lr_model.predict_proba(X_pdp)[:,1]))

n_bootstrap = 100

nox_values = np.linspace(np.min(X['in_range13']), np.max(X['in_range13']))

expected_value_bootstrap_replications = []

for _ in range(n_bootstrap):
    X_boot, y_boot = resample(X,y)

    lr_model_boot = clf.fit(X_boot, y_boot)

    bootstrap_model_predictions = []
    for n in nox_values:
        X_pdp = X_boot.copy()
        X_pdp['in_range13'] = n
        bootstrap_model_predictions.append(np.mean(lr_model_boot.predict_proba(X_pdp)[:,1]))
    expected_value_bootstrap_replications.append(bootstrap_model_predictions)

expected_value_bootstrap_replications = np.array(expected_value_bootstrap_replications)

prediction_se = np.std(expected_value_bootstrap_replications, axis=0)

In [121]:
pdp_values_np = np.array(pdp_values)
df = pd.DataFrame([nox_values*1, pdp_values_np, (pdp_values_np + prediction_se*3), (pdp_values_np - prediction_se*3)])
df.to_csv("PDP_inrange13.csv")