In [1]:
import pandas as pd
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from feature_engine.outliers import Winsorizer

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error


In [3]:
# Load the raw table from a CSV located relative to the notebook's working
# directory.
dataset = pd.read_csv(filepath_or_buffer="drug_regulatory_classification_dataset.csv")

In [4]:
# The first column (position 0) is used as the target; every remaining
# column is kept as a candidate feature.
y = dataset.iloc[:, 0]
X = dataset.iloc[:, 1:]


In [5]:
# Partition the feature columns by dtype so that each group can be routed to
# its own preprocessing branch.
#
# 'number' matches every numeric width (int32, float32, ...) instead of only
# int64/float64, and 'category' is accepted next to 'object'. This matters
# because the ColumnTransformer in this notebook is built with
# remainder='drop': a column matched by neither selector is discarded without
# any warning. For a frame holding only int64/float64/object columns the
# selection is the same as before.
# NOTE(review): if text columns are loaded with a dedicated string dtype
# rather than object, confirm they are still picked up here.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

In [6]:
# Numeric branch, applied in order:
#   1. fill missing values with the column median,
#   2. cap values on both tails using the IQR rule with a 1.5 fold,
#   3. rescale with MinMaxScaler.
num_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('winsorizer', Winsorizer(tail='both', capping_method='iqr', fold=1.5)),
        ('scaler', MinMaxScaler()),
    ]
)

In [7]:
# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode into a dense array, dropping the first level of each feature.
cat_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(sparse_output=False, drop='first')),
    ]
)

In [8]:
# Combine both branches: numeric columns go through num_pipeline, categorical
# columns through cat_pipeline. Columns listed in neither group are dropped
# (remainder='drop').
preprocess_pipeline = ColumnTransformer(
    [
        ('num', num_pipeline, numeric_features),
        ('cat', cat_pipeline, categorical_features),
    ],
    remainder='drop',
)

In [9]:
# Fit the preprocessor on the feature table and wrap the transformed output in
# a DataFrame. No column names are passed, so the columns are labelled with
# default integer positions.
# NOTE(review): the preprocessor is fitted on the full feature table here,
# while the train/test split only happens in a later cell. Imputation values,
# capping limits and scaling ranges are therefore learned from rows that end
# up in the test set (data leakage). Consider splitting first and fitting on
# the training rows only.
X_clean = pd.DataFrame(preprocess_pipeline.fit_transform(X))

In [10]:
# Persist the fitted preprocessor so it can be reloaded outside this notebook.
joblib.dump(value=preprocess_pipeline, filename='preprocessing_pipeline.pkl')

# Show the first five transformed rows as a quick sanity check.
print(X_clean.head(5))


         0         1         2         3         4         5         6   \
0  0.725745  0.616880  0.200293  0.666667  0.362222  0.345017  0.752941   
1  0.217885  0.179572  0.336568  0.333333  0.791111  0.246048  0.447059   
2  0.388370  0.248978  0.130030  0.666667  0.064444  0.200687  0.976471   
3  0.744595  0.662990  0.297228  1.000000  0.080000  0.219931  0.517647   
4  0.704833  0.694614  0.415553  0.333333  0.960000  0.060481  0.470588   

         7         8         9   ...   26   27   28   29   30   31   32   33  \
0  0.622706  0.377294  0.410003  ...  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0   
1  0.505882  0.494118  0.655259  ...  1.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0   
2  0.582118  0.417882  0.051006  ...  0.0  1.0  0.0  1.0  0.0  0.0  1.0  1.0   
3  0.496471  0.503529  0.354839  ...  0.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0   
4  0.964235  0.035765  0.299543  ...  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0   

    34   35  
0  0.0  0.0  
1  0.0  0.0  
2  0.0  0.0  
3  0.0  0.0 

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Fit an ordinary linear regression on the training portion.
# NOTE(review): the source file name mentions "classification" while a
# regression model is fitted here; confirm the target column is continuous.
mlr = LinearRegression()
mlr.fit(X=X_train, y=y_train)


In [13]:
# Predict the target for the held-out rows.
y_pred = mlr.predict(X=X_test)


In [14]:
# Score the held-out predictions: R2, MSE, and RMSE derived from the MSE.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE

# Report the three metrics, one per line, in the order R2, MSE, RMSE.
for label, value in (("R2 Score:", r2), ("MSE:", mse), ("RMSE:", rmse)):
    print(label, value)

R2 Score: -0.0005014701106351716
MSE: 46034.67649923033
RMSE: 214.55693067162926


In [15]:
# Persist the fitted model to disk.
joblib.dump(value=mlr, filename='mlr_model.pkl')

# Reload the file that was just written to confirm it round-trips.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
mlr_model = joblib.load(filename='mlr_model.pkl')