# Pancreatitis Prediction Model
This notebook trains a random forest classifier to predict the likelihood of pancreatitis from patient data and saves the fitted model with joblib.

In [1]:
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score


## Load Dataset

In [2]:
# `__file__` only exists when this code runs as a script; inside a Jupyter
# kernel it is undefined and referencing it raises NameError. Look for the
# dataset next to the script when `__file__` is available, otherwise in the
# kernel's current working directory (usually the notebook's own folder).
if "__file__" in globals():
    base_dir = os.path.dirname(os.path.abspath(__file__))
else:
    base_dir = os.getcwd()
dataset_path = os.path.join(base_dir, "Balanced_Pancreatitis_Dataset.csv")
df = pd.read_csv(dataset_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

## Handle Missing Values

In [3]:
# Rows whose label is missing cannot be used for supervised training, and the
# label must never be filled in by an imputer: that would invent outcomes and,
# for an integer-coded label, the median imputer would turn the classes into
# floats. Drop those rows and keep 'Target' out of the imputation below.
target_col = 'Target'
df = df.dropna(subset=[target_col]).reset_index(drop=True)

# NOTE(review): both imputers are fit on the full dataset, before the
# train/test split performed later in the notebook, so held-out rows
# influence the fill values. Consider fitting on the training split only.

# Fill numerical feature columns with median values.
# `numerical_cols` keeps every float64/int64 column (label included, if it is
# numeric) so later cells see the same selection; only the features are imputed.
num_imputer = SimpleImputer(strategy='median')
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_features = numerical_cols.drop(target_col, errors='ignore')
# SimpleImputer rejects a selection with zero columns, so skip empty groups.
if len(numerical_features) > 0:
    df[numerical_features] = num_imputer.fit_transform(df[numerical_features])

# Fill categorical feature columns with mode (most frequent value).
# `categorical_cols` still lists every object column so the encoding step
# that follows processes the same columns as before.
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_features = categorical_cols.drop(target_col, errors='ignore')
if len(categorical_features) > 0:
    df[categorical_features] = cat_imputer.fit_transform(df[categorical_features])

## Encode Categorical Variables

In [4]:
# Create one LabelEncoder per categorical column, keyed by column name, then
# replace each column's values with the integer codes produced by its own
# encoder. The fitted encoders are kept in `label_encoders`.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

## Define Features and Target

In [5]:
# Separate the label column from the feature columns.
y = df['Target']  # Target variable
X = df.drop(columns=['Target'])  # Features

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Train Machine Learning Model

In [6]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
# `fit` returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out split: the fraction of test rows predicted correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

## Save the Model using joblib

In [7]:
# Serialize the fitted classifier to 'model.joblib' (a relative path, so it is
# written to the current working directory).
# NOTE(review): only the classifier is written in this cell; the imputers and
# label encoders fitted earlier in the notebook are not persisted here.
model_path = 'model.joblib'
joblib.dump(model, model_path)
print('Model saved successfully!')