In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pickle

In [2]:
#change file path to where the cirrhosis.csv file is located
file_path = "./cirrhosis.csv"
cirrhosis_data = pd.read_csv(file_path)

cirrhosis_data.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [4]:
# Data Preprocessing
if 'ID' in cirrhosis_data.columns and 'N_Days' in cirrhosis_data.columns:
    cirrhosis_data.drop(['ID', 'N_Days'], axis=1, inplace=True)

# Concatenate the encoded features with the original dataset
encoded_data = pd.concat([cirrhosis_data, encoded_df], axis=1)
encoded_data.dropna(inplace=True)
# Drop the original columns that were encoded
encoded_data.drop(columns_to_encode, axis=1, inplace=True)
encoded_data['Status'] = encoded_data['Status'].astype('category').cat.codes

# Calculate correlation coefficients with the target variable
correlations = encoded_data.corr()['Status'].abs().sort_values(ascending=False)

# Select top features with highest absolute correlation coefficients
selected_features = correlations.index[1:11]  # Exclude 'Status' column
print(selected_features)
numerical_cols_in_selected = [col for col in numerical_cols if col in selected_features]

KeyError: "['ID', 'N_Days'] not found in axis"

In [11]:
# Splitting the data into features (X) and target (y)
X = encoded_data[selected_features]
y = encoded_data['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
logistic_model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=1)
logistic_model.fit(X_train, y_train)

In [12]:
# Model Accuracy

train_accuracy = accuracy_score(y_train, logistic_model.predict(X_train))
test_accuracy = accuracy_score(y_test, logistic_model.predict(X_test))
model_accuracy = logistic_model.score(X_test, y_test)

print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)
print("Model accuracy:", model_accuracy)

# Testing the model
y_pred = logistic_model.predict(X_test)

# Display the classification report
print(classification_report(y_test, y_pred))

Training accuracy: 0.7136363636363636
Testing accuracy: 0.8035714285714286
Model accuracy: 0.8035714285714286
              precision    recall  f1-score   support

           0       0.73      1.00      0.85        30
           2       1.00      0.58      0.73        26

    accuracy                           0.80        56
   macro avg       0.87      0.79      0.79        56
weighted avg       0.86      0.80      0.79        56



In [13]:
# Sample data for testing the model

print("Unique class labels in y:", y.unique())

# Sample data for testing the model
# Define the number of samples
num_samples = 50

# Define the features
features = {
    'Bilirubin': np.random.uniform(0.1, 5.0, num_samples),
    'Copper': np.random.uniform(0.1, 5.0, num_samples),
    'Prothrombin': np.random.uniform(10, 30, num_samples),
    'Stage': np.random.randint(1, 5, num_samples),
    'Edema_N': np.random.randint(0, 2, num_samples),
    'Hepatomegaly_N': np.random.randint(0, 2, num_samples),
    'Hepatomegaly_Y': np.random.randint(0, 2, num_samples),
    'Ascites_Y': np.random.randint(0, 2, num_samples),
    'Ascites_N': np.random.randint(0, 2, num_samples),
    'Alk_Phos': np.random.uniform(10, 300, num_samples)
}

# Create a DataFrame
df = pd.DataFrame(features)

X = df

# Make predictions
predictions = logistic_model.predict(X)

# Display predictions
print(predictions)

class_labels = {0: 'C', 1: 'CL', 2: 'D'}
decoded_predictions = [class_labels[pred] for pred in predictions]

print("Decoded Predictions:", decoded_predictions)

Unique class labels in y: [2 0 1]
[2 2 2 2 0 0 2 0 2 2 2 2 0 2 2 2 2 0 0 2 2 0 0 2 0 2 0 2 2 2 2 2 2 2 0 2 0
 2 0 2 0 0 0 2 2 0 2 0 0 2]
Decoded Predictions: ['D', 'D', 'D', 'D', 'C', 'C', 'D', 'C', 'D', 'D', 'D', 'D', 'C', 'D', 'D', 'D', 'D', 'C', 'C', 'D', 'D', 'C', 'C', 'D', 'C', 'D', 'C', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'C', 'D', 'C', 'D', 'C', 'D', 'C', 'C', 'C', 'D', 'D', 'C', 'D', 'C', 'C', 'D']


In [14]:
# Assuming model is your trained logistic regression model
pickle.dump(logistic_model, open('logistic_regression_model.pkl', 'wb'))