In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

In [5]:
#change file path to where the cirrhosis.csv file is located
file_path = "./cirrhosis.csv"
cirrhosis_data = pd.read_csv(file_path)

cirrhosis_data.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [6]:
# Data Preprocessing
cirrhosis_data.drop(['ID', 'N_Days'], axis=1, inplace=True)
categorical_cols = [col for col in cirrhosis_data.columns if cirrhosis_data[col].dtype == 'object']
numerical_cols = [col for col in cirrhosis_data.columns if cirrhosis_data[col].dtype != 'object']
# Define the columns to encode
columns_to_encode = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']


# Create an instance of the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the columns to one-hot encoded features
encoded_features = encoder.fit_transform(cirrhosis_data[columns_to_encode])

# Convert the encoded features to a DataFrame
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out(columns_to_encode))

# Concatenate the encoded features with the original dataset
encoded_data = pd.concat([cirrhosis_data, encoded_df], axis=1)
encoded_data.dropna(inplace=True)
# Drop the original columns that were encoded
encoded_data.drop(columns_to_encode, axis=1, inplace=True)
encoded_data['Status'] = encoded_data['Status'].astype('category').cat.codes

# Calculate correlation coefficients with the target variable
correlations = encoded_data.corr()['Status'].abs().sort_values(ascending=False)

# Select top features with highest absolute correlation coefficients
selected_features = correlations.index[1:11]  # Exclude 'Status' column
print(selected_features)
numerical_cols_in_selected = [col for col in numerical_cols if col in selected_features]

Index(['Bilirubin', 'Copper', 'Prothrombin', 'Stage', 'Edema_N',
       'Hepatomegaly_N', 'Hepatomegaly_Y', 'Ascites_Y', 'Ascites_N',
       'Alk_Phos'],
      dtype='object')


In [7]:
# Create and train the logistic regression model
# Splitting the data into features (X) and target (y)
X = encoded_data[selected_features]
y = encoded_data['Status']

scaler = StandardScaler()
X[numerical_cols_in_selected] = scaler.fit_transform(X[numerical_cols_in_selected])
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_cols_in_selected] = scaler.fit_transform(X[numerical_cols_in_selected])


In [8]:
# Model Accuracy

train_accuracy = accuracy_score(y_train, logistic_model.predict(X_train))
test_accuracy = accuracy_score(y_test, logistic_model.predict(X_test))
model_accuracy = logistic_model.score(X_test, y_test)

print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)
print("Model accuracy:", model_accuracy)

# Testing the model
y_pred = logistic_model.predict(X_test)

# Display the classification report
print(classification_report(y_test, y_pred))

Training accuracy: 0.7227272727272728
Testing accuracy: 0.8392857142857143
Model accuracy: 0.8392857142857143
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        30
           2       1.00      0.65      0.79        26

    accuracy                           0.84        56
   macro avg       0.88      0.83      0.83        56
weighted avg       0.88      0.84      0.83        56



In [9]:
import joblib

# Assuming model is your trained logistic regression model
joblib.dump(logistic_model, 'logistic_regression_model.pkl')

# Save the scaler for later use 
joblib.dump(encoder, 'encoder.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']