## CKD Data SR

In [9]:
# Necessary imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # Import DecisionTreeClassifier instead of LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


def process_column(col):
    if 'discrete' in str(col):
        return np.nan  # return NaN if 'discrete' is in column
    if '-' in str(col):
        low, high = map(float, str(col).split('-'))  # split on '-', convert to float
        return (low + high) / 2  # return the average
    else:
        try:
            return float(col)  # convert to float
        except ValueError:
            return np.nan  # if conversion to float fails, return NaN

# Load the dataset
df = pd.read_csv('ckd-dataset-v2 (2).csv')

# Added Affected - SR
# Apply process_column function to necessary columns
column_list = ['bp (Diastolic)', 'bp limit', 'sg', 'al', 'rbc', 'su', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sod', 'sc', 'pot', 'hemo', 'pcv', 'rbcc', 'wbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'grf', 'stage', 'affected', 'age']
for column_name in column_list:
    df[column_name] = df[column_name].apply(process_column)

# Convert 'class' to integer type
df['class'] = (df['class'] == 'ckd').astype(int)

# Fill missing values with the mean of the respective column
df = df.fillna(df.mean(numeric_only=True))

# One-hot encode categorical variables
enc = OneHotEncoder(drop='first')  # Create encoder object
df_encoded = pd.DataFrame(enc.fit_transform(df.select_dtypes(include=['object'])).toarray())  # Transform data

# Merge with the original df
df = df.join(df_encoded)
df = df.drop(df.select_dtypes(include=['object']).columns, axis=1)

# Split the dataset into features and target
X = df.drop(columns=['class'])
y = df['class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert columns to string type to avoid issues with imputer
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Use mean imputation
imputer = SimpleImputer(strategy='mean')

# Fit on the training data
imputer.fit(X_train)

# Transform both training and testing data
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Train the model using the imputed training data
model = DecisionTreeClassifier(max_depth=1)  # Replace LogisticRegression with DecisionTreeClassifier(max_depth=1)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Exclude rows by index values - SR
filtered_df = df.drop([0, 1])


# Print out the accuracy and confusion matrix
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Accuracy: 1.0
Confusion Matrix:
[[13  0]
 [ 0 28]]


In [59]:
# Load the dataset
df = pd.read_csv('ckd-dataset-v2 (2).csv')

# Function to extract numeric value from a string
def extract_numeric_value(value):
    if isinstance(value, float):
        return value
    elif '-' in value:
        lower, upper = 'sg'.split('-')
        return (float(lower) + float(upper)) / 2
    elif '>' in value:
        return float('sg'[1:])
    elif '>=' in value:
        return float('sg'[2:])
    elif '<' in value:
        return float('sg'[1:])
    elif '<=' in value:
        return float('sg'[2:])
    else:
        return float('sg')

# Apply the function to the 'values' column
df['values'] = df['sg'].apply(extract_numeric_value)


print(df.head())

ValueError: could not convert string to float: 'sg'

## Affected column change

In [51]:
import pandas as pd

# Exclude rows by index values, removes descrete, blanks, and class
filtered_df = df.drop([0, 1])

# Assuming you have your dataset stored in a DataFrame called 'df'
print(filtered_df.head())

  bp (Diastolic) bp limit             sg     al class rbc   su pc pcc ba  \
2              0        0  1.019 - 1.021  1 - 1   ckd   0  < 0  0   0  0   
3              0        0  1.009 - 1.011    < 0   ckd   0  < 0  0   0  0   
4              0        0  1.009 - 1.011    ≥ 4   ckd   1  < 0  1   0  1   
5              1        1  1.009 - 1.011  3 - 3   ckd   0  < 0  0   0  0   
6              0        0  1.015 - 1.017    < 0   ckd   0  < 0  0   0  0   

         bgr           bu        sod      sc     pot  hemo          pcv  \
2      < 112       < 48.1  138 - 143  < 3.65  < 7.31   NaN  33.5 - 37.4   
3  112 - 154       < 48.1  133 - 138  < 3.65  < 7.31   NaN  33.5 - 37.4   
4      < 112  48.1 - 86.2  133 - 138  < 3.65  < 7.31   NaN  29.6 - 33.5   
5  112 - 154       < 48.1  133 - 138  < 3.65  < 7.31   NaN  41.3 - 45.2   
6  154 - 196       < 48.1  133 - 138  < 3.65  < 7.31   NaN  37.4 - 41.3   

          rbcc           wbcc htn dm cad appet pe ane                grf  \
2  4.46 - 5.05  

## Linear Regression

In [11]:
import statsmodels.api as sm

# Assuming you have your data stored in X (independent variables) and y (dependent variable)
X = filtered_df[['hemo','sg','grf']]
y = filtered_df[['affected']]

# Add a constant term to the independent variables
X = sm.add_constant(X)

# Create and fit the linear regression model
model = sm.OLS(y, X)
result = model.fit()

# Print the summary of the linear regression model
print(result.summary())


                            OLS Regression Results                            
Dep. Variable:               affected   R-squared:                       0.601
Model:                            OLS   Adj. R-squared:                  0.595
Method:                 Least Squares   F-statistic:                     98.35
Date:                Sun, 25 Jun 2023   Prob (F-statistic):           7.08e-39
Time:                        19:32:22   Log-Likelihood:                -45.148
No. Observations:                 200   AIC:                             98.30
Df Residuals:                     196   BIC:                             111.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.4514      6.044      5.039      0.0

## Naive Bayes

In [50]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming you have your data stored in a pandas DataFrame called 'filtered_df'
# where 'hemo', 'sg', 'grf' are the input variables, and 'affected' is the target variable
X = filtered_df[['hemo', 'sg', 'grf']]  # Input variables
y = filtered_df['affected']  # Target variable

# Create a Gaussian Naive Bayes classifier
model = GaussianNB(var_smoothing=1e-9)  # Adjust var_smoothing if needed

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Access the class prior probabilities
print("Class Prior Probabilities:")
print(model.class_prior_)

# Access the mean of each feature per class
print("Mean:")
print(model.theta_)


ValueError: could not convert string to float: '1.015 - 1.017'

## Random Forest

In [48]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load your dataset into a pandas DataFrame
filtered_df = df.drop([0, 1])



def extract_numeric_value(value):
    if isinstance(value, float):
        return value
    elif '-' in value:
        lower, upper = value.split('-')
        return (float(lower) + float(upper)) / 2
    elif '>' in value:
        return float(value[1:])
    elif '>=' in value:
        return float(value[2:])
    elif '<' in value:
        return float(value[1:])
    elif '<=' in value:
        return float(value[2:])
    else:
        return float(value)

    
    
    

# Select the input variables (features) and target variable
X = filtered_df.drop('hemo', axis=1)  # Features
y = filtered_df['affected']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier()

# Train the model using the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


ValueError: could not convert string to float: '1.015 - 1.017'

## logistic Regression

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer





# Function to extract numeric value from a string
def extract_numeric_value(value):
    if isinstance(value, float):
        return value
    elif '-' in value:
        lower, upper = value.split('-')
        return (float(lower) + float(upper)) / 2
    elif '>' in value:
        return float(value[1:])
    elif '>=' in value:
        return float(value[2:])
    elif '<' in value:
        return float(value[1:])
    elif '<=' in value:
        return float(value[2:])
    else:
        return float(value)
    
    
    
# Apply the function to the 'hemo' column
filtered_df['hemo'] = filtered_df['hemo'].apply(extract_numeric_value)







# Select the input variable and target variable
X = filtered_df[['hemo']]  # Input variable
y = filtered_df['affected']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values in the training set
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Impute missing values in the test set
X_test_imputed = imputer.transform(X_test)

# Create a Logistic Regression model
model = LogisticRegression()

# Train the model using the imputed training data
model.fit(X_train_imputed, y_train)

# Make predictions on the imputed test data
y_pred = model.predict(X_test_imputed)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Visualize the logistic regression curve and the data points
sns.set_style("whitegrid")
plt.figure(figsize=(8, 6))
plt.scatter(X_test_imputed[y_test == 0], y_test[y_test == 0], color='blue', label='No')
plt.scatter(X_test_imputed[y_test == 1], y_test[y_test == 1], color='red', label='Yes')
plt.plot(X_test_imputed, model.predict_proba(X_test_imputed)[:, 1], color='black', linewidth=2, label='Logistic Regression')

plt.xlabel('hemo')
plt.ylabel('affected')
plt.legend()
plt.show()


ValueError: Found array with 0 feature(s) (shape=(160, 0)) while a minimum of 1 is required by LogisticRegression.

In [43]:
# Set the maximum number of columns to display
pd.set_option('display.max_columns', None)

# Print the head of the DataFrame
print(filtered_df.head(15), filtered_df['hemo'].apply(extract_numeric_value))


   bp (Diastolic) bp limit             sg     al   class rbc     su pc pcc ba  \
2               0        0  1.019 - 1.021  1 - 1     ckd   0    < 0  0   0  0   
3               0        0  1.009 - 1.011    < 0     ckd   0    < 0  0   0  0   
4               0        0  1.009 - 1.011    ≥ 4     ckd   1    < 0  1   0  1   
5               1        1  1.009 - 1.011  3 - 3     ckd   0    < 0  0   0  0   
6               0        0  1.015 - 1.017    < 0     ckd   0    < 0  0   0  0   
7               1        1        ≥ 1.023    < 0  notckd   0    < 0  0   0  0   
8               0        0  1.019 - 1.021  3 - 3     ckd   0    < 0  0   0  0   
9               0        0  1.019 - 1.021    < 0     ckd   0    < 0  0   0  0   
10              0        0        ≥ 1.023    < 0  notckd   0    < 0  0   0  0   
11              1        2  1.009 - 1.011    ≥ 4     ckd   0    < 0  1   1  1   
12              0        0        ≥ 1.023    < 0  notckd   0    < 0  0   0  0   
13              0        0  