# Notebook: Defaulter Prediction System
# Run this code in a Jupyter Notebook environment, one cell at a time.

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Function Definitions

In [None]:
# Helper functions: age and BMI calculation, data preprocessing, and train/test splitting
def calculate_age(dob, report_date):
    """Return the age in completed years on ``report_date``.

    Args:
        dob: Date of birth, either a ``'dd/mm/YYYY'`` string or an object
            exposing ``year``/``month``/``day`` (``datetime``, ``date``,
            ``pandas.Timestamp``). Spreadsheet readers frequently hand back
            date cells as datetime objects rather than strings.
        report_date: The date the age is measured at, in the same forms
            accepted for ``dob``.

    Returns:
        int: Whole years elapsed between ``dob`` and ``report_date``.

    Raises:
        ValueError: If a string argument is not in ``'dd/mm/YYYY'`` format.
    """
    def _as_date(value):
        # Only strings need parsing; date-like objects are used as they are.
        if isinstance(value, str):
            return datetime.strptime(value.strip(), '%d/%m/%Y')
        return value

    born = _as_date(dob)
    today = _as_date(report_date)
    # The boolean is 1 when the birthday has not yet occurred in the report
    # year, which removes the year that has not been completed.
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

def calculate_bmi(weight, height):
    """Compute the body-mass index from a weight and a height in centimetres.

    The height is divided by 100 before squaring, so it is treated as
    centimetres. Returns ``None`` when the height is not greater than zero.
    """
    if not height > 0:
        return None
    height_m = height / 100
    return weight / height_m ** 2

def preprocess_data(file_path, *, report_date='07/10/2024', as_of=None):
    """Load the patient workbook and derive the feature and target columns.

    Steps performed, in order: read the sheet, strip column-name whitespace,
    forward-fill missing cells, convert the visit/appointment columns to
    datetimes, derive day gaps, ``Age`` and ``BMI``, build the ``defaulter``
    label, label-encode the categorical columns and split ``Blood Pressure``
    into systolic/diastolic floats.

    Args:
        file_path: Path of the Excel workbook to read.
        report_date: Date at which ``Age`` is measured, as ``'dd/mm/YYYY'``.
            Defaults to the value that used to be hard-coded here.
        as_of: Cut-off for the ``defaulter`` label; a row is labelled 1 when
            its next appointment date is earlier than this. Accepts anything
            ``pd.Timestamp`` accepts (prefer a ``datetime`` or an ISO
            ``'YYYY-MM-DD'`` string to avoid day/month ambiguity). ``None``
            means the current time, so labels depend on when the code runs;
            pass a fixed value for reproducible labels.

    Returns:
        pandas.DataFrame: The input rows with the derived columns added.

    Raises:
        KeyError: If any column this function needs is absent from the sheet.
    """
    df = pd.read_excel(file_path, header=4)  # header=4: column names are on the fifth sheet row
    print("Columns in DataFrame:", df.columns)
    print(df.head())

    df.columns = df.columns.str.strip()  # Strip whitespace from column names
    df.ffill(inplace=True)  # Fill missing values from the previous row

    # Check every column used below up front, so a malformed sheet is
    # reported once with the full list instead of failing midway.
    required_columns = [
        'DOB', 'Next Appointment Date', 'Last Visit Date', 'Self Visit Date',
        'Weight', 'Height', 'Sex', 'AHD Client', 'Medical Cover',
        'Blood Pressure',
    ]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Required column(s) not found in the data: {missing_columns}")

    # Convert relevant columns to datetime; unparseable cells become NaT.
    for date_column in ('Next Appointment Date', 'Last Visit Date', 'Self Visit Date'):
        df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

    # Calculate the number of days between visits
    df['Days Between Last and Next Visit'] = (df['Next Appointment Date'] - df['Last Visit Date']).dt.days
    df['Days Between Last and Self Visit'] = (df['Self Visit Date'] - df['Last Visit Date']).dt.days

    # Calculate Age and BMI
    df['Age'] = df['DOB'].apply(lambda dob: calculate_age(dob, report_date))
    df['BMI'] = df.apply(lambda row: calculate_bmi(row['Weight'], row['Height']), axis=1)

    # Create target variable: 1 when the next appointment is before the
    # cut-off. Comparisons against NaT are False, so rows whose appointment
    # date could not be parsed are labelled 0.
    cutoff = pd.Timestamp.now() if as_of is None else pd.Timestamp(as_of)
    df['defaulter'] = (df['Next Appointment Date'] < cutoff).astype(int)

    # Encode categorical features, one encoder per column so no fitted
    # state is carried from one column to the next.
    df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
    df['AHD Client'] = LabelEncoder().fit_transform(df['AHD Client'].fillna('No'))
    df['Medical Cover'] = LabelEncoder().fit_transform(df['Medical Cover'].fillna('No'))

    # Handle Blood Pressure ("systolic/diastolic"). n=1 caps the split at two
    # parts and reindex guarantees both columns exist even when no cell
    # contains a '/', so the assignment cannot fail on a column-count mismatch.
    pressure_parts = (
        df['Blood Pressure']
        .str.split(pat='/', n=1, expand=True)
        .reindex(columns=[0, 1])
        .astype(float)
    )
    df['Systolic_BP'] = pressure_parts[0]
    df['Diastolic_BP'] = pressure_parts[1]

    return df

def split_data(df):
    """Split the preprocessed frame into train and test sets.

    Selects the model's input columns and the ``defaulter`` label from
    ``df`` and passes them to ``train_test_split`` with 20% of the rows
    held out and a fixed ``random_state`` of 42.

    Returns:
        The four objects produced by ``train_test_split``, in the order
        ``X_train, X_test, y_train, y_test``.
    """
    clinical_columns = ['Age', 'BMI', 'Systolic_BP', 'Diastolic_BP']
    visit_columns = ['Days Between Last and Next Visit',
                     'Days Between Last and Self Visit']
    other_columns = ['Months Of Prescription', 'AHD Client', 'Medical Cover', 'Sex']

    # Column order is kept exactly as the model expects it.
    inputs = df[clinical_columns + visit_columns + other_columns]
    labels = df['defaulter']

    return train_test_split(inputs, labels, test_size=0.2, random_state=42)


# Training the Model and saving it

In [None]:
def train_random_forest(file_path, model_path='models/random_forest_model.pkl'):
    """Train a random-forest classifier on the workbook and save it to disk.

    The data is preprocessed and split by ``preprocess_data`` and
    ``split_data``; accuracy on the held-out split is printed, then the
    fitted classifier is written with ``joblib.dump``.

    Args:
        file_path: Path of the Excel workbook to train on.
        model_path: Where to write the fitted model. Parent directories are
            created when missing. Defaults to the previously hard-coded path.
    """
    df = preprocess_data(file_path)
    X_train, X_test, y_train, y_test = split_data(df)

    # Train Random Forest Classifier (fixed seed for repeatable fits)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # Create the target directory if it doesn't exist. exist_ok avoids the
    # check-then-create race; a bare filename has no directory to create.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Save the model
    joblib.dump(clf, model_path)

# Evaluating the Model

In [None]:
def evaluate_model(file_path, model_path='models/random_forest_model.pkl'):
    """Print a classification report and plot feature importances.

    The saved model is loaded, applied to every preprocessed row of the
    workbook, and compared against the ``defaulter`` label. Note that the
    report covers all rows in the file, not only a held-out split.

    Args:
        file_path: Path of the Excel workbook to evaluate on.
        model_path: Path of the model file written by training. Defaults to
            the previously hard-coded path.
    """
    df = preprocess_data(file_path)
    y = df['defaulter']

    # NOTE: joblib files are pickles; only load model files you trust.
    model = joblib.load(model_path)

    # The inputs must be the same columns, in the same order, that the model
    # was fitted on. Prefer the names the estimator recorded at fit time and
    # fall back to the training feature list when they are unavailable.
    training_features = ['Age', 'BMI', 'Systolic_BP', 'Diastolic_BP',
                         'Days Between Last and Next Visit', 'Days Between Last and Self Visit',
                         'Months Of Prescription', 'AHD Client', 'Medical Cover', 'Sex']
    feature_names = list(getattr(model, 'feature_names_in_', training_features))
    X = df[feature_names]
    y_pred = model.predict(X)

    report = classification_report(y, y_pred)
    print(report)

    # Visualizing feature importances (one bar per model input column)
    feature_importances = model.feature_importances_
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature_importances, y=feature_names)
    plt.title('Feature Importances')
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.show()

# Running the Code
# Main code execution starts here

In [None]:
# Workbook that both steps below read.
file_path = 'activeOnART.xls'

# Train and save the model first; evaluation loads the saved model file.
train_random_forest(file_path=file_path)
evaluate_model(file_path=file_path)