In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn.datasets import make_circles

In [None]:
# Load the competition training table.
train_df1 = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
# Independent snapshot of the frame as loaded. A plain `train_df2 = train_df1`
# only binds a second name to the same object, so every later in-place edit of
# train_df1 (encoding, imputation) would show up in train_df2 as well.
train_df2 = train_df1.copy()

In [None]:
# Replace the EJ column with the integer codes returned by pd.factorize
# (element [0] of its result); the list of original labels ([1]) is discarded.
train_df1['EJ'] = pd.factorize(train_df1['EJ'])[0]

In [None]:
# Preview the first rows of the EJ column after encoding.
train_df1['EJ'].head()

In [None]:
# Fill missing values in these columns with the column mean.
# The result is assigned back to the frame instead of calling
# `train_df1[col].fillna(..., inplace=True)`: that form operates on a column
# selection (chained assignment), which pandas 2.x warns about and which does
# not modify the frame at all once Copy-on-Write is active.
for impute_col in ['BQ', 'DU', 'FL', 'CB', 'CC', 'FC', 'FS', 'GL', 'EL']:
    train_df1[impute_col] = train_df1[impute_col].fillna(train_df1[impute_col].mean())

In [None]:
# Remove the columns that are not used as model inputs (including the Id key).
# NOTE(review): 'CW ' and 'BD ' carry a trailing space; drop() raises on unknown
# labels, so they presumably match the CSV headers exactly - keep them as written.
train_df1 = train_df1.drop(
    columns=[
        'CB', 'CC', 'CR', 'CS', 'CU',
        'CW ', 'DA', 'DE', 'DH', 'DL', 'DN',
        'EE', 'EG', 'EP', 'EU', 'FI', 'GE', 'GF', 'GL',
        'Id', 'AH', 'AY', 'BR', 'CF', 'BD ',
    ]
)

In [None]:
# Feature matrix: every remaining column except the target.
x = train_df1.drop(columns=['Class'])

In [None]:
# Target vector: the Class column.
y=train_df1['Class']

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def k_fold(classifier, x_test, y_test, k_folds=10, random_state=42):
    """Print shuffled K-fold cross-validation accuracy for ``classifier``.

    Parameters
    ----------
    classifier
        Estimator handed to ``cross_val_score``.
    x_test, y_test
        Features and labels that are split into folds. Despite the names,
        any matching feature/label pair can be passed.
    k_folds : int, default 10
        Number of folds given to ``KFold``.
    random_state : int, default 42
        Seed for the fold shuffling, so repeated calls use the same folds.

    Returns
    -------
    None
        The per-fold accuracies and their mean are printed, not returned.
    """
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)
    cross_val_results = cross_val_score(classifier, x_test, y_test, cv=kf, scoring='accuracy')
    print("Cross-validation results (Accuracy):", cross_val_results)
    print("Mean Accuracy:", cross_val_results.mean())

In [None]:
import plotly.graph_objects as go
import numpy as np

def graph(fpr, tpr, thresholds, n=10):
    """Show an interactive ROC plot with labelled threshold markers.

    Parameters
    ----------
    fpr, tpr
        False-positive and true-positive rates, one value per threshold.
    thresholds
        Decision thresholds matching ``fpr``/``tpr`` position by position.
    n : int, default 10
        Every ``n``-th point (positions 0, n, 2n, ...) gets a marker and a
        "Thr=..." label. Must be at least 1.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Accept lists / Series as well as arrays: everything below indexes by position.
    fpr = np.asarray(fpr)
    tpr = np.asarray(tpr)
    thresholds = np.asarray(thresholds)

    roc_line = go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        name='ROC curve'
    )

    # Positions 0, n, 2n, ... - the same points a `position % n == 0` mask selects.
    threshold_markers = go.Scatter(
        x=fpr[::n],
        y=tpr[::n],
        mode='markers+text',
        name='Threshold points',
        text=[f"Thr={thr:.2f}" for thr in thresholds[::n]],
        textposition='top center'
    )

    # Diagonal reference line from (0, 0) to (1, 1).
    chance_line = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random (Area = 0.5)',
        line=dict(dash='dash')
    )

    layout = go.Layout(
        title='Receiver Operating Characteristic',
        xaxis=dict(title='False Positive Rate'),
        yaxis=dict(title='True Positive Rate'),
        autosize=False,
        width=800,
        height=800,
        showlegend=False
    )
    fig = go.Figure(data=[roc_line, threshold_markers, chance_line], layout=layout)
    fig.show()


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation (seeded for repeatability).
# The training split is rebalanced with SMOTETomek further down, i.e. the classes
# are treated as imbalanced; stratifying on the label keeps the class ratio the
# same in both splits instead of leaving it to chance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42, stratify=y
)

In [None]:
from imblearn.combine import SMOTETomek

# Seeded SMOTETomek resampler, applied to the training split only;
# x_test / y_test are not passed through it.
sm = SMOTETomek(random_state=42)
x_train_res, y_train_res = sm.fit_resample(x_train,y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random forest fitted on the resampled training data.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=35,
    max_features="sqrt",
    bootstrap=True,
    max_samples=27,  # each tree is fitted on a bootstrap sample of 27 rows
    max_depth=5,
)
clf.fit(x_train_res, y_train_res)

# Second column of predict_proba for every held-out row.
predicted_proba = clf.predict_proba(x_test)[:, 1]

In [None]:
from sklearn.metrics import roc_curve

# Random-forest scores for the held-out rows (second predict_proba column),
# then the ROC points computed from them.
predicted_proba = clf.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, predicted_proba)

In [None]:
# Choose the ROC point where the true-positive rate exceeds the
# false-positive rate by the largest margin, and read off its threshold.
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold is:", optimal_threshold)

In [None]:
# Turn scores into 0/1 labels: 1 where the score reaches the chosen threshold.
y_pred = (predicted_proba >= optimal_threshold).astype(int)

# Accuracy on the held-out split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 summary
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# 10-fold cross-validated accuracy of the random-forest configuration.
# NOTE(review): the folds are cut from the held-out split (x_test, y_test) only -
# confirm that this, rather than the training data, is what should be cross-validated.
k_fold(clf,x_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the thresholded predictions against the held-out labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix:")
print(cm)

In [None]:
# Class counts in the resampled training labels.
y_train_res.value_counts()

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
# AdaBoost fitted on the resampled training data. The seed is fixed like every
# other stochastic step in this notebook (split, resampling, random forest), so a
# fresh "Restart & Run All" reproduces the same model.
ada = AdaBoostClassifier(n_estimators=80, learning_rate=.9, random_state=42)
ada.fit(x_train_res, y_train_res)

# Second column of predict_proba for every held-out row.
predicted_proba = ada.predict_proba(x_test)[:,1]


In [None]:
from sklearn.metrics import roc_curve
# ROC points for the current predicted_proba (last assigned from `ada`);
# this overwrites the fpr / tpr / thresholds computed for the random forest.
fpr,tpr,thresholds=roc_curve(y_test,predicted_proba)

In [None]:

# Choose the ROC point where the true-positive rate exceeds the
# false-positive rate by the largest margin, and read off its threshold.
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print("Optimal threshold is:", optimal_threshold)

In [None]:

# Turn scores into 0/1 labels: 1 where the score reaches the chosen threshold.
y_pred = (predicted_proba >= optimal_threshold).astype(int)

# Accuracy and per-class summary on the held-out split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# 10-fold cross-validated accuracy of the AdaBoost configuration.
# NOTE(review): the folds are cut from the held-out split (x_test, y_test) only -
# confirm that this, rather than the training data, is what should be cross-validated.
k_fold(ada,x_test, y_test)

In [None]:
# Confusion matrix of the thresholded predictions against the held-out labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print("Confusion Matrix:")
print(cm)

In [None]:
# Load the competition test table and prepare it the same way as the training data.
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
# Work on a real copy so the raw `test` frame is left as loaded.
testdf2 = test.copy()

# Encode EJ with the label order of the training file, so each label gets the
# integer code the models were fitted on. Factorizing the test column by itself
# numbers labels by their order of appearance in test.csv, which need not match.
# Labels absent from the training file (and missing values) become -1.
ej_train_labels = pd.factorize(
    pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv',
                usecols=['EJ'])['EJ']
)[1]
testdf2['EJ'] = pd.Categorical(testdf2['EJ'], categories=ej_train_labels).codes

# Same column removal as for the training frame (labels kept exactly as written
# there, including the trailing space in 'CW ' and 'BD ').
testdf2 = testdf2.drop(
    columns=[
        'CB', 'CC', 'CR', 'CS', 'CU',
        'CW ', 'DA', 'DE', 'DH', 'DL', 'DN',
        'EE', 'EG', 'EP', 'EU', 'FI', 'GE', 'GF', 'GL',
        'Id', 'AH', 'AY', 'BR', 'CF', 'BD ',
    ]
)

# The training frame had its gaps mean-imputed; do the same here so predict_proba
# is not handed NaNs. `x` holds the already-imputed training features, so its
# column means are the values the training gaps were filled with (up to rounding).
testdf2 = testdf2.fillna(x.mean())
def prediction(classifier):
    """Print the output of ``classifier.predict_proba`` for the prepared test frame.

    Reads the module-level ``testdf2``; nothing is returned.
    """
    print(classifier.predict_proba(testdf2))

In [None]:
# Print the random forest's predict_proba output for the test frame.
prediction(clf)

In [None]:
# Print the AdaBoost model's predict_proba output for the test frame.
prediction(ada)