In [8]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/diseaserisk_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Heart_Disease']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Smoking_History', 'Alcohol_Consumption'])

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

class_weights = {'No': 1, 'Yes': 1}
model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, class_weight=class_weights, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_HD.pkl')

Recall: 0.9212034144630239
F1-Score: 0.8834370301001205
Accuracy: 0.9212034144630239
Counter({'No': 275231, 'Yes': 24081})


['random_forest_model_HD.pkl']

In [9]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/diseaserisk_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Cancer']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Smoking_History', 'Alcohol_Consumption'])

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

class_weights = {'No': 1, 'Yes': 1.5}
model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, class_weight=class_weights, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_C.pkl')

Recall: 0.8306967575965121
F1-Score: 0.7829572591268751
Accuracy: 0.8306967575965121
Counter({'No': 248124, 'Yes': 51188})


['random_forest_model_C.pkl']

In [10]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/diseaserisk_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Diabetes']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Smoking_History', 'Alcohol_Consumption'])

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

class_weights = {'No': 1, 'Yes': 2.5}
model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, class_weight=class_weights, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_D.pkl')

Recall: 0.8510098057230677
F1-Score: 0.8330210063995455
Accuracy: 0.8510098057230677
Counter({'No': 259141, 'Yes': 40171})


['random_forest_model_D.pkl']

In [11]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/diseaserisk_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Arthritis']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Smoking_History', 'Alcohol_Consumption'])
encoded_columns = X_encoded.columns

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

class_weights = {'No': 1, 'Yes': 1.5}
model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5,class_weight=class_weights, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_A.pkl')
joblib.dump(encoded_columns, 'encoded_columns.pkl')

Recall: 0.7011008469338322
F1-Score: 0.7047558752603579
Accuracy: 0.7011008469338322
Counter({'No': 202066, 'Yes': 97246})


['encoded_columns.pkl']