In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/cs50fp_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Heart_Disease']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'Smoking_History', 'Alcohol Consumption'])

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_HD.pkl')

Recall: 0.9070572482936546
F1-Score: 0.8815050303476409
Accuracy: 0.9070572482936546
Counter({'No': 281333, 'Yes': 24875})


['random_forest_model_HD.pkl']

In [2]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/cs50fp_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Cancer']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'Smoking_History', 'Alcohol Consumption'])

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_C.pkl')

Recall: 0.8052643610594037
F1-Score: 0.7735396799911771
Accuracy: 0.8052643610594037
Counter({'No': 253614, 'Yes': 52594})


['random_forest_model_C.pkl']

In [3]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/cs50fp_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Diabetes']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'Smoking_History', 'Alcohol Consumption'])

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_D.pkl')

Recall: 0.8236177786486398
F1-Score: 0.7860824260633864
Accuracy: 0.8236177786486398
Counter({'No': 259141, 'Yes': 40171, 'Pre-diabetes': 6896})


['random_forest_model_D.pkl']

In [4]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, f1_score
from collections import Counter

disease_data = pd.read_csv('../datasets/cs50fp_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Arthritis']
X_encoded = pd.get_dummies(X, columns=['Exercise', 'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'Smoking_History', 'Alcohol Consumption'])

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y1, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
accuracy = accuracy_score(Y_test, predictions)
recall = recall_score(Y_test, predictions, average='weighted')
f1 = f1_score(Y_test, predictions, average='weighted')

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}") 

class_distribution = Counter(Y1)
print(class_distribution)

joblib.dump(model, 'random_forest_model_A.pkl')

Recall: 0.6821135821821626
F1-Score: 0.6721475429450748
Accuracy: 0.6821135821821626
Counter({'No': 205851, 'Yes': 100357})


['random_forest_model_A.pkl']

In [5]:
disease_data = pd.read_csv('../datasets/cs50fp_cd.csv')
X = disease_data.drop(columns=['Heart_Disease', 'Cancer', 'Diabetes', 'Arthritis'])
Y1 = disease_data['Arthritis']
X

Unnamed: 0,Exercise,Sex,Age_Category,Height_(cm),Weight_(kg),Smoking_History,Alcohol Consumption
0,No,Female,70-74,150,32.66,Yes,0-7
1,No,Female,70-74,165,77.11,No,0-7
2,Yes,Female,60-64,163,88.45,No,0-7
3,Yes,Male,75-79,180,93.44,No,0-7
4,No,Male,80+,191,88.45,Yes,0-7
...,...,...,...,...,...,...,...
306203,Yes,Male,55-59,168,58.97,No,0-7
306204,Yes,Male,25-29,168,81.65,No,0-7
306205,Yes,Male,65-69,180,69.85,No,8-14
306206,Yes,Male,65-69,183,79.38,No,0-7
