In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

# Load the dataset
url = 'https://storage.googleapis.com/mbcc/datasets/us_chronic_disease_indicators.csv'
df = pd.read_csv(url)

# Filter the dataset for the specific topic "Cancer"
df_cancer = df[df['topic'] == 'Cancer']

# Select relevant columns
selected_columns = ['locationdesc', 'datavalue', 'topic']

df_cancer = df_cancer[selected_columns].dropna()  # Drop rows with missing values

# Convert categorical variables to numerical
df_cancer = pd.get_dummies(df_cancer, columns=['locationdesc', 'datavalue'])

# Separate features (X) and target variable (y)
X = df_cancer.drop('topic', axis=1)
y = df_cancer['topic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the random forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')



Accuracy: 1.00


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
#url = 'https://storage.googleapis.com/mbcc/datasets/us_chronic_disease_indicators.csv'
#df = pd.read_csv(url)

# Filter the dataset for the specific topic "Cancer"
# df_cancer = df[df['topic'] == 'Cancer']

# Select relevant columns
# selected_columns = ['locationdesc', 'datavalue', 'topic']

# df_cancer = df_cancer[selected_columns].dropna()  # Drop rows with missing values

# Convert categorical variables to numerical
# df_cancer = pd.get_dummies(df_cancer, columns=['locationdesc', 'datavalue'])

# Separate features (X) and target variable (y)
# X = df_cancer.drop('topic', axis=1)
# y = df_cancer['topic']

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10, 15]
}

# Initialize the RandomForestClassifier
# rf_model = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_

# save the best model to disk
filename = 'finalized_rf_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))
test_accuracy = loaded_model.score(X_test, y_test)


print(f'Test Set Accuracy: {test_accuracy:.2f}')


Best Hyperparameters: {'max_depth': None, 'n_estimators': 50}
Best Cross-Validation Accuracy: 1.0
Test Set Accuracy: 1.00


# New section

Decision tree

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
#url = 'https://storage.googleapis.com/mbcc/datasets/us_chronic_disease_indicators.csv'
df = pd.read_csv(url)

# Filter the dataset for the specific topic "Cancer"
df_cancer = df[df['topic'] == 'Cancer']

# Select relevant columns
selected_columns = ['locationdesc', 'datavalue', 'stratification1', 'datavaluetype']

df_cancer = df_cancer[selected_columns].dropna()  # Drop rows with missing values

# Convert categorical variables to numerical
df_cancer = pd.get_dummies(df_cancer, columns=['locationdesc', 'stratification1'])

# Separate features (X) and target variable (y)
X = df_cancer.drop('datavaluetype', axis=1)
y = df_cancer['datavaluetype']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the decision tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# save the best model to disk
filename = 'finalized_dt_model.sav'
pickle.dump(dt_model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))
test_accuracy = loaded_model.score(X_test, y_test)


print(f'Accuracy: {test_accuracy:.2f}')


Accuracy: 0.69


In [None]:
import pandas as pd
url = 'https://storage.googleapis.com/mbcc/datasets/us_chronic_disease_indicators.csv'
df = pd.read_csv(url)

In [None]:
pip install xgboost



XG boost algorithm prediction

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
#url = 'https://storage.googleapis.com/mbcc/datasets/us_chronic_disease_indicators.csv'
#df = pd.read_csv(url)

# Filter the dataset for the specific topic "Cancer"
df_cancer = df[df['topic'] == 'Cancer']

# Select relevant columns
selected_columns = ['locationdesc', 'stratification1', 'topic']

df_selected = df_cancer[selected_columns].dropna()  # Drop rows with missing values

# Encode 'Topic' into numerical labels
label_encoder = LabelEncoder()
df_selected['topic'] = label_encoder.fit_transform(df_selected['topic'])

# Convert categorical variables to numerical
df_selected = pd.get_dummies(df_selected, columns=['locationdesc', 'stratification1'])

# Separate features (X) and target variable (y)
X = df_selected.drop('topic', axis=1)
y = df_selected['topic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the XGBoost model
xgb_model = XGBClassifier(random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)


# save the best model to disk
filename = 'finalized_xgb_model.sav'
pickle.dump(xgb_model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))
test_accuracy = loaded_model.score(X_test, y_test)


print(f'Accuracy: {accuracy:.2f}')


Accuracy: 1.00


Classification report of all the predictions and feature columns

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset from the provided URL
#url = 'https://storage.googleapis.com/mbcc/datasets/us_chronic_disease_indicators.csv'
#df = pd.read_csv(url)

# Assuming 'BinaryOutcome' is the binary target variable, and 'locationdesc' and 'stratification1' are features
target_variable = 'topic'
feature_columns = ['locationdesc', 'stratification1']

# Select relevant columns
X = df[feature_columns]
y = df[target_variable]

# Drop rows with missing values
X = X.dropna()
y = y[X.index]

# Encode categorical variables if needed
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(random_state=42)

# Fit the model to the training data
logistic_model.fit(X_train, y_train)

# save the best model to disk
filename = 'finalized_logit_model.sav'
pickle.dump(logistic_model, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))
test_accuracy = loaded_model.score(X_test, y_test)


# Make predictions on the test set
y_pred = loaded_model.predict(X_test)


print(f"Accuracy: {test_accuracy}")
print("Classification Report:\n", classification_report(y_test, y_pred))



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.1834373213353551


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
                                                  precision    recall  f1-score   support

                                        Alcohol       0.00      0.00      0.00      8458
                                      Arthritis       0.00      0.00      0.00     10978
                                         Asthma       0.00      0.00      0.00      8026
                                         Cancer       0.19      0.79      0.31     26117
                         Cardiovascular Disease       0.15      0.21      0.18     22689
                         Chronic Kidney Disease       0.00      0.00      0.00      3743
          Chronic Obstructive Pulmonary Disease       0.15      0.06      0.08     18840
                                       Diabetes       0.15      0.02      0.03     17217
                                     Disability       0.00      0.00      0.00       631
                                   Immunization       0.00      0.00      0.00      1

  _warn_prf(average, modifier, msg_start, len(result))
