In [None]:
import pandas as pd
import numpy as np
# Load the training data from a CSV file in the current working directory.
# Later cells use the 'case_status' column as the outcome and one-hot encode
# 'Smoke_status', 'Ethnicity', 'Alcohol_status' and 'type'.
training = pd.read_csv('training_clustered.csv')

In [None]:
# Split the training data into predictors (X) and outcome (y).
# X holds every column except the outcome column 'case_status'.
X = training.drop(columns='case_status')

# y is the 'case_status' column that the model is trained to predict.
y = training['case_status']

# One-hot encode the categorical predictors. drop_first=True drops one level
# per variable so the dummy columns are not perfectly collinear.
X_logistic = pd.get_dummies(
    X,
    columns=['Smoke_status', 'Ethnicity', 'Alcohol_status', 'type'],
    drop_first=True,
)

# Use a grid search to tune the logistic-regression hyperparameters.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 'saga' is the only solver that supports all of 'l1', 'l2' and 'elasticnet'.
# It only converges quickly on features of similar scale, so the scaler is part
# of the pipeline; this also means scaling is re-fitted on each training fold
# rather than on the full data set.
search_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegression(
        solver='saga',
        class_weight='balanced',
        max_iter=2000,
        random_state=1,
    )),
])

# Candidate inverse regularisation strengths, shared by every penalty.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# 'elasticnet' requires an l1_ratio, so it gets its own sub-grid; 'l1' and 'l2'
# ignore l1_ratio and are searched without it.
param_grid = [
    {'clf__penalty': ['l1', 'l2'], 'clf__C': c_values},
    {
        'clf__penalty': ['elasticnet'],
        'clf__C': c_values,
        'clf__l1_ratio': [0.25, 0.5, 0.75],
    },
]

grid = GridSearchCV(search_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search on the unscaled dummies; the pipeline handles scaling.
grid.fit(X_logistic, y)

# Only the last expression of a cell is displayed, so print the first two.
print('Best parameters:', grid.best_params_)
print('Best cross-validated accuracy: %.3f' % grid.best_score_)

# Best fitted pipeline (refit on the full training data by GridSearchCV).
grid.best_estimator_



In [None]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedStratifiedKFold,
    cross_val_score,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Evaluation procedure: 5-fold stratified CV repeated 3 times, with a fixed
# seed so the fold assignment is reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# Scaling lives inside the pipeline so that, during cross-validation, the
# scaler is fitted on each training fold only. Fitting it on the full data
# beforehand would leak held-out fold statistics into training. It also leaves
# X_logistic untouched, so this cell can be re-run safely.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# it is kept here to preserve the fitted model — confirm against the installed
# version before removing it.
logmodel = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=200),
)

# Cross-validated accuracy of the pipeline predicting case_status.
scores = cross_val_score(logmodel, X_logistic, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Summarise performance as mean (standard deviation) over all folds/repeats.
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

# Refit on the full training data and predict the same rows.
# The metrics below are therefore in-sample and will be optimistic compared
# with the cross-validated accuracy printed above.
logmodel.fit(X_logistic, y)
predictions = logmodel.predict(X_logistic)

# Per-class precision / recall / F1 on the training data.
print(classification_report(y, predictions))

# Confusion matrix on the training data (rows: true class, columns: predicted).
print(confusion_matrix(y, predictions))

# Support-weighted F1 score on the training data.
print(f1_score(y, predictions, average='weighted'))


