In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor
import pickle

def get_clean_data():
    data = pd.read_csv("Thyroid_Diff.csv")
    data["Gender"] = data["Gender"].map({"M": 1, "F": 2})
    data["Smoking"] = data["Smoking"] .map({"Yes": 1, "No": 2})
    data["Hx Smoking"] = data["Hx Smoking"].map({"Yes": 1, "No": 2})
    data["Hx Radiothreapy"] = data["Hx Radiothreapy"].map({"Yes": 1, "No": 2})
    data["Thyroid Function"] = data["Thyroid Function"].map({
        "Euthyroid": 1,
        "Clinical Hyperthyroidism": 2,
        "Subclinical Hypothyroidism": 3,
        "Clinical Hypothyroidism": 4,
        "Subclinical Hyperthyroidism": 5})
    data["Physical Examination"] = data["Physical Examination"].map({
        "Multinodular goiter": 1,
        "Single nodular goiter-right": 2,
        "Single nodular goiter-left": 3,
        "Normal": 4,
        "Diffuse goiter": 5})
    data["Adenopathy"] = data["Adenopathy"].map({
        "Posterior": 1,
        "Extensive": 3,
        "No": 2,
        "Left": 4,
        "Bilateral": 5,
        "Right": 6})
    data["Pathology"] = data["Pathology"].map({
        "Papillary": 1,
        "Micropapillary": 2,
        "Follicular": 3,
        "Hurthel cell": 4})
    data["Focality"] = data["Focality"].map({"Uni-Focal": 1, "Multi-Focal": 2})
    data["Risk"] = data["Risk"].map({"Low": 1, "Intermediate": 2, "High": 3})
    data["Response"] = data["Response"].map({
        "Excellent": 1,
        "Structural Incomplete": 2,
        "Indeterminate": 3,
        "Biochemical Incomplete": 4})
    data["Recurred"] = data["Recurred"].map({"Yes": 1, "No": 2})
    data["N"] = data["N"].map({"N0": 1, "N1b": 2, "N1a": 3})
    data["M"] = data["M"].map({"M0": 1, "M1": 2})
    data["Stage"] = data["Stage"].map({
        "I": 1,
        "II": 2,
        "IVB": 3,
        "III": 4,
        "IVA": 5})
    data["T"] = data["T"].map({"T2": 1,
                               "T3a": 2,
                               "T1a": 3,
                               "T1b": 4,
                               "T4a": 5,
                               "T3b": 6,
                               "T4b": 7})
    return data



def create_model(data):
    X = data.drop(["Risk"],axis=1)
    y = data["Risk"]
    #scale the data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    #split the data
    X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.30,random_state=42)
    #train the model
    model = LogisticRegression()
    model.fit(X_train,y_train)
    #testing the model
    y_pred = model.predict(X_test)
    print('Accuracy of our model: ', accuracy_score(y_test, y_pred))
    print("Classification report: \n", classification_report(y_test, y_pred))

    return model,scaler

def main():
  data = get_clean_data()

  model, scaler = create_model(data)

  with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
    
  with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
  

if __name__ == '__main__':
    main()

Accuracy of our model:  0.8956521739130435
Classification report: 
               precision    recall  f1-score   support

           1       0.94      0.95      0.94        78
           2       0.78      0.78      0.78        27
           3       0.89      0.80      0.84        10

    accuracy                           0.90       115
   macro avg       0.87      0.84      0.85       115
weighted avg       0.90      0.90      0.90       115

