In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

%matplotlib inline

In [2]:
columns = ["Age", "Sex", "On Thyroxine", "Query on Thyroxine", 
           "On Antithyroid Medication", "Sick", "Pregnant", 
           "Thyroid Surgery", "I131 Treatment", "Query Hypothyroid", 
           "Query Hyperthyroid", "Lithium", "Goitre", "Tumor", 
           "Hypopituitary", "Psych", "TSH Measured", "TSH", "T3 Measured", 
           "T3", "TT4 Measured", "TT4", "T4U Measured", "T4U", 
           "FTI Measured", "FTI", "TBG Measured", "TBG", "Referral Source", "Category"]

In [3]:
hyper_data = pd.read_csv(r"C:\Users\hnrne\Downloads\allhyperdata.csv", names=columns)
hypo_data = pd.read_csv(r"C:\Users\hnrne\Downloads\allhypodata.csv", names=columns)
sick_data = pd.read_csv(r"C:\Users\hnrne\Downloads\sickdata.csv", names=columns)

In [4]:
def parse_row(row):
    row = row.split(".")[0]
    return row

def to_hyperthyroid(row):
    if row != "negative":
        row = "hyperthyroid"
    return row

def to_hypothyroid(row):
    if row != "negative":
        row = "hypothyroid"
    return row

def convert_category(dataframe, column):
    
    if column == 'Sex':
        conditionF = dataframe[column] == 'F' # For sex column
        conditionT = dataframe[column] == 'M' # For sex column
    else:
        conditionF = dataframe[column] == 'f'
        conditionT = dataframe[column] == 't'
    
    dataframe.loc[conditionF, column] = 0
    dataframe.loc[conditionT, column] = 1

In [5]:
hyper_data['Category'] = hyper_data['Category'] \
                            .apply(parse_row) \
                            .apply(to_hyperthyroid)

hypo_data['Category'] = hypo_data['Category'] \
                            .apply(parse_row) \
                            .apply(to_hypothyroid)

sick_data['Category'] = sick_data['Category'] \
                            .apply(parse_row)

In [6]:
thyroid_frames = [hyper_data, hypo_data, sick_data]
thyroid_data1 = pd.concat(thyroid_frames) \
                 .drop_duplicates() \
                 .drop(['Referral Source', 'TBG', 'TBG Measured'], axis=1)

In [7]:
thyroid_data=thyroid_data1

In [8]:
thyroid_data

Unnamed: 0,Age,Sex,On Thyroxine,Query on Thyroxine,On Antithyroid Medication,Sick,Pregnant,Thyroid Surgery,I131 Treatment,Query Hypothyroid,...,TSH,T3 Measured,T3,TT4 Measured,TT4,T4U Measured,T4U,FTI Measured,FTI,Category
0,41,F,f,f,f,f,f,f,f,f,...,1.3,t,2.5,t,125,t,1.14,t,109,negative
1,23,F,f,f,f,f,f,f,f,f,...,4.1,t,2,t,102,f,?,f,?,negative
2,46,M,f,f,f,f,f,f,f,f,...,0.98,f,?,t,109,t,0.91,t,120,negative
3,70,F,t,f,f,f,f,f,f,f,...,0.16,t,1.9,t,175,f,?,f,?,negative
4,70,F,f,f,f,f,f,f,f,f,...,0.72,t,1.2,t,61,t,0.87,t,70,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774,82,M,f,f,f,f,f,f,f,f,...,2.2,t,1,t,68,t,0.77,t,88,sick
2776,79,M,f,f,f,f,f,f,f,f,...,1.1,t,0.7,t,116,t,0.85,t,137,sick
2782,50,F,f,f,f,f,f,f,f,t,...,4.8,t,0.6,t,98,t,0.8,t,122,sick
2786,73,?,f,f,f,f,f,f,f,f,...,0.015,t,1.8,t,173,t,1,t,173,sick


In [10]:
# Binarize Category Columns
binary_cols = ['On Thyroxine', 'Query on Thyroxine', 'Sex',
               'On Antithyroid Medication', 'Sick', 'Pregnant', 
               'Thyroid Surgery', 'I131 Treatment', 'Query Hypothyroid', 
               'Query Hyperthyroid', 'Lithium', 'Goitre', 'Tumor', 
               'Hypopituitary', 'Psych', 'TSH Measured', 'T3 Measured', 
               'TT4 Measured', 'T4U Measured', 'FTI Measured']

for col in binary_cols: convert_category(thyroid_data, col)

In [11]:
# Convert '?' to np.nan and convert numeric data to numeric dtype
for col in thyroid_data.columns: 
    if col != 'Category':
        thyroid_data.loc[thyroid_data[col] == '?', col] = np.nan
        thyroid_data[col] = pd.to_numeric(thyroid_data[col])

In [12]:
curr_columns = thyroid_data.columns.difference(['Category'])

imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputed_data = imputer.fit_transform(thyroid_data.drop('Category', axis=1))
imputed_data = pd.DataFrame(imputed_data, columns=curr_columns)

thyroid_data = pd.concat([
                    imputed_data.reset_index(), 
                    thyroid_data['Category'].reset_index()], 
                    axis=1).drop('index', axis=1)

In [13]:
X = thyroid_data[['T3 Measured','T4U Measured' ,'Tumor','TSH Measured' ,'TT4 Measured','Age','FTI Measured','FTI' ,'Pregnant',
'T4U']]
y = thyroid_data['Category']

col_names = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train))
X_test = pd.DataFrame(scaler.transform(X_test))

In [14]:
X

Unnamed: 0,T3 Measured,T4U Measured,Tumor,TSH Measured,TT4 Measured,Age,FTI Measured,FTI,Pregnant,T4U
0,1.300,2.5,109.0,125.0,1.14,41.0,0.0,0.0,0.0,1.0
1,4.100,2.0,106.0,102.0,0.97,23.0,0.0,0.0,0.0,1.0
2,0.980,1.9,120.0,109.0,0.91,46.0,0.0,1.0,0.0,0.0
3,0.160,1.9,106.0,175.0,0.97,70.0,1.0,0.0,0.0,1.0
4,0.720,1.2,70.0,61.0,0.87,70.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
3216,2.200,1.0,88.0,68.0,0.77,82.0,0.0,1.0,0.0,1.0
3217,1.100,0.7,137.0,116.0,0.85,79.0,0.0,1.0,0.0,1.0
3218,4.800,0.6,122.0,98.0,0.80,50.0,0.0,0.0,1.0,1.0
3219,0.015,1.8,173.0,173.0,1.00,73.0,0.0,0.0,0.0,1.0


In [15]:
thyroid_data.shape

(3221, 27)

In [16]:
thyroid_data.dtypes

Age                          float64
FTI                          float64
FTI Measured                 float64
Goitre                       float64
Hypopituitary                float64
I131 Treatment               float64
Lithium                      float64
On Antithyroid Medication    float64
On Thyroxine                 float64
Pregnant                     float64
Psych                        float64
Query Hyperthyroid           float64
Query Hypothyroid            float64
Query on Thyroxine           float64
Sex                          float64
Sick                         float64
T3                           float64
T3 Measured                  float64
T4U                          float64
T4U Measured                 float64
TSH                          float64
TSH Measured                 float64
TT4                          float64
TT4 Measured                 float64
Thyroid Surgery              float64
Tumor                        float64
Category                      object
d

In [23]:
thyroid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        3221 non-null   float64
 1   FTI                        3221 non-null   float64
 2   FTI Measured               3221 non-null   float64
 3   Goitre                     3221 non-null   float64
 4   Hypopituitary              3221 non-null   float64
 5   I131 Treatment             3221 non-null   float64
 6   Lithium                    3221 non-null   float64
 7   On Antithyroid Medication  3221 non-null   float64
 8   On Thyroxine               3221 non-null   float64
 9   Pregnant                   3221 non-null   float64
 10  Psych                      3221 non-null   float64
 11  Query Hyperthyroid         3221 non-null   float64
 12  Query Hypothyroid          3221 non-null   float64
 13  Query on Thyroxine         3221 non-null   float

In [17]:
thyroid_data.head()

Unnamed: 0,Age,FTI,FTI Measured,Goitre,Hypopituitary,I131 Treatment,Lithium,On Antithyroid Medication,On Thyroxine,Pregnant,...,T3 Measured,T4U,T4U Measured,TSH,TSH Measured,TT4,TT4 Measured,Thyroid Surgery,Tumor,Category
0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.3,1.0,2.5,1.0,125.0,1.0,1.14,1.0,109.0,negative
1,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.1,1.0,2.0,1.0,102.0,0.0,0.97,0.0,106.0,negative
2,46.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.98,0.0,1.9,1.0,109.0,1.0,0.91,1.0,120.0,negative
3,70.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.16,1.0,1.9,1.0,175.0,0.0,0.97,0.0,106.0,negative
4,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.72,1.0,1.2,1.0,61.0,1.0,0.87,1.0,70.0,negative


In [18]:
thyroid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        3221 non-null   float64
 1   FTI                        3221 non-null   float64
 2   FTI Measured               3221 non-null   float64
 3   Goitre                     3221 non-null   float64
 4   Hypopituitary              3221 non-null   float64
 5   I131 Treatment             3221 non-null   float64
 6   Lithium                    3221 non-null   float64
 7   On Antithyroid Medication  3221 non-null   float64
 8   On Thyroxine               3221 non-null   float64
 9   Pregnant                   3221 non-null   float64
 10  Psych                      3221 non-null   float64
 11  Query Hyperthyroid         3221 non-null   float64
 12  Query Hypothyroid          3221 non-null   float64
 13  Query on Thyroxine         3221 non-null   float

In [19]:
X = thyroid_data[['T3 Measured','T4U Measured' ,'Tumor','TSH Measured' ,'TT4 Measured','Age','FTI Measured','FTI' ,'Pregnant',
'T4U']]
y = thyroid_data['Category']

In [20]:
y

0       negative
1       negative
2       negative
3       negative
4       negative
          ...   
3216        sick
3217        sick
3218        sick
3219        sick
3220        sick
Name: Category, Length: 3221, dtype: object

In [21]:
from sklearn import preprocessing

In [22]:
l=preprocessing.LabelEncoder()

In [23]:
y

0       negative
1       negative
2       negative
3       negative
4       negative
          ...   
3216        sick
3217        sick
3218        sick
3219        sick
3220        sick
Name: Category, Length: 3221, dtype: object

In [25]:
l=l.fit(y)

In [26]:
y=l.transform(y)

In [27]:
y

array([2, 2, 2, ..., 3, 3, 3])

In [28]:
y=list(y)

In [29]:
y

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)


In [31]:
X_train

Unnamed: 0,T3 Measured,T4U Measured,Tumor,TSH Measured,TT4 Measured,Age,FTI Measured,FTI,Pregnant,T4U
499,0.40,1.0,149.0,154.0,1.03,65.0,0.0,1.0,0.0,1.0
1657,0.02,2.9,78.0,118.0,1.52,27.0,0.0,0.0,0.0,1.0
611,0.85,2.0,111.0,103.0,0.93,53.0,0.0,0.0,0.0,1.0
1205,0.05,1.9,217.0,230.0,1.06,56.0,0.0,0.0,0.0,0.0
2536,0.03,2.0,178.0,164.0,0.92,59.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1090,0.60,2.4,116.0,137.0,1.18,74.0,0.0,0.0,0.0,1.0
818,3.60,2.3,106.0,114.0,0.97,41.0,0.0,0.0,0.0,1.0
2564,0.05,1.9,114.0,105.0,0.92,50.0,1.0,0.0,0.0,0.0
765,2.40,2.7,109.0,155.0,1.43,37.0,0.0,0.0,0.0,1.0


In [32]:
from sklearn.linear_model import LogisticRegression  
classifier= LogisticRegression(random_state=0)  
classifier.fit(X_train, y_train)  
y_pred= classifier.predict(X_test)  
y_true=y_test

In [33]:
print(classification_report(y_true, y_pred))
print(accuracy_score(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.50      0.05      0.10        19
           1       0.60      0.05      0.10        55
           2       0.86      0.99      0.92       689
           3       0.62      0.12      0.20        43

    accuracy                           0.86       806
   macro avg       0.65      0.30      0.33       806
weighted avg       0.82      0.86      0.81       806

0.858560794044665


In [55]:
# save the model to disk
import pickle
filename = 'Our_Trained_logistic1_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [56]:
modelfile=r'C:\Users\hnrne\Our_Trained_logistic1_model.sav'
model = pickle.load(open(modelfile, 'rb'))

In [57]:
#0.80 3.8 100.0 193.0 1.93 28.0 0.0 0.0 0.0 1.0

#model.predict([[0.80,3.8,100.0,193.0,1.93,28.0,0.0,0.0,0.0,1.0]])
a=model.predict([[0.80,3.8,100.0,193.0,1.93,28.0,0.0,0.0,0.0,1.0]])

In [58]:

l.inverse_transform(a)

array(['negative'], dtype=object)

In [61]:
pickling_on = open("label.pickle","wb")
pickle.dump(l, pickling_on)

In [57]:
l.classes_

array(['hyperthyroid', 'hypothyroid', 'negative', 'sick'], dtype=object)

In [60]:
 d={0:"hyperthyroid", 1:"hypothyroid", 2:"negative", 3:"sick"}

In [62]:
d[0]

'hyperthyroid'

In [65]:
pickling_on = open("y.pickle","wb")
pickle.dump(y, pickling_on)