In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset = pd.read_excel("BD Kc colorectal_stage ESI.xlsx")
df = dataset.copy()
df.shape

(347, 189)

In [3]:
na = ["?","à compléter", 'A compléter', 'pas de n°tel','à compléter']
for col in df.columns:
    for i in range(df.shape[0]):
        if df[col][i] in na:
            df[col][i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][i] = np.nan


In [4]:
L = ['date_chir', 'D dernière onsultation', 'D contact tél', 
       'Decès date', 'statut_prog', 'Date recidive', 'Evolution recid',
       'remarques','Unnamed: 188']
for col in L:
    del df[col]
    

### Suppression des features qui ne sont pas nécessaires ( avec accord)

In [5]:
L = ['signif Incid Perop','signif CP', 'FistJPostOP',  'signif autres.1',
     'PéritoJPostop', 'signif autres Manifcliniq ', 'signif autres','SténoJPostOP',
    'FRVJPostOP','CStomiJPostOP', 'CHémorJPostOP', 'CUrinairJPostOP' ,   'CUrinairTrait',
     'DélaisPostRCCH','DuréeSéjPostOP','DuréeSéjRéa' ,'DécJPostOP','CauseDécès']
for col in L:
    del df[col]

In [6]:
df.shape

(347, 162)

In [7]:
# Remplacer les valeurs manquantes par NaN.

li_8 = ["atcd chir","Localisation","TDMSiège", "CmplcatParié", "Cplct Générl"]
li_6_8 = ["Type histologique"]


for col in li_8:
    df.loc[df[col] == 8, col] = np.nan

df.loc[df["Décès"] == 9, "Décès"] = np.nan
df.loc[df["trouble transi"] == 2, "trouble transi"] = np.nan
df.loc[df["Type histologique"] == 6 , "Type histologique"] = np.nan
df.loc[df["Type histologique"] == 8, "Type histologique"] = np.nan
df.loc[df["ACE"] == 'inf 0,50', "ACE"] = 0.5


### Remplissage des valeurs manquantes ( pour les complications )

In [8]:
L = ['FistEvolution', 'PéritoEvolution', 'SténoEvoluti','FRVEvolution', 'CStomiEvolut', 'CPérinEvolut', 'CUrinairEvolu' ]
for col in L:
    df.loc[df[col].isna() == False, col] = 1  # évolution
    df[col] = df[col].fillna(0)  # Pas d'évolution

In [9]:
for col in df.columns[df.columns.get_loc("CmplcatPerOP"):df.columns.get_loc("Cplct Générl")+1]:
    df[col] = df[col].fillna(0)

#### Remplissage des NaN pour les features ( avant Morbidité)

### Fillna with mean

In [10]:
numeric_features = ["Age","Hémoglobine","Globule blanc", "Plaquette","CRP" , "Urée" , "Créat" , "Glycémie" , "Albumine" ,
"Préalbumine" , "Protidemie" , "ACE" , "CA19-9"  , 'Histo Nbgg', 'Histo ggPosit']

for col in numeric_features:
    df[col] = df[col].fillna(df[col].mean())
    
    print(df[col].isnull().mean(), end='|')

0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|

###  Vérifier ( DécèsPostOP, Décès ).

In [11]:
df.loc[df['DécèsPostOP'] == 1, 'Décès'] = 1

### Remplacer par les valeurs dominantes ( TNM, Stade - Localisation - Anastomose )

In [12]:
df['Anastomose'] = df['Anastomose'].fillna(2) 
df['Localisation'] = df['Localisation'].fillna(7) 
df['Stade'] = df['Stade'].fillna(2) 
df['TNM'] = df['TNM'].fillna('pT3N0') 

In [13]:
for col in df.columns[df.columns.get_loc('BH'):df.columns.get_loc("TraitAdjAutre")+1]:
    df[col] = df[col].fillna(0)

In [14]:
df.loc[df['HistoLRL'] == 0.9, 'HistoLRL'] = 1

In [15]:
for col in df.columns[df.columns.get_loc('HTA'):df.columns.get_loc("TV result")+1]:
    df[col] = df[col].fillna(0)

In [16]:
df.isnull().mean().sort_values(ascending = False)

Décès             0.561960
Morbidité         0.025937
DécèsPostOP       0.017291
Sexe              0.002882
Date chirurg      0.002882
                    ...   
TNM               0.000000
HistoPerfT/R      0.000000
HistoIntéMes      0.000000
HistoEngainPN     0.000000
signif indicat    0.000000
Length: 162, dtype: float64

In [17]:
# how many total missing values do we have?
total_cells = np.product(df.shape)
missing_values_count = df.isnull().sum()
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

0.3789091685345288


In [18]:
df.shape

(347, 162)

In [19]:
#On remarque que la variable sexe est textuelle, donc on accorde à F -> 1 et M->0
df['Sexe']=np.where(df['Sexe']=="F",1,0)

### let's perform label encoding on TNM
ordinal_label = {k: i for i, k in enumerate(df['TNM'].unique(), 0)}
print(ordinal_label)
#data['TNM'] = data['TNM'].map(ordinal_label)


{'pT4N1Mx': 0, 'ypT3N2a': 1, 'pT3N0': 2, 'pT3N0Mx': 3, 'ypT3N2b': 4, 'pT2N1': 5, 'ypT0N0': 6, 'ypT2N1aMx': 7, 'pT2N0': 8, 'pT2N0Mx': 9, 'pT3N0M0': 10, 'pT3N1bMx': 11, 'pT3N1M1': 12, 'pT3N1M0': 13, 'pT4mN0': 14, 'pT3N1b': 15, 'pT2N1b': 16, 'pT3N0M1a': 17, 'pT4No': 18, 'pT3N1Mx': 19, 'ypT2N1': 20, 'ypT0N0Mx': 21, 'pT4N2a': 22, 'ypT3N0': 23, 'pT4N1b': 24, 'pT4N0Mx': 25, 'ypT0N1b': 26, 'ypT3N1a': 27, 'ypT3N0Mx': 28, 'pT3N0M1': 29, 'ypT4N0': 30, 'ypT2N0': 31, 'ypT2N1M1': 32, 'ypT4bN0': 33, 'ypT3N2': 34, 'ypT0N1Mx': 35, 'pT0N0': 36, 'pT3N1': 37, 'ypT2N1b': 38, 'ypT3N1': 39, 'pT4N0': 40, 'ypT2N0Mx': 41, 'ypT4N0Mx': 42, 'ypT1N0': 43, 'ypT2N1a': 44, 'pT2N1a': 45, 'ypT4N2a': 46, 'pT3N2aMx': 47, 'ypT2N2aMx': 48, 'pT1N0': 49, 'pT3N2b': 50, 'pT4N1': 51, 'ypT4N1b': 52, 'pT3N1a': 53, 'ypT3N0;pT3N1b': 54, 'ypT3N1b': 55, 'pT4N2b': 56, 'pT3N2': 57, 'pT3N0M1b': 58, 'ypT4N2b': 59, 'pT2N2b': 60, 'pT3N2a': 61, 'ypT1N1a': 62, 'pT4N1a': 63}


In [20]:
###performing label encoding on str var
for j in df.columns:
        if df.dtypes[j] == np.object:
            ordinal_label2 = {k: c for c, k in enumerate(df[j].unique(), 0)}
            df[j] = df[j].map(ordinal_label2)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347 entries, 0 to 346
Columns: 162 entries, N to Décès
dtypes: datetime64[ns](1), float64(119), int32(1), int64(41)
memory usage: 437.9 KB


In [21]:
df.loc[df["TRMA"] == -1, "TRMA"] = 0

In [22]:
## Extract the 1st column ( id column)
numPatient = df.iloc[:,0]

In [23]:
X = df.iloc[ : , :110]

In [24]:
#Feature selection
data=X.copy()
for col in numeric_features:
    del data[col]
del data['Date chirurg']
del data['N']

In [25]:
### It will zero variance features
from sklearn.feature_selection import VarianceThreshold
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(data)

VarianceThreshold(threshold=0)

In [26]:
constant_columns = [column for column in data.columns
                    if column not in data.columns[var_thres.get_support()]]

print(len(constant_columns))

1


In [27]:
for feature in constant_columns:
     print(feature)

TraitAdjAutre


In [28]:
data.drop(constant_columns,axis=1, inplace=True)

In [29]:
data

Unnamed: 0,Sexe,HTA,diabète,cardiopathie,obésité,atcd med autre,signif atcd autr,atcd chir,signif atcdchir,ANTCDTs Toxique,...,HistoEmbol,HistoEngainPN,HistoIntéMes,HistoPerfT/R,TNM,Stade,TraitemAdj,TraitAdjChim,signif Chimio,TraitAdjRChm
0,1,0.0,0.0,0.0,0.0,0.0,0,1.0,0,0.0,...,0.0,0.0,0.0,0.0,0,3,1.0,1.0,0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0,1.0,0,0.0,...,2.0,2.0,0.0,0.0,1,3,1.0,1.0,0,0.0
2,1,1.0,0.0,0.0,0.0,0.0,0,3.0,0,0.0,...,0.0,0.0,0.0,0.0,2,2,0.0,0.0,0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,...,2.0,2.0,0.0,2.0,3,2,0.0,0.0,0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0,2.0,0,0.0,...,0.0,0.0,0.0,0.0,2,2,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,1,0.0,0.0,0.0,0.0,1.0,10,1.0,2,0.0,...,2.0,2.0,0.0,2.0,2,2,0.0,0.0,11,0.0
343,1,0.0,0.0,0.0,0.0,0.0,0,1.0,32,0.0,...,2.0,2.0,0.0,2.0,40,2,1.0,1.0,0,0.0
344,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,...,2.0,2.0,0.0,2.0,63,3,1.0,1.0,0,0.0
345,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,...,2.0,2.0,1.0,2.0,6,0,0.0,0.0,0,0.0


In [30]:
quant_data=pd.DataFrame(X, columns=numeric_features)

In [31]:
df=pd.concat([numPatient, quant_data], axis=1)

In [32]:
df=pd.concat([df, data], axis=1)

In [33]:
df

Unnamed: 0,N,Age,Hémoglobine,Globule blanc,Plaquette,CRP,Urée,Créat,Glycémie,Albumine,...,HistoEmbol,HistoEngainPN,HistoIntéMes,HistoPerfT/R,TNM,Stade,TraitemAdj,TraitAdjChim,signif Chimio,TraitAdjRChm
0,1.0,60.000000,11.500000,8170.000000,331000.000000,12.000000,0.360000,10.000000,0.900000,35.513201,...,0.0,0.0,0.0,0.0,0,3,1.0,1.0,0,0.0
1,2.0,80.000000,10.700000,8680.000000,197000.000000,16.000000,0.340000,9.000000,2.310000,35.300000,...,2.0,2.0,0.0,0.0,1,3,1.0,1.0,0,0.0
2,3.0,77.000000,10.400000,3640.000000,147000.000000,28.310638,0.300000,10.000000,2.090000,37.800000,...,0.0,0.0,0.0,0.0,2,2,0.0,0.0,0,0.0
3,4.0,50.000000,16.000000,6890.000000,157000.000000,3.000000,0.520000,10.000000,1.020000,35.513201,...,2.0,2.0,0.0,2.0,3,2,0.0,0.0,0,0.0
4,5.0,68.000000,8.400000,6500.000000,408000.000000,10.000000,0.230000,10.000000,0.850000,40.000000,...,0.0,0.0,0.0,0.0,2,2,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,343.0,56.000000,9.500000,5060.000000,456000.000000,37.000000,0.230000,5.000000,1.070000,39.000000,...,2.0,2.0,0.0,2.0,2,2,0.0,0.0,11,0.0
343,344.0,52.000000,11.700000,7770.000000,362000.000000,20.000000,0.200000,8.000000,1.100000,44.000000,...,2.0,2.0,0.0,2.0,40,2,1.0,1.0,0,0.0
344,345.0,50.000000,11.000000,7480.000000,330000.000000,94.000000,0.110000,5.000000,1.050000,25.000000,...,2.0,2.0,0.0,2.0,63,3,1.0,1.0,0,0.0
345,346.0,33.000000,10.800000,7370.000000,412000.000000,7.000000,0.260000,7.000000,0.890000,34.000000,...,2.0,2.0,1.0,2.0,6,0,0.0,0.0,0,0.0


### Interface graphique

In [34]:
pip install tk

Note: you may need to restart the kernel to use updated packages.


In [48]:
from joblib import dump, load
model_loaded = load('votingp_model_saved.joblib')

In [49]:
def test():
    i=0
    #First condition
    if entry_FId.get() == "":
        print("Patient's ID Field is Empty!!")
        user = "                                   Patient's ID Field is Empty  !!                             "
        Label(win,text=". ",fg="red",bg="lemon chiffon",font = ("Calibri 12 bold")).place(x=12,y=460)
        Label(win,text=user,fg="blue",bg="lemon chiffon",font = ("Calibri 12 bold")).place(x=12,y=460)
        i=1
    #Second condition
    if not int(entry_FId.get()) in numPatient.unique().tolist():
        print("Patient's ID doesn't exist!!")
        user = "                                   Patient's ID doesn't exist   !!                              "
        Label(win,text=". ",fg="red",bg="lemon chiffon",font = ("Calibri 12 bold")).place(x=12,y=460)
        Label(win,text=user,fg="blue",bg="lemon chiffon",font = ("Calibri 12 bold")).place(x=12,y=460)
        i=1
    if i == 0:
        
        print("Testing...")
    # Test code will go here....
        dimension=(40, 40)
        flat_data = df.loc[df['N'] == int(entry_FId.get())]
        test=flat_data.iloc[:,1:]

        result = model_loaded.predict(test)
    # print(result)
        if result == 1:
            print("Morbidité")
            person = entry_FId.get()
            user = 'Patient N° '+ person + " aura des complications pendant l'opération  !!!         "
            a = user
            Label(win,text="                                                                            ",fg="red",bg="white",font = ("Calibri 12 bold")).place(x=12,y=460)
            Label(win,text=user,fg="red",bg="white",font = ("Calibri 12 bold")).place(x=12,y=460)
            MsgBox = tk.messagebox.showwarning ('warning','Morbidité = 1 !!',icon = 'warning')
        else:
            print("Pas de morbidité")
            person = entry_FId.get()
            user = 'Patient N° '+ person + " n'aura pas des complications pendant l'opération!!! "
            a = user
            Label(win,text=". ",fg="red",bg="white",font = ("Calibri 12 bold")).place(x=12,y=460)
            Label(win,text=user,fg="blue",bg="yellow",font = ("Calibri 12 bold")).place(x=12,y=460)
            MsgBox = tk.messagebox.showinfo ('information','Morbidité = 0 !!')


In [50]:
import tkinter as tk
from tkinter import *
from tkinter import messagebox
 
win =  Tk()

win.geometry("520x520")
win.configure(background="LightSkyBlue1")
win.title("cancer colorectal prediction")


title = Label(win,text="Prediction",bg="LightSteelBlue4",width="300",height="2",fg="White",font = ("Calibri 20 bold italic underline")).pack()
Label(win, text="Consulter les prévisions pour le patient N°: ", bg="LightSkyBlue1",font = ("Calibri 15")).place(x=12,y=150)
FId = Label(win, text="N° du patient : ",bg="LightSkyBlue1",font = ("Verdana 12")).place(x=60,y=220)
gap = Label(win,text="",bg="LightSkyBlue1").pack()

entry_FId = Entry(win,textvariable = FId,width=45)
entry_FId.place(x=200,y=222)

submit = Button(win, text="Voir les prévisions", width="54",height="2",activebackground="violet", bg="seashell2",command = test,font = ("Calibri 12 bold ")).place(x=40, y=300)

win.resizable(width="false", height="false") # false to prevent resizing

win.mainloop()

Testing...
Pas de morbidité
Testing...
Morbidité
