<a href="https://colab.research.google.com/github/jrbobes/jrbobes-hitl-ml/blob/master/PAAD_RuleEngine_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **PAAD - RuleEngine - PoC**

author: jose.bobes at udc.es

# Importing required libraries

In [1]:
# Importing Required Libraries
import sys
print('Python: {}'.format(sys.version))

import scipy
print('scipy: {}'.format(scipy.__version__))

import numpy
print('numpy: {}'.format(numpy.__version__))

import matplotlib
import matplotlib.pyplot as plt
print('matplotlib: {}'.format(matplotlib.__version__))

import pandas as pd
print('pandas: {}'.format(pd.__version__))





Python: 3.8.16 (default, Dec  7 2022, 01:12:13) 
[GCC 7.5.0]
scipy: 1.7.3
numpy: 1.21.6
matplotlib: 3.2.2
pandas: 1.3.5


# Preparing the Pancreas data

In [23]:
# Loading Data
url = "https://raw.githubusercontent.com/jrbobes/jrbobes-hitl-ml/master/Libro1_Adapted.csv"
columns = ['ns10:file_uuid','ns2:gender', 'ns5:race', 'ns5:age_at_initial_pathologic_diagnosis', 'ns3:pathologic_stage', 'ns3:pathologic_T', 'ns3:pathologic_N', 'ns3:pathologic_M', 'ns5:vital_status', 'ns5:days_to_death', 'therapy_type']
headerNames = ['file_uuid','gender', 'race', 'age_at_initial_pathologic_diagnosis', 'pathologic_stage', 'pathologic_T', 'pathologic_N', 'pathologic_M', 'vital_status', 'days_to_death', 'therapy_type']

dataset = pd.read_csv(url, sep=';', usecols=columns, encoding = "ISO-8859-1")
dataset.columns= headerNames # Columns name update to remove the semicolon.

print(dataset.shape)
print(dataset.head())

(205, 11)
                              file_uuid  gender                       race  \
0  1AF64746-0F41-408F-9CB3-D567BEBA1217  FEMALE                      WHITE   
1  0D7B23AC-0988-4E95-9AF3-82180A1DAB0A  FEMALE                      WHITE   
2  290AC731-F653-432D-8C35-8E6C178BD2C6  FEMALE                      WHITE   
3  E2FC1DD6-0B4D-409D-9D1C-0F8E0ADB6113  FEMALE  BLACK OR AFRICAN AMERICAN   
4  40A76730-988D-4FF1-A17A-91AA85DD7C76    MALE                      WHITE   

   age_at_initial_pathologic_diagnosis pathologic_stage pathologic_T  \
0                                   53        Stage IIB           T3   
1                                   54        Stage IIA           T3   
2                                   58         Stage IV           T3   
3                                   66        Stage IIA           T3   
4                                   64        Stage IIA           T3   

  pathologic_N pathologic_M vital_status  days_to_death  therapy_type  
0           N1  

In [25]:
# Feature Selection
# Split dataset in features and target variables (REF: https://www.datacamp.com/tutorial/decision-tree-classification-python)
feature_cols = ['gender', 'race', 'age_at_initial_pathologic_diagnosis', 'pathologic_stage', 'pathologic_T', 'pathologic_N', 'pathologic_M']

X = dataset[feature_cols] # Features
X_encoded = pd.get_dummies(X)
y = dataset.therapy_type # Target variable

# Installing the rules engine

In [4]:
!pip install rule_engine

import rule_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rule_engine
  Downloading rule-engine-3.5.0.tar.gz (40 kB)
[K     |████████████████████████████████| 40 kB 3.3 MB/s 
[?25hCollecting ply>=3.9
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 5.5 MB/s 
Building wheels for collected packages: rule-engine
  Building wheel for rule-engine (setup.py) ... [?25l[?25hdone
  Created wheel for rule-engine: filename=rule_engine-3.5.0-py3-none-any.whl size=44366 sha256=aea5dcc54aafb6f292fdf9210de98284c56e0e0abf84ba8b8eac50063d348bca
  Stored in directory: /root/.cache/pip/wheels/d6/25/45/26c5f3f9fe4c0094cd95b32ebc40bd315cf803c8c1aaaecbc1
Successfully built rule-engine
Installing collected packages: ply, rule-engine
Successfully installed ply-3.11 rule-engine-3.5.0


# Explore the data

In [26]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   file_uuid                            205 non-null    object 
 1   gender                               205 non-null    object 
 2   race                                 200 non-null    object 
 3   age_at_initial_pathologic_diagnosis  205 non-null    int64  
 4   pathologic_stage                     202 non-null    object 
 5   pathologic_T                         205 non-null    object 
 6   pathologic_N                         205 non-null    object 
 7   pathologic_M                         205 non-null    object 
 8   vital_status                         205 non-null    object 
 9   days_to_death                        67 non-null     float64
 10  therapy_type                         205 non-null    object 
dtypes: float64(1), int64(1), object(

In [29]:
print(dataset['days_to_death'])

print("Age at initial diagnosis MEAN: " + str(dataset['age_at_initial_pathologic_diagnosis'].mean()))

0       486.0
1         NaN
2         NaN
3      1332.0
4       117.0
        ...  
200     394.0
201       NaN
202     110.0
203     741.0
204     123.0
Name: days_to_death, Length: 205, dtype: float64
Age at initial diagnosis MEAN: 64.8


# Create Pancreatic Adenocarcinoma NCCN Guidelines rules

In [43]:
# Establish a context
#paadRulesContext = rule_engine.Context(default_value=None)

# Declare all the rules
ruleChemotherapy = rule_engine.Rule(
  'pathologic_T == "N1"'
)

ruleOther = rule_engine.Rule(
  'pathologic_T == "T0"'
)

In [50]:
# Compose rules

# We use a dictionary to fit into the matches function of the Rule Engine library
dictionaryset = dataset.to_dict('records')
#print(dictionaryset)

#print('Chemotherapy results: ')
#for row in rulePan1.filter(dictionaryset):
#		print(row)
  

for idx,x in enumerate(dictionaryset):
  if ruleChemotherapy.matches(dictionaryset[idx]):
    print(dictionaryset[idx])
    print(str(idx) + " -> Chemotherapy")
  elif ruleOther.matches(dictionaryset[idx]):
    print(str(idx) + " -> Other")
  else:
    print("N/A")

{'file_uuid': '1AF64746-0F41-408F-9CB3-D567BEBA1217', 'gender': 'FEMALE', 'race': 'WHITE', 'age_at_initial_pathologic_diagnosis': 53, 'pathologic_stage': 'Stage IIB', 'pathologic_T': 'T3', 'pathologic_N': 'N1', 'pathologic_M': 'M0', 'vital_status': 'Dead', 'days_to_death': 486.0, 'therapy_type': 'Other'}
0 -> Chemotherapy
{'file_uuid': '0D7B23AC-0988-4E95-9AF3-82180A1DAB0A', 'gender': 'FEMALE', 'race': 'WHITE', 'age_at_initial_pathologic_diagnosis': 54, 'pathologic_stage': 'Stage IIA', 'pathologic_T': 'T3', 'pathologic_N': 'N0', 'pathologic_M': 'MX', 'vital_status': 'Alive', 'days_to_death': nan, 'therapy_type': 'Other'}
1 -> Chemotherapy
{'file_uuid': '290AC731-F653-432D-8C35-8E6C178BD2C6', 'gender': 'FEMALE', 'race': 'WHITE', 'age_at_initial_pathologic_diagnosis': 58, 'pathologic_stage': 'Stage IV', 'pathologic_T': 'T3', 'pathologic_N': 'N0', 'pathologic_M': 'M1', 'vital_status': 'Alive', 'days_to_death': nan, 'therapy_type': 'Other'}
2 -> Chemotherapy
{'file_uuid': 'E2FC1DD6-0B4D-40