<a href="https://colab.research.google.com/github/jrakhshanda/Bayesian-Methods/blob/master/Association_rule_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Association Rule Mining

In [3]:
!pip install apyori

In [94]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from apyori import apriori
import re

import pickle
import os
from os.path import isfile, join
import string
from ast import literal_eval
from google.colab import files

In [25]:
def save_object(path, filename, obj):
    print('Saving Object')
    path_file = join(path, filename)    
    pickle.dump(obj, open(path_file, 'wb'))
    print('Save complete')

def load_object(path, filename):
    print('Attempting to Load Object')
    path_file = join(path, filename)    
    obj = pickle.load(open(path_file, "rb" ))
    print('Load complete')
    return obj

def save_csv(path, filename, dataframe):
    print('Saving Dataframe to CSV')
    path_file = join(path, filename)    
    dataframe.to_csv(path_file, index=False)

def load_csv(path, filename):
    print('Loading Dataframe From CSV')
    path_file = join(path, filename)
    dataframe = pd.read_csv(path_file)
    return dataframe

In [6]:
def flatten(input_list):
    '''
    A function to flatten complex list.
    :param input_list: The list to be flatten
    :return: the flattened list.
    '''
    flat_list = []
    for i in input_list:
        if type(i) == list:
            flat_list += flatten(i)
        else:
            flat_list += [i]

    return flat_list

# Data and Pre processing

In [95]:
df = pd.read_csv('/content/drive/MyDrive/cancer_data/gene_cancer.csv')
df.head()

Unnamed: 0,PMID,Gene,Cancer,cancer_association,gene_association
0,33609553,"['hMC4R', 'hMC5R', 'MSH', 'receptors', 'melano...","['skin cancer', 'melanoma']","['melanoma', 'skin-cancer']","['hMC5R', 'MSH', 'melanocortin', 'hMC4R']"
1,33609447,"['BRCA2', 'VUSs']",['vus'],['vus'],"['BRCA2', 'VUSs']"
2,33608585,['UPR'],"['prostate cancer', 'pca']",['prostate-cancer'],['UPR']
3,33607478,"['Synaptophysin', 'synaptophysin']","['colorectal cancer', 'epithelial cancer', 'hi...","['colorectal-cancer', 'epithelial-cancer', 'mu...","['synaptophysin', 'Synaptophysin']"
4,33606355,"['BRCA1', 'BARD1', 'containing', 'RACK1', 'Aur...","['breast cancer', 'ovarian cancer']","['breast-cancer', 'ovarian-cancer']","['Aurora', 'BRCA1', 'containing', 'RACK1', 'BA..."


In [99]:
syno = pd.read_csv('/content/synonom_genes.txt',delimiter='\t')
syno.columns = ['synonm','gene']
syno['gene'] = syno['gene'].apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
syno['gene'] = syno['gene'].astype('string')
syno['synonm'] = syno['synonm'].astype('string')

In [None]:
#save_object('/content/drive/MyDrive/cancer_data','gene_dictionary',dictionary)
dictionary = load_object('/content/drive/MyDrive/cancer_data','gene_dictionary')

In [114]:
gene = [literal_eval(x) for x in df['Gene']]
gene = [[x for x in sublist if (x in dictionary)] for sublist in gene]

In [122]:
abr = pd.read_csv('/content/drive/MyDrive/cancer_data/cancer_abreviations')
abr.head()

Unnamed: 0,Abbreviation,Long_Form
0,NAD(+,Nicotinamide adenine dinucleotide
1,hMC1R,human melanocortin 1 receptor
2,UV,ultraviolet radiation
3,HDR,homology-directed DNA repair
4,PCa,Prostate cancer


In [159]:
def Filter(list1, list2): 
    return [x for x in list1 if all(y not in x for y in list2)]
stops = ['tumor','tumours','tumour','tumors','anti','cell','tissue','line','vus','biopsy','therapy','rt sns','specimen','deficient','raid','mutate','iii','ii','stage','poly',
         'related','brca','proficient','high','grade','xenografts','resistant']
stop_words = ['cancer','breast','ovarian','left kidney','ovarian']

In [160]:
cancer = [literal_eval(x) for x in df['cancer_association']]
cancer = [list(set(sublist)) for sublist in cancer]
cancer = [[x.replace('-', ' ') for x in sublist] for sublist in cancer]
cancer = [[x.replace('cancers', 'cancer') for x in sublist] for sublist in cancer]
cancer = [[re.sub("\S*\d\S*", "", x).strip() for x in sublist] for sublist in cancer] # removing words which contains difits
cancer = [[x for x in sublist if x] for sublist in cancer]
cancer = [Filter(x,stops) for x in cancer] 

In [161]:
cancer

[['melanoma', 'skin cancer'],
 [],
 ['prostate cancer'],
 ['epithelial cancer', 'colorectal cancer', 'mucinous adenocarcinoma'],
 ['breast cancer', 'ovarian cancer'],
 ['escc biopsies', 'escc', 'esophageal carcinoma'],
 ['mucinous', 'colorectal cancer'],
 ['bladder cancer'],
 ['nasopharyngeal carcinoma', 'npc'],
 ['gastric cancer'],
 ['gastric carcinoma'],
 ['ovarian cancer', 'breast cancer', 'prostate cancer', 'breast ovary cancer'],
 [],
 [],
 ['pdac', 'pancreatic ductal adenocarcinoma'],
 [],
 ['rcc', 'kidney cancer', 'renal cancer'],
 ['colorectal cancer', 'metastatic colorectal cancer'],
 ['mcrpc', 'prostate cancer'],
 ['colorectal cancer'],
 ['histotypes', 'endometrial cancer', 'endometrial cancer'],
 ['urothelial carcinoma',
  'utuc',
  'genitourinary malignancy',
  'upper tract uc'],
 ['lung cancer', 'lung adenocarcinoma'],
 ['cervical cancer', 'breast cancer'],
 ['ovarian'],
 ['triple negative breast cancer',
  'breast cancer',
  'benign breast disease',
  'triple negative bre

In [9]:
records = []
for i in range(0,len(gene)):
    records.append([gene[i],cancer[i]])
data = [flatten(x) for x in records]

# Apriori Algorithm

In [None]:
association_rules = apriori(data, min_support=0.004, min_confidence=0.3, min_lift=5, min_length=2)
association_results = list(association_rules)

In [None]:
len(association_results)

157

### Display Rule, Suuport and confidence

In [None]:
table= {'Entity1':[],'Entity2':[],'Rule':[],'Support':[],'Confidence':[],'Lift':[]}
for item in association_results:
    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])
    table['Rule'].append(items[0] + " -> " + items[1])
    table['Entity1'].append(items[0])
    table['Entity2'].append(items[1])

    #second index of the inner list
    print("Support: " + str(item[1]))
    table['Support'].append(item[1])
    #third index of the list located at 0th
    #of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    table['Confidence'].append(item[2][0][2])
    table['Lift'].append(item[2][0][3])
    print("=====================================")

Rule: ATM -> ATR
Support: 0.005197698162242435
Confidence: 0.5511811023622047
Lift: 12.402725975042593
Rule: Arg280His -> Arg194Trp
Support: 0.0042695377761277145
Confidence: 0.41666666666666663
Lift: 83.13271604938271
Rule: Arg399Gln -> Arg194Trp
Support: 0.008130684982364952
Confidence: 0.7934782608695652
Lift: 50.16980506225761
Rule: XRCC1 -> Arg194Trp
Support: 0.010135511416372749
Confidence: 0.9891304347826085
Lift: 23.2075159066808
Rule: Arg280His -> Arg399Gln
Support: 0.004380917022461481
Confidence: 0.874074074074074
Lift: 55.26569292296992
Rule: Arg280His -> XRCC1
Support: 0.004937813254130314
Confidence: 0.9851851851851853
Lift: 23.114950316169832
Rule: Arg399Gln -> XRCC1
Support: 0.015704473733061074
Confidence: 0.9929577464788734
Lift: 23.297314374049176
Rule: BRAF -> KRAS
Support: 0.00727677742713941
Confidence: 0.3805825242718447
Lift: 25.82113423491722
Rule: BRCA2 -> BRCA1
Support: 0.033673658808242066
Confidence: 0.44157740993184036
Lift: 8.662700317927253
Rule: BRCA1 -

In [None]:
relations.drop(relations[(relations.Entity1 == 'breast') | (relations.Entity2 == 'breast')].index)
relations.sample(7)

Unnamed: 0,Entity1,Entity2,Rule,Support,Confidence,Lift
145,RAD51,BRCA2,RAD51 -> BRCA2,0.004047,0.495455,9.719642
29,BRCA2,breast,BRCA2 -> breast,0.004529,0.332425,6.521391
172,colorectal-carcinoma,MSH2,colorectal-carcinoma -> MSH2,0.003527,0.703704,12.065092
35,XPF,ERCC1,XPF -> ERCC1,0.003341,0.545455,14.389636
210,colorectal-cancer,MSH2,colorectal-cancer -> MSH2,0.011992,0.361702,11.780468
202,endometrial-cancer,MSH2,endometrial-cancer -> MSH2,0.0062,0.307551,9.004214
61,colorectal-cancer,MLH1,colorectal-cancer -> MLH1,0.036198,0.534539,4.200064


In [None]:
# from google.colab import files
# relations.to_csv('relations.csv')
# files.download('relations.csv')