<a href="https://colab.research.google.com/github/kxk302/HIV/blob/main/HIV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount the user's Google Drive under /content/gdrive.
# The google.colab package is provided by the Google Colab runtime, so this
# cell is expected to work only when the notebook is run inside Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [15]:
# List the 'Colab Notebooks' folder of the mounted Drive to check what is available there.
!ls '/content/gdrive/MyDrive/Colab Notebooks'

HIV  HIV_V3_codonmsa_macse_pre.tsv  MBA.ipynb


In [26]:
import math

import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Create a string representing nucleotide plus position
# Pivot the data so we have all sample strings on a single line
#
def preprocess_input_file(df_in):
  """Pivot long-format nucleotide rows into a one-hot sample-by-label matrix.

  Every input row is turned into an item label made of the nucleotide
  followed by its position (for example "A" and 84 become "A84"). The rows
  are then pivoted so that each sample occupies a single row and each label
  a single column, holding 1 when the sample has that label and 0 otherwise.

  Args:
    df_in: DataFrame with "Sample", "Nucleotide" and "Position" columns.
      It is not modified.

  Returns:
    An integer 0/1 DataFrame indexed by "Sample" with one column per label.
    If df_in is None or has no rows, df_in itself is returned unchanged.
  """
  if df_in is None or df_in.shape[0] == 0:
    return df_in

  # Create the 'Label' values: string concatenation of Nucleotide and Position.
  labels = df_in["Nucleotide"].astype(str) + df_in["Position"].astype(str)

  # Build a new frame holding only Sample, Label and a constant Value of 1.
  # Using assign() on the column subset (instead of assigning into a slice)
  # avoids pandas' SettingWithCopyWarning and leaves df_in untouched without
  # copying the columns we do not need.
  items = df_in[["Sample"]].assign(Label=labels, Value=1)

  # One row per sample, one column per label; pairs that never occur are NaN.
  pivoted = pd.pivot_table(items, index="Sample", columns="Label", values="Value")

  # Set all NaN (not a number) cells to 0 and convert all values to integer.
  return pivoted.fillna(0).astype(int)

In [27]:
def get_association_rules(in_file, min_support=0.20,
                          min_confidence=0.80, min_lift=1.0,
                          min_conviction=1.0, max_len=None):
  """Mine association rules between nucleotide/position labels of a TSV file.

  Args:
    in_file: Path or URL of a tab-separated file that has at least the
      'Sample', 'Nucleotide' and 'Position' columns.
    min_support: Minimum support passed to the Apriori algorithm.
    min_confidence: Rules with a confidence below this value are dropped.
    min_lift: Minimum lift threshold passed to association_rules.
    min_conviction: Rules with a conviction below this value are dropped.
    max_len: Maximum item set length passed to Apriori (None means no limit).

  Returns:
    DataFrame of the association rules that satisfy all the thresholds.
  """
  # Read the input file and pick the needed columns
  df_in = pd.read_csv(in_file, sep='\t')[['Sample', 'Nucleotide', 'Position']]

  # Preprocess the data frame into a one-hot sample x label matrix
  onehot = preprocess_input_file(df_in)

  # The matrix only holds 0/1 values. mlxtend's apriori is designed for boolean
  # input and warns (deprecation, slower computation) on other dtypes, so the
  # matrix is cast to bool before mining. The resulting item sets are the same.
  onehot = onehot.astype(bool)

  # Get frequent item sets, with support of at least min_support, using Apriori algorithm
  frequent_itemsets = apriori(onehot, min_support=min_support, use_colnames=True, max_len=max_len)

  # Get association rules, with lift of at least min_lift
  rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)

  # Keep only the rules whose confidence is at least min_confidence
  # AND whose conviction is at least min_conviction
  keep = (rules['confidence'] >= min_confidence) & (rules['conviction'] >= min_conviction)

  return rules[keep]

def get_association_rules_param(param_dict):
  """Call get_association_rules with arguments looked up in a dictionary.

  Args:
    param_dict: Dictionary that may hold the keys 'in_file', 'min_support',
      'min_confidence', 'min_lift', 'min_conviction' and 'max_len'. A key
      that is absent falls back to the default listed below.

  Returns:
    Whatever get_association_rules returns for the resolved arguments.
  """
  # (key, fallback) pairs, listed in the positional order of get_association_rules
  fallbacks = (
      ('in_file', None),
      ('min_support', 0.20),
      ('min_confidence', 0.80),
      ('min_lift', 1.0),
      ('min_conviction', 1.0),
      ('max_len', None),
  )
  resolved = [param_dict.get(key, fallback) for key, fallback in fallbacks]

  return get_association_rules(*resolved)

In [28]:
# Display options. The fully qualified 'display.max_columns' key is used on
# purpose: the abbreviated 'max_columns' pattern is ambiguous on pandas
# versions that also define 'styler.render.max_columns' and raises an
# OptionError there.
pd.set_option('display.max_columns', 10, 'display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

# Thresholds used to mine the HIV V3 alignment; the input is read from GitHub.
param_dict = {
    "in_file": "https://raw.githubusercontent.com/kxk302/HIV/main/data/HIV_V3_codonmsa_macse_pre.tsv",
    "min_support": 0.050,
    "min_confidence": 0.800,
    "min_lift": 2.0,
    "min_conviction": 2.0,
    "max_len": 3
}

hiv_rules = get_association_rules_param(param_dict)
num_rules = hiv_rules.shape[0]
print('Number of rules: {}'.format(num_rules))
print('HIV dataset association rules: ')
# Printing the frame directly is equivalent to head(num_rules), which selects every row
print(hiv_rules)


Number of rules: 586
HIV dataset association rules: 
     antecedents consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction
10         (A84)       (G52)            0.185841            0.328171  0.151917    0.817460  2.490958  0.090930    3.680454
14         (C13)       (T12)            0.085546            0.169617  0.076696    0.896552  5.285757  0.062186    8.027040
20         (C46)       (T45)            0.487463            0.485988  0.485251    0.995461  2.048324  0.248349  113.253933
21         (T45)       (C46)            0.485988            0.487463  0.485251    0.998483  2.048324  0.248349  337.761799
23         (C49)       (T12)            0.067109            0.169617  0.058260    0.868132  5.118204  0.046877    6.297075
...          ...         ...                 ...                 ...       ...         ...       ...       ...         ...
2764  (G13, T45)       (G12)            0.073746            0.121681  0.058997    0.80