<a href="https://colab.research.google.com/github/kxk302/HIV/blob/main/HIV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the runtime at /content/gdrive.
# Requires the google.colab package, so this cell is meant to run in Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [15]:
# List the contents of the 'Colab Notebooks' folder under the mounted Drive.
!ls '/content/gdrive/MyDrive/Colab Notebooks'

HIV  HIV_V3_codonmsa_macse_pre.tsv  MBA.ipynb


In [65]:
import math

import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Create a string representing nucleotide plus position
# Pivot the data so we have all sample strings on a single line
#
def preprocess_input_file(df_in):
  """Turn long-format (Sample, Nucleotide, Position) rows into a 0/1 sample-by-label matrix.

  Each row of df_in is turned into a label formed by concatenating the
  string forms of its Nucleotide and Position values (e.g. 'G' and 27
  become 'G27'). The result has one row per Sample and one column per
  distinct label; a cell is 1 when that sample has that label and 0 otherwise.

  Args:
    df_in: DataFrame with 'Sample', 'Nucleotide' and 'Position' columns,
      or None.

  Returns:
    The integer 0/1 DataFrame indexed by Sample, with labels as columns.
    If df_in is None or has no rows, df_in itself is returned unchanged.
    df_in is never modified.
  """
  if df_in is None or df_in.shape[0] == 0:
    return df_in

  # Build the working frame in one step with .assign so that we never write
  # into a slice of another frame (which triggers SettingWithCopyWarning).
  # 'Value' is a constant 1 that marks "this sample has this label".
  labelled = df_in[["Sample"]].assign(
      Label=df_in["Nucleotide"].astype(str) + df_in["Position"].astype(str),
      Value=1,
  )

  # One row per sample, one column per label. Repeated (Sample, Label) pairs
  # are aggregated by pivot_table; since every Value is 1 the cell stays 1.
  df = pd.pivot_table(labelled, index="Sample", columns="Label", values="Value")

  # Labels a sample does not have come out of the pivot as NaN: set them to 0
  df = df.fillna(0)

  # Convert all data frame values to integer
  df = df.astype(int)

  return df

In [67]:
def get_association_rules(in_file, min_support=0.20, 
                          min_confidence=0.80, min_lift=1.0, 
                          min_conviction=1.0, max_len=None):
  """Mine association rules from a tab-separated file of nucleotide observations.

  Args:
    in_file: Path or URL of a tab-separated file that has 'Sample',
      'Nucleotide' and 'Position' columns.
    min_support: Support threshold handed to apriori.
    min_confidence: Rules with confidence below this value are dropped.
    min_lift: Lift threshold handed to association_rules.
    min_conviction: Rules with conviction below this value are dropped.
    max_len: Maximum itemset length handed to apriori (None for no limit).

  Returns:
    The DataFrame produced by association_rules, restricted to the rows that
    satisfy both the confidence and the conviction thresholds.
  """
  needed_columns = ['Sample', 'Nucleotide', 'Position']

  # Load the raw observations, keeping only the columns the pipeline uses
  observations = pd.read_csv(in_file, sep='\t')[needed_columns]

  # One row per sample, one 0/1 column per nucleotide+position label
  basket = preprocess_input_file(observations)

  # Frequent itemsets via the Apriori algorithm
  itemsets = apriori(basket, min_support=min_support, use_colnames=True, max_len=max_len)

  # Candidate rules, thresholded on lift
  candidates = association_rules(itemsets, metric="lift", min_threshold=min_lift)

  # Keep only rules that reach both the confidence and the conviction thresholds
  confident_enough = candidates['confidence'] >= min_confidence
  convincing_enough = candidates['conviction'] >= min_conviction

  return candidates[confident_enough & convincing_enough]

def get_association_rules_param(param_dict):
  """Call get_association_rules with arguments taken from a dictionary.

  Recognised keys are 'in_file', 'min_support', 'min_confidence', 'min_lift',
  'min_conviction' and 'max_len'. A missing key falls back to None for
  'in_file' and 'max_len', 0.20 for 'min_support', 0.80 for 'min_confidence',
  and 1.0 for 'min_lift' and 'min_conviction'.

  Args:
    param_dict: Dictionary holding any subset of the keys above.

  Returns:
    Whatever get_association_rules returns for those arguments.
  """
  return get_association_rules(
      param_dict.get('in_file', None),
      param_dict.get('min_support', 0.20),
      param_dict.get('min_confidence', 0.80),
      param_dict.get('min_lift', 1.0),
      param_dict.get('min_conviction', 1.0),
      param_dict.get('max_len', None),
  )

# Add a new column that has the distance between the head/tail positions  
# Only makes sense if max_len is 2, that is we have rules in the form of A -> B
def _itemset_positions(itemsets):
  """Return, as an integer Series, the position encoded in each single-item itemset."""

  def _only_label(itemset):
    # Distance is only defined for rules of the form A -> B
    if len(itemset) != 1:
      raise ValueError(
          "Expected itemsets with exactly one item, got: {!r}".format(itemset))
    (label,) = itemset
    return str(label)

  # astype(str) keeps the .str accessor usable even when the Series is empty
  labels = itemsets.map(_only_label).astype(str)

  # A label is nucleotide text followed by the position: take the trailing digits
  positions = labels.str.extract(r'(\d+)$', expand=False)
  if positions.isna().any():
    bad_labels = labels[positions.isna()].tolist()
    raise ValueError(
        "Could not find a trailing position in labels: {!r}".format(bad_labels))

  return positions.astype(int)


def add_distance_column(df_in):
  """Return a copy of df_in with a 'distance' column added.

  'distance' is the absolute difference between the position encoded in the
  rule's antecedent label and the one encoded in its consequent label
  (e.g. G27 -> T159 gives 132). The position is read from the itemset's
  single item rather than from the text form of the frozenset, so the
  nucleotide part of a label may be of any length.

  Args:
    df_in: DataFrame with 'antecedents' and 'consequents' columns whose
      values are single-item collections of labels ending in a position.

  Returns:
    A new DataFrame; df_in is not modified.

  Raises:
    ValueError: If an itemset does not hold exactly one item, or a label
      does not end in digits.
  """
  df = df_in.copy()

  head = _itemset_positions(df['antecedents'])
  tail = _itemset_positions(df['consequents'])

  # Calculate absolute value of distance between head and tail positions
  df['distance'] = (head - tail).abs()
  return df

def filter_rules_based_on_distance(df_in, min_distance):
  """Keep only the rules whose head and tail positions are at least min_distance apart.

  Args:
    df_in: Rules DataFrame accepted by add_distance_column.
    min_distance: Smallest distance a rule must have to be kept.

  Returns:
    A new DataFrame that carries the added 'distance' column and contains
    only the rows with distance >= min_distance. df_in is not modified.
  """
  with_distance = add_distance_column(df_in.copy())

  # Boolean mask of the rules that span at least min_distance positions
  far_enough = with_distance['distance'] >= min_distance

  return with_distance[far_enough]

In [69]:
# Display options for the printed rules table. The fully qualified
# 'display.max_columns' key is used because the short form 'max_columns'
# is ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 10, 'display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

# Mining parameters; max_len is 2 so every rule has the form A -> B,
# which is what the distance filter below requires.
param_dict = {  
    "in_file": "https://raw.githubusercontent.com/kxk302/HIV/main/data/HIV_V3_codonmsa_macse_pre.tsv",
    "min_support": 0.010, 
    "min_confidence": 0.500, 
    "min_lift": 2.0, 
    "min_conviction": 2.0, 
    "max_len": 2
}

hiv_rules = get_association_rules_param(param_dict)

# Filter rules based on distance between head and tail positions
hiv_rules = filter_rules_based_on_distance(hiv_rules, 15)

# Report the surviving rules, largest head/tail distance first
num_rules = hiv_rules.shape[0]
hiv_rules_sorted = hiv_rules.sort_values('distance', ascending=False)
print('Number of rules: {}'.format(num_rules))
print('HIV dataset association rules: ')
print(hiv_rules_sorted.head(num_rules))


Number of rules: 17
HIV dataset association rules: 
    antecedents consequents  antecedent support  consequent support   support  confidence       lift  leverage  conviction  distance
143       (G27)      (T159)            0.016224            0.248525  0.011799    0.727273   2.926356  0.007767    2.755408       132
85        (C37)      (T159)            0.089971            0.248525  0.067847    0.754098   3.034295  0.045487    3.055998       122
193       (T13)       (T85)            0.029499            0.107670  0.018437    0.625000   5.804795  0.015260    2.379548        72
29        (G76)        (A8)            0.015487            0.353245  0.012537    0.809524   2.291679  0.007066    3.395465        68
126       (G12)       (G52)            0.121681            0.328171  0.083333    0.684848   2.086864  0.043401    2.131765        40
168       (G51)       (T12)            0.030236            0.169617  0.019174    0.634146   3.738706  0.014046    2.269715        39
155       (T82)  