<a href="https://colab.research.google.com/github/kxk302/HIV/blob/main/HIV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# Open the Colab upload dialog; the result maps each file name to its content
uploaded = files.upload()

# Report the name and size of every file that was received
for fn, payload in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload))
  print(message)

Saving HIV_V3_codonmsa_macse_pre.tsv to HIV_V3_codonmsa_macse_pre.tsv
User uploaded file "HIV_V3_codonmsa_macse_pre.tsv" with length 741357 bytes


In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# List the contents of the 'Colab Notebooks' folder under the /content/gdrive mount point
!ls '/content/gdrive/MyDrive/Colab Notebooks'

HIV  HIV_V3_codonmsa_macse_pre.tsv  MBA.ipynb


In [1]:
import math

import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Create a string representing nucleotide plus position
# Pivot the data so we have all sample strings on a single line
#
def preprocess_input_file(df_in, codon=True):
  """Turn a long-format table into a 0/1 sample-by-label matrix.

  Every input row pairs a sample with the symbol observed at one position.
  The symbol and the position are concatenated into a label (for example
  ``A12``), and the table is pivoted so that each sample becomes one row and
  each distinct label becomes one column.

  Args:
    df_in: DataFrame with a ``Sample`` column, a ``Position`` column, and a
      ``Nucleotide`` column (when ``codon`` is True) or an ``AminoAcid``
      column (when ``codon`` is False). It is not modified.
    codon: Selects which symbol column is used to build the labels.

  Returns:
    ``df_in`` itself when it is None or has no rows; otherwise a new integer
    DataFrame indexed by ``Sample`` with one column per label, holding 1 where
    the sample carries that label and 0 elsewhere.
  """
  if df_in is None or df_in.shape[0] == 0:
    return df_in

  # The two modes differ only in which column supplies the symbol
  symbol_column = "Nucleotide" if codon else "AminoAcid"

  # Label is the string concatenation of the symbol and the position
  labels = df_in[symbol_column].astype(str) + df_in["Position"].astype(str)

  # assign() returns a fresh frame, so no column is ever set on a slice of
  # another frame (which is what raises pandas' SettingWithCopyWarning).
  # 'Value' is a constant 1 that marks the presence of a label in a sample.
  df = df_in[["Sample"]].assign(Label=labels, Value=1)

  df = pd.pivot_table(df, index="Sample", columns="Label", values="Value")

  # Sample/label pairs that never occur come out of the pivot as NaN;
  # turn them into 0 and convert the whole matrix to integers
  return df.fillna(0).astype(int)

In [2]:
def get_association_rules(in_file, min_support=0.20, 
                          min_confidence=0.80, min_lift=1.0, 
                          min_conviction=1.0, max_len=None, codon=True):
  """Mine association rules between position-specific symbols.

  Args:
    in_file: Path or URL of a tab-separated file with ``Sample`` and
      ``Position`` columns plus ``Nucleotide`` (``codon`` True) or
      ``AminoAcid`` (``codon`` False).
    min_support: Minimum support passed to the Apriori algorithm.
    min_confidence: Rules with a lower confidence are dropped.
    min_lift: Minimum lift used as the threshold when generating rules.
    min_conviction: Rules with a lower conviction are dropped.
    max_len: Maximum itemset length passed to Apriori (None for no limit).
    codon: Selects which symbol column is read and used for the labels.

  Returns:
    DataFrame of association rules as produced by mlxtend's
    ``association_rules``, restricted to rows that meet the confidence and
    conviction thresholds.
  """
  # Read the input file and pick the needed columns; the two modes differ
  # only in the name of the symbol column
  symbol_column = 'Nucleotide' if codon else 'AminoAcid'
  df_in = pd.read_csv(in_file, sep='\t')[['Sample', symbol_column, 'Position']]

  # Preprocess the data frame into a 0/1 sample-by-label matrix
  df = preprocess_input_file(df_in, codon)

  # Get frequent item sets using the Apriori algorithm. The matrix only holds
  # 0/1, so casting to bool keeps the result unchanged while giving mlxtend
  # the boolean input it recommends (non-bool frames are deprecated there).
  frequent_itemsets = apriori(df.astype(bool), min_support=min_support,
                              use_colnames=True, max_len=max_len)

  # Get association rules, using lift with min_lift as the threshold
  rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)

  # Keep only rules that meet BOTH the confidence and the conviction thresholds
  rules = rules[ (rules['confidence'] >= min_confidence) & (rules['conviction'] >= min_conviction) ]

  return rules

def get_association_rules_param(param_dict):
  """Call get_association_rules with arguments taken from a dictionary.

  Keys missing from ``param_dict`` fall back to the defaults listed below.
  The values are forwarded positionally, in the order shown.
  """
  # (key, fallback) pairs, ordered like get_association_rules' parameters
  key_defaults = (
      ('in_file', None),
      ('min_support', 0.20),
      ('min_confidence', 0.80),
      ('min_lift', 1.0),
      ('min_conviction', 1.0),
      ('max_len', None),
      ('codon', True),
  )

  arguments = [param_dict.get(key, fallback) for key, fallback in key_defaults]

  return get_association_rules(*arguments)

# Add a new column that has the distance between the head/tail positions  
# Only makes sense if max_len is 2, that is we have rules in the form of A -> B
def add_distance_column(df_in):
  """Return a copy of the rules with a 'distance' column added.

  The distance is the absolute difference between the position encoded in
  the antecedent label and the position encoded in the consequent label.
  Labels are expected to be one symbol character followed by the position
  (for example ``A12``).

  Args:
    df_in: Rules DataFrame with 'antecedents' and 'consequents' columns whose
      values are single-item collections (e.g. frozensets) of labels. It is
      not modified.

  Returns:
    A copy of ``df_in`` with an integer 'distance' column.

  Raises:
    ValueError: If an antecedent or consequent does not hold exactly one
      label, or the text after the first character is not an integer.
  """
  def _position(itemset):
    # Read the label straight from the collection instead of slicing the
    # text of its repr at fixed offsets, which silently depends on the
    # exact "frozenset({'...'})" formatting.
    if len(itemset) != 1:
      raise ValueError(
          'distance is only defined for single-item rules (max_len=2), '
          'got {}'.format(set(itemset)))
    (label,) = itemset
    # Drop the leading symbol character; the remainder is the position
    return int(str(label)[1:])

  df = df_in.copy()

  head = df['antecedents'].map(_position).astype(int)
  tail = df['consequents'].map(_position).astype(int)

  # Calculate absolute value of distance between head and tail positions
  df['distance'] = (head - tail).abs()
  return df

def filter_rules_based_on_distance(df_in, min_distance):
  """Keep only the rules whose head/tail distance is at least min_distance.

  Works on a copy of ``df_in``; the returned frame carries the 'distance'
  column added by add_distance_column.
  """
  with_distance = add_distance_column(df_in.copy())

  # Boolean mask of the rules that are far enough apart
  far_enough = with_distance['distance'] >= min_distance

  return with_distance[far_enough]

In [None]:
# Nucleotide file

# Use the fully qualified option name: a bare 'max_columns' pattern is
# ambiguous in recent pandas (it also matches 'styler.render.max_columns')
# and makes set_option raise OptionError.
pd.set_option('display.max_columns', 10, 'display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

# max_len is 2 so that every rule has the form A -> B, which the distance
# filter below requires
param_dict = {  
    "in_file": "https://raw.githubusercontent.com/kxk302/HIV/main/data/HIV_V3.fas_codon_macse_pre.tsv",
    "min_support": 0.010, 
    "min_confidence": 0.500, 
    "min_lift": 2.0, 
    "min_conviction": 2.0, 
    "max_len": 2
}

hiv_rules = get_association_rules_param(param_dict)

# Filter rules based on distance between head and tail positions
hiv_rules = filter_rules_based_on_distance(hiv_rules, 15)

# Show every remaining rule, with the most distant position pairs first
num_rules = hiv_rules.shape[0]
hiv_rules_sorted = hiv_rules.sort_values('distance', ascending=False)
print('Number of rules: {}'.format(num_rules))
print('HIV dataset association rules: ')
print(hiv_rules_sorted.head(num_rules))


In [24]:
# Amino acid file

# Use the fully qualified option name: a bare 'max_columns' pattern is
# ambiguous in recent pandas (it also matches 'styler.render.max_columns')
# and makes set_option raise OptionError.
pd.set_option('display.max_columns', 10, 'display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

# "codon": False makes the pipeline read the AminoAcid column
param_dict = {  
    "in_file": "https://raw.githubusercontent.com/kxk302/HIV/main/data/HIV_V3.fas_AA_macse_pre_tra.tsv",
    "min_support": 0.025, 
    "min_confidence": 0.025, 
    "min_lift": 1.5, 
    "min_conviction": 1.5, 
    "max_len": 3,
    "codon": False
}

hiv_rules = get_association_rules_param(param_dict)

# No distance filtering here: the distance between head and tail positions
# only makes sense for rules of the form A -> B (max_len of 2), and this
# cell uses max_len of 3.

# Show every rule, highest confidence first
num_rules = hiv_rules.shape[0]
hiv_rules = hiv_rules.sort_values('confidence', ascending=False)
print('Number of rules: {}'.format(num_rules))
print('HIV dataset association rules: ')
print(hiv_rules.head(num_rules))

Number of rules: 772
HIV dataset association rules: 
     antecedents consequents  antecedent support  consequent support   support  confidence      lift  leverage  conviction
1405  (T20, R12)       (Q19)            0.142330            0.362094  0.138643    0.974093  2.690164  0.087106   24.623156
1397   (S4, R12)       (Q19)            0.025811            0.362094  0.025074    0.971429  2.682805  0.015728   22.326696
1025   (N4, N12)       (R19)            0.039086            0.498525  0.037611    0.962264  1.930222  0.018125   13.289086
784    (G4, V11)       (R12)            0.041298            0.283186  0.039086    0.946429  3.342076  0.027391   13.380531
1500  (V11, T20)       (Q19)            0.075959            0.362094  0.071534    0.941748  2.600834  0.044030   10.950713
...          ...         ...                 ...                 ...       ...         ...       ...       ...         ...
112        (T11)   (Q9, A20)            0.064897            0.056047  0.025074    0.38

In [None]:
# List the files in the current working directory
!ls

HIV_V3_codonmsa_macse_pre.tsv  sample_data


In [None]:
# Page through the TSV file to inspect its raw contents
!more HIV_V3_codonmsa_macse_pre.tsv

Sample	Nucleotide	Position
0	T	2
1	C	2
2	C	2
3	C	2
4	C	2
5	T	2
6	C	2
7	T	2
8	T	2
9	C	2
10	T	2
11	C	2
12	C	2
13	C	2
14	C	2
15	C	2
16	C	2
17	T	2
18	T	2
19	C	2
20	T	2
21	T	2
[K

In [None]:
# Load the tab-separated file from the working directory into a DataFrame
dd = pd.read_csv('HIV_V3_codonmsa_macse_pre.tsv',sep='\t')

In [None]:
# Pivot the loaded table with the default codon=True (labels built from the
# Nucleotide column) and display the resulting matrix
preprocess_input_file(dd)

Label,A10,A11,A12,A123,A124,...,T83,T84,T85,T86,T9
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,1,0,0,0,...,0,0,0,0,0
1,0,1,0,0,0,...,0,0,0,0,0
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,0,0,0,1,0
4,0,0,0,0,0,...,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1351,1,0,0,0,0,...,1,0,0,0,0
1352,0,1,0,0,0,...,0,0,1,0,0
1353,0,1,0,0,0,...,0,0,1,0,0
1354,0,0,0,0,0,...,0,0,1,0,0
