<a href="https://colab.research.google.com/github/hahajjjun/MLCompetition_Toy_Projects/blob/main/Project%203%3A%20Clone_hard_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting.py

In [48]:
import pandas as pd
import random

def Normalize(arr):
  """Rescale the entries of ``arr`` in place so that they add up to 1.

  The total is accumulated left to right starting from 0. If the total
  equals 0 the sequence is handed back untouched; otherwise every entry is
  overwritten with its share of the total.

  Args:
    arr: Mutable, indexable sequence of numbers.

  Returns:
    The very same ``arr`` object that was passed in.
  """
  total = 0
  for value in arr:
    total += value

  if total != 0:
    # Overwrite each slot with its fraction of the total.
    for position, value in enumerate(arr):
      arr[position] = value/total
  return arr

# Init.py

In [2]:
# Hyperparameters
B, C, M = 3, 3, 8000  # number of blocks, clones and mutations
x = 20  # Lower bound of depth
y = 30  # Upper bound of depth (exclusive: used as the stop of random.randrange)
zero_probability = 0.4  # Probability of creating [0,0]

blocks = [f"block{i}" for i in range(1, B + 1)]
clones = [f"clone{i}" for i in range(1, C + 1)]

# One (initially empty) list of observations per block.
observation = {name: [] for name in blocks}

last_block = f"block{B}"

# For every block draw one random weight per clone, then rescale the weights
# of that block so they sum to 1.
params = {}
for block in blocks:
  weights = []
  for clone in clones:
    weights.append(random.random())
  params[block] = Normalize(weights)

print("----Hyperparameters----")
print(f"Blocks : {B} \nClones : {C} \nMutations : {M} \nDepth Range : ({x},{y}) \nZero Probability : {zero_probability} \nParams : {params}")

----Hyperparameters----
Blocks : 3 
Clones : 3 
Mutations : 8000 
Depth Range : (20,30) 
Zero Probability : 0.4 
Params : {'block1': [0.3729671820350788, 0.3664418262179638, 0.2605909917469574], 'block2': [0.5181211355254365, 0.4290283306928521, 0.052850533781711506], 'block3': [0.625300674315358, 0.07928485876706097, 0.295414466917581]}


# Input.py

In [3]:
# Simulate M mutations. Every mutation belongs to one randomly chosen clone
# and gets one [depth, alt] pair per block. Mutations that come out as [0,0]
# in every block are discarded and drawn again.
answer_membership = []
mutation_cnt = 0

while mutation_cnt < M:
  membership = random.choice(clones)
  clone_idx = clones.index(membership)
  zero_cnt = 0
  mutation_input = dict()

  for block in blocks:
    # Depth & Alt Allocation
    # random.random() is uniform on [0, 1), so this branch is taken with
    # probability exactly zero_probability (not rounded to a multiple of 0.1).
    if random.random() < zero_probability:
      reads = [0, 0]
      zero_cnt += 1
    else:
      depth = random.randrange(x, y)  # x <= depth < y
      # NOTE(review): the halving presumably models heterozygous variants;
      # confirm with the author.
      alt = int(depth*params[block][clone_idx]/2)
      reads = [depth, alt]

    mutation_input[block] = reads

  # Keep the mutation only if at least one block actually observed it.
  # Compare against the real number of blocks instead of a literal 3 so the
  # check stays correct when B changes.
  if zero_cnt != len(blocks):
    for block in blocks:
      observation[block].append(mutation_input[block])
    answer_membership.append(membership)
    mutation_cnt += 1

# One column per block holding [depth, alt] pairs, plus the true clone label.
observation_df = pd.DataFrame(observation)
observation_df['answer'] = answer_membership

# Hard Clustering.py

In [16]:
class coinEM:
  """EM estimator for a mixture of "coins" (one head probability per clone).

  Every observation is a ``[depth, alt]`` pair, read as ``depth`` coin flips
  of which ``alt`` came up heads. One EM pass weights each observation by how
  well every clone explains it (E-step) and then re-estimates each clone's
  head probability from the weighted head/tail counts (M-step).
  """

  def __init__(self, obs, params, verbose = False):
    """Store the observations and the initial parameter guess.

    Args:
      obs: Re-iterable collection of ``[depth, alt]`` pairs (it is traversed
        once per EM pass).
      params: Initial head probabilities; ``params[i]`` corresponds to
        prob(Head | clone i+1).
      verbose: When true, progress information is printed while running.
    """
    self.obs = obs
    self.params = params
    self.clones = len(params)
    self.param_history = [params]
    self.verbose = verbose
    self.z_table = {}

  def Binomial(self, flips, heads, prob):
    """Return ``prob**heads * (1-prob)**(flips-heads)``.

    The binomial coefficient is left out; it is the same for every clone and
    cancels when the values are normalized in the E-step.
    """
    tails = flips-heads
    return (prob**heads)*((1-prob)**(tails))

  def Normalize(self, arr):
    """Scale ``arr`` in place so its entries sum to 1 and return it.

    A sequence whose entries sum to 0 is returned unchanged.
    """
    total = sum(arr)
    if total == 0:
      return arr
    for i, entry in enumerate(arr):
      arr[i] = entry/total
    return arr

  def EM(self, verbose):
    """Run one E-step and one M-step.

    Rebuilds ``self.z_table`` (per clone: a list of weighted alt counts and a
    list of weighted normal counts, one entry per observation), replaces
    ``self.params`` with the new estimates and appends them to
    ``self.param_history``.

    Args:
      verbose: When true, print progress, the z-table and the new parameters.
    """
    keys = ['clone'+str(clone+1) for clone in range(self.clones)]

    # E-step
    for key in keys:
      self.z_table[key] = [[], []] #[alt, depth-alt = normal]

    for mutation in self.obs:
      depth, alt = mutation #F : depth, H : alt, T : norm
      norm = depth - alt
      obs_prob = self.Normalize([self.Binomial(depth, alt, param) for param in self.params])

      for key, weight in zip(keys, obs_prob):
        self.z_table[key][0].append(weight*alt)
        self.z_table[key][1].append(weight*norm)

    if verbose:
      print("-"*10)
      print("iterating observation datas")

      # Building and printing the table is diagnostic output only, so it is
      # skipped entirely in quiet mode.
      z_table_dict = {}
      for key, value in self.z_table.items():
        z_table_dict[key+'_alt'] = value[0]
        z_table_dict[key+'_norm'] = value[1]
      print(pd.DataFrame(z_table_dict))

    # M-step
    new_params = []
    for key, previous in zip(keys, self.params):
      alt_total = sum(self.z_table[key][0])
      total = alt_total + sum(self.z_table[key][1])
      # A clone that received no weighted reads at all carries no information;
      # keep its previous estimate instead of dividing by zero.
      new_params.append(alt_total/total if total != 0 else previous)
    self.params = new_params
    self.param_history.append(self.params)

    if verbose:
      print("current params : probability")
      print(self.params)

  def run(self):
    """Repeat EM passes until the parameters move by less than 0.0005.

    The movement is the Euclidean distance between the parameter vectors
    before and after a pass.
    """
    iters = 1
    while True:
      previous = self.params
      if self.verbose:
        print(f"iteration #{iters}")
      iters += 1
      self.EM(verbose = self.verbose)

      shift = sum((old - new)**2 for old, new in zip(previous, self.params))**(1/2)
      if shift < 0.0005:
        if self.verbose:
          print("-"*10)
          print("EM process is finished")
        break

  def soft_table(self):
    """Return the z-table produced by the most recent E-step."""
    return self.z_table

In [21]:
params

{'block1': [0.3729671820350788, 0.3664418262179638, 0.2605909917469574],
 'block2': [0.5181211355254365, 0.4290283306928521, 0.052850533781711506],
 'block3': [0.625300674315358, 0.07928485876706097, 0.295414466917581]}

In [None]:
# Fit one EM model per block and collect, for every mutation, a normalized
# vector of per-clone weights.
class_predictions = []
for block in params:
  observation = list(observation_df[block])
  parameter = params[block]
  model = coinEM(observation, parameter, verbose = False)
  model.run()

  output = model.soft_table()
  # Per clone: add the alt-weighted and norm-weighted entry of every mutation.
  re_output = [
    [sum(pair) for pair in zip(*output[clone])]
    for clone in clones
  ]
  # Per mutation: gather the value of every clone and rescale them to sum to 1.
  mutation_class = [
    Normalize([per_clone[idx] for per_clone in re_output])
    for idx in range(len(re_output[0]))
  ]

  class_predictions.append(mutation_class)

In [67]:
# Turn the collected predictions into a table with one column per mutation
# ("mutation1" ... "mutation<M>"); each list in class_predictions becomes a row.
mutation_labels = [f"mutation{idx + 1}" for idx in range(M)]
class_predictions = pd.DataFrame(list(class_predictions), columns=mutation_labels)

In [88]:
# Predict a clone for every mutation: add up the per-clone weights over all
# rows of its column and pick the clone with the largest total (the first one
# on ties).
pred = []
for column in class_predictions.columns:
  # Each row of the column holds one list of per-clone weights. Unpacking the
  # whole column, instead of addressing rows 0, 1 and 2 explicitly, keeps this
  # correct for any number of rows.
  temp = [sum(x) for x in zip(*class_predictions[column])]
  pred.append("clone"+str(temp.index(max(temp))+1))
pred

['clone2',
 'clone2',
 'clone1',
 'clone1',
 'clone1',
 'clone1',
 'clone3',
 'clone3',
 'clone2',
 'clone3',
 'clone1',
 'clone2',
 'clone1',
 'clone1',
 'clone1',
 'clone1',
 'clone3',
 'clone2',
 'clone2',
 'clone3',
 'clone2',
 'clone2',
 'clone2',
 'clone3',
 'clone3',
 'clone3',
 'clone1',
 'clone1',
 'clone1',
 'clone3',
 'clone1',
 'clone1',
 'clone2',
 'clone1',
 'clone1',
 'clone3',
 'clone2',
 'clone2',
 'clone3',
 'clone1',
 'clone2',
 'clone1',
 'clone2',
 'clone2',
 'clone2',
 'clone2',
 'clone1',
 'clone1',
 'clone2',
 'clone3',
 'clone1',
 'clone2',
 'clone1',
 'clone2',
 'clone3',
 'clone2',
 'clone1',
 'clone3',
 'clone2',
 'clone2',
 'clone1',
 'clone1',
 'clone2',
 'clone1',
 'clone3',
 'clone3',
 'clone1',
 'clone3',
 'clone1',
 'clone2',
 'clone1',
 'clone1',
 'clone3',
 'clone1',
 'clone1',
 'clone2',
 'clone1',
 'clone2',
 'clone1',
 'clone3',
 'clone3',
 'clone2',
 'clone1',
 'clone2',
 'clone1',
 'clone2',
 'clone3',
 'clone1',
 'clone1',
 'clone2',
 'clone3',

In [89]:
# Put predictions next to the true labels and print the fraction of the M
# mutations whose predicted clone equals the answer.
table = pd.DataFrame(columns = ["predict", "answer"])
table["predict"] = pred
table["answer"] = observation_df['answer']
matches = table["predict"].eq(table["answer"]).sum()
print(matches/M)

0.9585
