# Batch SOM for Nucleotides
Kennosuke Wada & Yoshiko Wada (Nagahama Institute of Bio-Science and Technology. Shiga, Japan)

In [1]:
import os.path
import glob
import numpy as np
import time
import datetime
import random
import matplotlib.pyplot as plt

In [2]:
import seaborn as sns
# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set_style("whitegrid")

## Setting parameters

In [3]:
# Hyper-parameters of the SOM run; adjust them here before running the cells below.
row           = 8          # Number of rows in the SOM grid
col           = 10         # Number of columns in the SOM grid

seed          = 12345678   # Seed for the random number generators

learning_step = 3000       # Number of learning steps (each step processes every sample once)

alpha_init    = 0.01       # Initial learning rate
range_init    = 0.5        # Initial neighborhood radius as a fraction of `row` (0.5 = 50%)

## Loading the nucleotide frequency count data

In [4]:
# Directory containing the frequency count files
dir_count     = os.path.join("fasta", "count_2_10000_10000")

# Accepted extensions of frequency count files
ext_count     = [".count", ".cnt"]

# Directory for storing the SOM calculation results. It is named after the
# start time (som_YYYY_MM_DD_hh_mm), so runs started in different minutes
# write to separate directories.
dt_now = datetime.datetime.now()
time_str = dt_now.strftime("%Y_%m_%d_%H_%M")
dir_som       = "som_" + time_str
dir_output    = os.path.join("fasta", dir_som)
print(dir_output)

# makedirs(..., exist_ok=True) also creates the parent "fasta" directory when
# it is missing and does not fail when the directory already exists.
os.makedirs(dir_output, exist_ok=True)

fasta\som_2021_02_13_15_29


In [5]:
# Read the nucleotide labels: 'labels.txt' holds tab-separated names.
labels_filename = os.path.join(dir_count, "labels.txt")
if not os.path.isfile(labels_filename):
    raise FileNotFoundError(
        "Not found a file of 'labels.txt' in the directory : " + dir_count + ".")

with open(labels_filename) as f:
    # Drop the trailing line break so that it does not become part of the last label
    nuc_names = f.read().rstrip("\r\n").split("\t")
print("Number of nucleotide types :", len(nuc_names))

# Collect the frequency count files of the directory.
# glob returns matches in a file-system dependent order; sorting keeps the
# assignment of label indices (and therefore colors) reproducible between runs.
count_files = []
filenames = os.path.join(dir_count, "*")

for file in sorted(glob.glob(filenames)):
    root, ext = os.path.splitext(file)
    if ext in ext_count:
        count_files.append(file)
        print(file)

if not count_files:
    raise FileNotFoundError("Not found Count files in the directory : " + dir_count + ".")

Number of nucleotide types : 16
fasta\count_2_10000_10000\EbolaNew.cnt
fasta\count_2_10000_10000\ZikaHum.cnt


In [6]:
# Data set names: count file names without directory and extension
names = [os.path.splitext(os.path.basename(file))[0] for file in count_files]
print(names)

target = []   # For every data row, the index of its source file in `names`
rows   = []   # Normalized frequency vectors, stacked into `data` after reading

for idx, file in enumerate(count_files):
    with open(file) as f:
        for line in f:
            # Skip header/comment lines ('%', '>', '#') and blank lines
            if line[0] in ('%', '>', '#') or not line.strip():
                continue

            vec = np.array([float(s) for s in line.split('\t')])
            if vec.size != len(nuc_names):
                raise ValueError(
                    "Expected {} values per line but found {} in file : {}".format(
                        len(nuc_names), vec.size, file))

            target.append(idx)
            # Normalization
            #nvec = vec / np.linalg.norm(vec)    # Normalize vector length to 1
            # Normalize the sum of the components to 1.
            # NOTE: a line whose counts sum to zero produces NaN values here.
            rows.append(vec / np.sum(vec))

# Build the array once instead of growing it row by row (np.append copies the
# whole array on every call). The reshape keeps the (0, n_features) shape when
# no data row was read.
data = np.array(rows, dtype=float).reshape(len(rows), len(nuc_names))

n_targets = len(target)
print("n_targets     = ", n_targets)   # Number of correct labels

n_samples, n_features = data.shape
print("n_samples     = ", n_samples)   # Number of data
print("n_features    = ", n_features)  # Number of nucleotide types

['EbolaNew', 'ZikaHum']
n_targets     =  1945
n_samples     =  1945
n_features    =  16


## Starting the SOM calculation

In [7]:
# Wall-clock time at which the SOM calculation starts (used for the elapsed-time report)
start_time = time.time()

In [8]:
# Automatic palette generation: one fully saturated color per data set, with
# hues spread evenly from 0 to 240 degrees of the HSV color wheel.
import colorsys

num_colors = len(names)
hues = np.linspace(0, 240/360, num_colors)
colors = np.array([colorsys.hsv_to_rgb(hue, 1.0, 1.0) for hue in hues])

# Keys are cell labels: the two special states first, then the data set indices.
pallets = {
    -2: (0.0, 0.0, 0.0),  # Black : When different labels are assigned to the same cell
    -1: (1.0, 1.0, 1.0),  # White : When unallocated
}
pallets.update(enumerate(colors))

In [9]:
# Seed both Python's `random` module and NumPy's global random generator
random.seed(seed)
np.random.seed(seed)

## Initializing the SOM weight vectors with random numbers


In [10]:
# Draw the initial weight vector of every SOM cell from normal distributions
# whose per-feature mean and standard deviation are those of the input data.
means = data.mean(axis=0)
stds  = data.std(axis=0)
weight = np.zeros((row, col, n_features))

# One (row, col) plane is drawn per feature; the sequence of random numbers
# consumed from the generator therefore follows this feature order.
for t, (mu, sigma) in enumerate(zip(means, stds)):
    weight[:, :, t] = np.random.normal(mu, sigma, (row, col))

In [11]:
# Accumulator of weight increments for batch learning: filled by change_weight()
# and added to `weight` once per learning step
weight_delta = np.zeros((row, col, n_features))

# Accumulated neighborhood coefficients (sum of 1/dist) received by each cell
weight_count = np.zeros((row, col))

# Label of each SOM cell: data set index, -1 when unassigned,
# -2 when inputs with different labels matched the same cell
cell_label = np.zeros((row, col), dtype=int)

# RGB color of each SOM cell
cell_color = np.zeros((row, col, 3))

In [12]:
# Table of all grid positions with one pair per line: the first column takes
# values from range(col), the second column values from range(row).
x, y = np.meshgrid(range(row), range(col))
coordinates = np.column_stack((y.ravel(), x.ravel()))

In [13]:
def find_matching_cell(invec):
    """Return the grid position of the SOM cell that best matches *invec*.

    The Euclidean distance between *invec* and the weight vector of every
    cell in the global ``weight`` array is computed, and the position of the
    smallest distance is returned.

    Args:
        invec: Input vector with one component per feature.

    Returns:
        Tuple ``(row_index, col_index)`` of the best matching cell in the
        ``(row, col)`` grid.
    """
    distances = np.linalg.norm(weight - invec, axis=2)
    best_flat_index = distances.argmin()
    return np.unravel_index(best_flat_index, (row, col))

In [14]:
def change_weight(x, y, a, r, invec):
    """Accumulate the update caused by one input vector around cell (x, y).

    Every cell inside the square neighborhood of half-width *r* centered on
    ``(x, y)`` gets an increment in the global ``weight_delta`` that pulls it
    towards *invec*. The increment is scaled by the learning rate *a* and
    damped by ``1 / (dx**2 + dy**2 + 1)``; the same damping factor is added
    to the global ``weight_count``.

    The neighborhood is clipped to the grid. Plain ``x + dx`` indexing would
    let negative indices wrap around to the opposite edge of the array, so
    cells far away from the matching cell would be updated as well.

    Args:
        x: Row index of the best matching cell.
        y: Column index of the best matching cell.
        a: Learning rate.
        r: Neighborhood radius in cells (non-negative integer).
        invec: Input vector with one component per feature.
    """
    global weight_delta, weight, weight_count

    n_rows, n_cols = weight_delta.shape[:2]
    for cx in range(max(x - r, 0), min(x + r, n_rows - 1) + 1):
        for cy in range(max(y - r, 0), min(y + r, n_cols - 1) + 1):
            dist = (cx - x)**2 + (cy - y)**2 + 1
            weight_delta[cx, cy] += a * (invec - weight[cx, cy]) / dist
            weight_count[cx, cy] += 1.0 / dist

In [15]:
def create_image(cell_color, step):
    """Draw the SOM cell colors, save the figure and return the image.

    The step number, zero-padded to four digits, is used both as the plot
    title and as the file name inside ``dir_output``.

    Args:
        cell_color: Array holding one RGB triple per SOM cell; it is reshaped
            to ``(row, col, 3)`` before drawing.
        step: Learning step number shown in the title and file name.

    Returns:
        The object returned by ``plt.imshow`` for the drawn map.
    """
    step_label = f"{step:04d}"

    plt.axis("off")
    image = plt.imshow(cell_color.reshape(row, col, 3), interpolation='none')
    plt.title(step_label, fontsize=18, fontweight='bold')

    # The file name carries no extension, so the output format is left to savefig
    plt.savefig(os.path.join(dir_output, step_label))

    return image

In [16]:
# Current neighborhood fraction and learning rate; both are decayed in the learning loop
rangeF = range_init
alpha  = alpha_init

# Number of steps after which the decay factor 2 ** (-step / half_step) halves
half_step = learning_step / 2

In [17]:
# Log the current time just before the learning loop
print(datetime.datetime.now())

2021-02-13 15:29:59.351526


In [None]:
fig = plt.figure()
image_list = []   # One single-element list per saved snapshot

for step in range(learning_step + 1):
    # Every step starts with unlabeled cells and an empty increment accumulator
    cell_label.fill(-1)
    weight_delta.fill(0.0)

    radius = max(round(rangeF * row), 0)

    for i, (inputVec, label) in enumerate(zip(data, target)):
        # x is the vertical (row) index, y the horizontal (column) index
        x, y = find_matching_cell(inputVec)

        assigned = cell_label[x, y]
        if assigned == -1:
            # First input matching this cell in the current step
            cell_label[x, y] = label
        elif assigned != -2 and assigned != label:
            # Inputs with different labels matched the same cell
            cell_label[x, y] = -2

        change_weight(x, y, alpha, radius, inputVec)

    # Save a snapshot of the map every 100 steps
    if step % 100 == 0:
        cell_color.fill(0)

        for x, y in np.ndindex(row, col):
            cell_color[x, y] = pallets[cell_label[x, y]]

        image = create_image(cell_color, step)
        image_list.append([image])

        print(f"Time = {datetime.datetime.now()} : Step = {step:04d}, "
              f"Radius = {radius}, alpha = {alpha}")

    # Batch update: the accumulated increments are applied once per step.
    # Normalization by the accumulated neighborhood coefficients is disabled:
    #weight_delta /= weight_count[:, :, np.newaxis]
    weight += weight_delta

    # Exponential decay: radius fraction and learning rate halve every half_step steps
    rate = 2 ** (-step / half_step)
    rangeF = range_init * rate
    alpha  = alpha_init * rate

end_time = time.time()
print(f"\nElapsed Time: {end_time - start_time} [sec]")

plt.show()

Time = 2021-02-13 15:30:01.055240 : Step = 0000, Radius = 4, alpha = 0.01
Time = 2021-02-13 15:32:36.390878 : Step = 0100, Radius = 4, alpha = 0.009552829363824374
Time = 2021-02-13 15:35:07.917723 : Step = 0200, Radius = 4, alpha = 0.00912143891163659
Time = 2021-02-13 15:37:34.662099 : Step = 0300, Radius = 3, alpha = 0.008709529360357965
Time = 2021-02-13 15:39:17.134758 : Step = 0400, Radius = 3, alpha = 0.008316220983749064
Time = 2021-02-13 15:41:00.544371 : Step = 0500, Radius = 3, alpha = 0.007940673782596419
Time = 2021-02-13 15:42:43.457130 : Step = 0600, Radius = 3, alpha = 0.007582085690703759
Time = 2021-02-13 15:44:24.619956 : Step = 0700, Radius = 3, alpha = 0.007239690861897795
Time = 2021-02-13 15:46:07.954200 : Step = 0800, Radius = 3, alpha = 0.0069127580343900765
Time = 2021-02-13 15:47:54.472063 : Step = 0900, Radius = 3, alpha = 0.006600588969001639
Time = 2021-02-13 15:49:39.150375 : Step = 1000, Radius = 3, alpha = 0.006302516957914928
Time = 2021-02-13 15:50:42