In [1]:
# Folder with the input *.bin data files
binfilesdir = './pyRenormalize_Lenta/'

# The merging principle is selected by a number 1-3 (0 = ask interactively).
# Short description:
# 1: Merging of random columns
# 2: Merging of columns with minimum Kullback-Leibler divergence
# 3: Merging of columns with minimum local Renyi entropy
col_rem_alg = 2

# Minimal number of topics (greater than or equal to 2; 0 = ask interactively)
min_topics = 2

In [2]:
import re
import struct
import multiprocessing
import sys
import os
import csv
import random
import math
import numpy as np
import pandas as pd
import itertools as it
import tqdm

In [3]:
class CFGStreamData(object):
    """Reader for a binary table file made of a header, column descriptors and data.

    Layout, as decoded by this class:

    * header ``'=iii'``: column count, row count, record size;
    * if record size <= 0: one ``'=BiI65p65p'`` descriptor per column
      (type code, size in bytes, flags, name, caption);
    * cell data.

    The record size selects the storage scheme used by :meth:`GetOffset`:

    * ``> 0``  fixed-size cells stored column after column;
    * ``== 0`` variable-size columns stored column after column;
    * ``< 0``  cells stored row after row.

    The column readers return an empty list on any failure (bad index, type
    mismatch, short read, closed file) instead of raising.
    """

    _HEADER_STRUCT = '=iii'
    _COLPROPS_STRUCT = '=BiI65p65p'

    def __init__(self):
        # Column type codes stored in the column descriptors.
        self._fgcText = 1
        self._fgcInt = 2
        self._fgcFloat = 3
        self._fgcDouble = 4
        self._fname = ''
        self._datafile = None
        self._lockdatafile = multiprocessing.Lock()
        # column index -> [type, size, flags, name, caption, column-wise offset]
        self._columnlist = {}
        self._colcount = 0
        self._rowcount = 0
        self._recsize = 0
        # Sizes are known up front, so GetOffset() never meets a missing attribute.
        self._fgdataheader_size = struct.calcsize(self._HEADER_STRUCT)
        self._FGStreamDataColProps_size = struct.calcsize(self._COLPROPS_STRUCT)

    def __del__(self):
        self.CloseDataFile()

    def ColCount(self):
        """Return the number of columns declared in the file header (0 if closed)."""
        return self._colcount

    def RowCount(self):
        """Return the number of rows declared in the file header (0 if closed)."""
        return self._rowcount

    def OpenDataFile(self, filename, readonly=True):
        """Open ``filename`` and parse its header and column descriptors.

        Any previously opened file is closed first. With ``readonly=True`` the
        file is opened for reading only; otherwise it is opened for reading and
        writing without truncating it.

        Returns True on success, False on any error (the object is then closed).
        """
        try:
            self.CloseDataFile()
            # "rb" keeps read-only media usable; "rb+" (not "wb") keeps the
            # existing content so the header can still be read.
            self._datafile = open(filename, "rb" if readonly else "rb+")
            self._fname = filename
            # Read file header
            self._fgdataheader = self._datafile.read(self._fgdataheader_size)
            self._colcount, self._rowcount, self._recsize = struct.unpack(
                self._HEADER_STRUCT, self._fgdataheader)
            if self._recsize <= 0:
                colsizes = 0
                for i in range(self._colcount):
                    colprops = self._datafile.read(self._FGStreamDataColProps_size)
                    coltype, colsize, colflags, colname, colcapt = struct.unpack(
                        self._COLPROPS_STRUCT, colprops)
                    # Pre-compute where each column starts in column-wise
                    # storage in order to speed up GetOffset().
                    coloffs = (self._fgdataheader_size
                               + self._FGStreamDataColProps_size * self._colcount
                               + colsizes * self._rowcount)
                    colsizes += colsize
                    self._columnlist[i] = [coltype, colsize, colflags, colname, colcapt, coloffs]
            return True
        except Exception:
            self.CloseDataFile()
            return False

    def CloseDataFile(self):
        """Close the current file and reset all cached metadata.

        Returns True if a file was open and has been closed, False otherwise.
        """
        # getattr() keeps __del__ harmless on a partially constructed object.
        datafile = getattr(self, '_datafile', None)
        if datafile is None:
            return False
        r = False
        with self._lockdatafile:
            try:
                datafile.close()
                r = True
            finally:
                self._fname = ''
                self._datafile = None
                self._columnlist = {}
                self._colcount = 0
                self._rowcount = 0
                self._recsize = 0
        return r

    def _row_size(self):
        """Return the total size in bytes of one row (sum of all column sizes)."""
        return sum(colprops[1] for colprops in self._columnlist.values())

    def GetOffset(self, ACol, ARow):
        """Return the byte offset of cell (ACol, ARow), or -1 if out of range."""
        if not (0 <= ACol < self._colcount and 0 <= ARow < self._rowcount):
            return -1
        if self._recsize > 0:
            # Fixed-size cells, column after column, no column descriptors.
            return self._fgdataheader_size + self._recsize * (ACol * self._rowcount + ARow)
        if self._recsize == 0:
            # Data stored by columns: use the offset cached by OpenDataFile().
            return self._columnlist[ACol][5] + ARow * self._columnlist[ACol][1]
        # Data stored by rows: skip ARow full rows, then the preceding columns.
        data_start = self._fgdataheader_size + self._FGStreamDataColProps_size * self._colcount
        preceding = sum(self._columnlist[i][1] for i in range(ACol))
        return data_start + self._row_size() * ARow + preceding

    def _read_numeric_column(self, ACol, coltype, fmt):
        """Read column ACol of 4-byte values of struct format ``fmt``; [] on failure."""
        with self._lockdatafile:
            try:
                offs = self.GetOffset(ACol, 0)
                if offs < 0:
                    return []
                # The stored column type must match the requested one.
                if self._recsize > 0 or self._columnlist[ACol][0] != coltype:
                    return []
                if self._recsize == 0:
                    # Column-wise storage: the whole column is contiguous.
                    self._datafile.seek(offs, 0)
                    raw = self._datafile.read(self._rowcount * 4)
                    return list(struct.unpack('=' + str(self._rowcount) + fmt, raw))
                # Row-wise storage: hop from row to row.
                step = self._row_size()
                values = []
                for _ in range(self._rowcount):
                    self._datafile.seek(offs, 0)
                    offs += step
                    value, = struct.unpack('=' + fmt, self._datafile.read(4))
                    values.append(value)
                return values
            except Exception:
                return []

    def GetColumnAsInt(self, ACol):
        """Return column ACol as a list of ints, or [] if it cannot be read as int."""
        return self._read_numeric_column(ACol, self._fgcInt, 'i')

    def GetColumnAsFloat(self, ACol):
        """Return column ACol as a list of floats, or [] if it cannot be read as float."""
        return self._read_numeric_column(ACol, self._fgcFloat, 'f')

    def GetColumnAsText(self, ACol):
        """Return column ACol as a list of str, or [] if it cannot be read as text.

        Cells are decoded as cp1251 and every NUL character is removed.
        """
        with self._lockdatafile:
            try:
                offs = self.GetOffset(ACol, 0)
                if offs < 0:
                    return []
                # The stored column type must match the requested one.
                if self._recsize > 0 or self._columnlist[ACol][0] != self._fgcText:
                    return []
                width = self._columnlist[ACol][1]
                # Distance between consecutive cells of this column.
                step = width if self._recsize == 0 else self._row_size()
                values = []
                for _ in range(self._rowcount):
                    self._datafile.seek(offs, 0)
                    offs += step
                    # unpack() also rejects short reads at the end of the file.
                    w, = struct.unpack('=' + str(width) + 's', self._datafile.read(width))
                    values.append(w.decode('cp1251').replace('\x00', ''))
                return values
            except Exception:
                return []

    def GetDataAsFloat(self, ACol, ARow):
        """Return the 4-byte float stored at cell (ACol, ARow), or 0 on failure.

        Unlike the column readers, this method neither takes the file lock nor
        checks the declared column type.
        """
        try:
            offs = self.GetOffset(ACol, ARow)
            if offs < 0:
                return 0
            self._datafile.seek(offs, 0)
            r, = struct.unpack('=f', self._datafile.read(4))
            return r
        except Exception:
            return 0

In [4]:
# Calculation of matrix Phi
def CalcPhiMatrix(nw, nwsum, beta):
    """Return the smoothed topic-word matrix Phi.

    Parameters
    ----------
    nw : ndarray of shape (V, K)
        Counts per word (rows) and topic (columns).
    nwsum : ndarray of shape (K,)
        Per-topic totals, broadcast against the columns of ``nw``.
    beta : float
        Smoothing constant added to every count.

    Returns
    -------
    ndarray of shape (K, V)
        ``((nw + beta) / (nwsum + V * beta))`` transposed.
    """
    # Unpacking also checks that nw is two-dimensional.
    V, _ = nw.shape
    return ((nw + beta) / (nwsum + V * beta)).T

# 1. Returns two random indexes of columns for merging
def GetIndexesRandom(topics):
    """Return two distinct random column indexes in ``range(topics)``.

    Raises
    ------
    ValueError
        If ``topics`` is smaller than 2: no two distinct indexes exist, and
        the rejection loop below would otherwise never terminate.
    """
    if topics < 2:
        raise ValueError('at least two topics are required, got %r' % (topics,))
    k1 = random.randint(0, topics - 1)
    k2 = k1
    # Redraw until the second index differs from the first one.
    while k2 == k1:
        k2 = random.randint(0, topics - 1)
    return k1, k2

# 2. Searching for the minimum Kullback-Leibler divergence between topic pairs
#    and determining their indexes
def GetIndexesKLBMin(phi):
    """Return the indexes (k1, k2), k1 < k2, of the two closest topics.

    Closeness is the symmetric Kullback-Leibler divergence
    ``sum(p * log(p / q) + q * log(q / p))`` between rows ``p`` and ``q`` of
    ``phi`` (shape ``(K, V)``). Pairs whose divergence is NaN are ignored
    unless every pair is NaN, in which case the first pair is returned.

    Raises
    ------
    ValueError
        If ``phi`` has fewer than two rows.
    """
    K, V = phi.shape
    if K < 2:
        raise ValueError('at least two topics are required, got %d' % K)

    # All pairs (i, j) with i < j, in row-major order.
    c1, c2 = np.triu_indices(K, k=1)
    p = phi[c1]
    q = phi[c2]
    Klb = (p * np.log(p / q) + q * np.log(q / p)).sum(axis=1)

    # Only the minimum is needed, so a full sort is unnecessary.
    if np.isnan(Klb).all():
        best = 0
    else:
        best = int(np.nanargmin(Klb))
    return int(c1[best]), int(c2[best])

# 3. Searching for the two minima of local Renyi entropy and determining the
#    indexes of corresponding topics
def GetIndexesRenyiMin(phi):
    """Return the indexes of the two topics with the smallest local Renyi entropy.

    For every row (topic) of ``phi`` (shape ``(K, V)``) only the entries above
    the threshold ``1 / V`` are considered:

    * ``sumprob``    = sum of those entries / K
    * ``word_ratio`` = number of those entries / (V * K)
    * ``energy``     = -log(sumprob)
    * ``entropy_gs`` = log(word_ratio)
    * ``free_energy`` = energy - K * entropy_gs
    * ``entropy_Renyi`` = free_energy / (K - 1)

    Returns
    -------
    (int, int)
        Index of the topic with the smallest entropy, then the second smallest.

    Raises
    ------
    ValueError
        If ``phi`` has fewer than two rows.
    """
    K, V = phi.shape
    if K < 2:
        raise ValueError('at least two topics are required, got %d' % K)
    thrsld = 1 / V

    above = phi > thrsld
    # Boolean indexing would flatten phi, so mask with where() to keep the
    # per-topic axis.
    sumprob = np.where(above, phi, 0.0).sum(axis=1) / K
    word_ratio = above.sum(axis=1) / (V * K)

    # A topic with no entry above the threshold gets an infinite entropy and
    # is therefore sorted last; silence the log(0) warnings this produces.
    with np.errstate(divide='ignore'):
        energy = -np.log(sumprob)
        # Gibbs-Shannon entropy:
        entropy_gs = np.log(word_ratio)
    # free energy:
    free_energy = energy - K * entropy_gs
    # Renyi entropy:
    entropy_Renyi = free_energy / (K - 1)

    # NOTE(review): ascending order is used because the function name and the
    # comments ask for the minima; the previous code sorted descending.
    order = np.argsort(entropy_Renyi, kind='stable')
    return int(order[0]), int(order[1])

# Recalculation of all quantities based on Phi matrix
def CalcAllParameters(phi):
    """Compute the global thermodynamic-style quantities of a Phi matrix.

    Only the entries of ``phi`` (shape ``(K, V)``) above ``1 / V`` are used.

    Returns
    -------
    tuple
        ``(word_ratio, sumprob, energy, entropy_gs, free_energy, entropy_Renyi)``
        where ``sumprob`` is the sum of the selected entries divided by K,
        ``word_ratio`` their count divided by ``V * K``,
        ``energy = -log(sumprob)``, ``entropy_gs = log(word_ratio)``,
        ``free_energy = energy - K * entropy_gs`` and
        ``entropy_Renyi = free_energy / (K - 1)``.
    """
    n_topics, n_words = phi.shape
    significant = phi > 1.0 / n_words

    # Normalised mass and share of the entries above the threshold.
    sumprob = phi[significant].sum() / n_topics
    word_ratio = significant.sum() / (n_words * n_topics)

    energy = -math.log(sumprob)
    # Gibbs-Shannon entropy
    entropy_gs = math.log(word_ratio)
    # Free energy, then Renyi entropy
    free_energy = energy - n_topics * entropy_gs
    entropy_Renyi = free_energy / (n_topics - 1)

    return word_ratio, sumprob, energy, entropy_gs, free_energy, entropy_Renyi

In [None]:
print('Renormalize Gibbs binary output data (NW and NWSUM)')
random.seed()

# --- Input parameter analysis -------------------------------------------------
# A value of 0 in the configuration cell means "ask the user".
if col_rem_alg == 0:
    col_rem_alg = int(input('1. Enter columns remove method (1, 2 or 3): '))
else:
    print('Columns remove method:', col_rem_alg)
if col_rem_alg not in (1, 2, 3):
    # Any other value would silently merge column 0 with itself below.
    raise ValueError('Columns remove method must be 1, 2 or 3, got %r' % (col_rem_alg,))
if min_topics == 0:
    min_topics = max(int(input('2. Enter minimum topics count (>=2): ')), 2)
else:
    min_topics = max(min_topics, 2)
    print('Minimum topics count:', min_topics)

# --- Search the input folder and group the files in NW / NWSUM pairs ----------
# Each entry is [nw path, nwsum path, topics, alpha, beta]; the last three are
# the strings cut out of the file name and identify the pair.
nw_nwsum_files = []
allfiles = sorted(os.listdir(binfilesdir))
binfiles = [name for name in allfiles if name.endswith('.bin')]
for fn in binfiles:
    ia = fn.rfind('_a')
    ib = fn.rfind('_b')
    if ia < 0 or ib < 0:
        continue
    alpha_s = fn[ia + 2:ia + 2 + 5]
    beta_s = fn[ib + 2:ib + 2 + 5]

    # slot 0 holds the NW file, slot 1 the NWSUM file of a pair
    for marker, slot in (('_nw_', 0), ('_nwsum_', 1)):
        pos = fn.rfind(marker)
        if pos < 0:
            continue
        start = pos + len(marker)
        # up to three characters of topic count, cut at a following '_'
        topics_s = fn[start:start + 3]
        k = topics_s.rfind('_')
        if k > 0:
            topics_s = topics_s[0:k]
        full_path = os.path.join(binfilesdir, fn)
        for entry in nw_nwsum_files:
            if entry[2:5] == [topics_s, alpha_s, beta_s]:
                entry[slot] = full_path
                break
        else:
            entry = ['', '', topics_s, alpha_s, beta_s]
            entry[slot] = full_path
            nw_nwsum_files.append(entry)
# files are found and grouped in pairs
print('Found File Pairs: ', len(nw_nwsum_files))

# --- Load data from files and run renormalization -----------------------------
delComa = re.compile(",")
delPoint = re.compile(r"\.")


def _write_metrics_row(writer, n_topics, alpha, beta, metrics):
    """Write one CSV row: topic count, then all numbers with a decimal comma."""
    values = (alpha, beta) + tuple(metrics)
    writer.writerow([n_topics] + [delPoint.sub(',', str(v)) for v in values])


for fpair in nw_nwsum_files:
    fn_nw = fpair[0]
    fn_nwsum = fpair[1]
    # alpha and beta are written with a decimal comma in the file names
    fn_alpha = float(delComa.sub('.', fpair[3]))
    fn_beta = float(delComa.sub('.', fpair[4]))
    print('Input Files: ')
    print('    ', fn_nw)
    print('    ', fn_nwsum)

    binfile_nw = CFGStreamData()
    binfile_nwsum = CFGStreamData()
    try:
        if not (binfile_nw.OpenDataFile(fn_nw) and binfile_nwsum.OpenDataFile(fn_nwsum)):
            # Also covers incomplete pairs, whose missing path is ''.
            print('    Warning! Cannot open files ', fn_nw, 'and', fn_nwsum, '. Missing.')
            continue
        topics = binfile_nw.ColCount()
        words = binfile_nw.RowCount()
        # column (topic) match check
        if topics != binfile_nwsum.ColCount():
            print('    Warning! Files ', fn_nw, 'and', fn_nwsum,
                  'have different columns (i.e. topics) count. Missing.')
            continue
        print('     Start topics count: ', topics)
        print('     Words count: ', words)

        # create arrays and read data from files
        # (plain int replaces the np.int alias removed from NumPy)
        nw = np.zeros((words, topics), dtype=int)
        nwsum = np.zeros(topics, dtype=int)
        for k in range(topics):
            nw[:, k] = binfile_nw.GetColumnAsInt(k)
            nwsum[k] = binfile_nwsum.GetColumnAsInt(k)[0]
    finally:
        # Release the file handles on every path, including the skips above.
        binfile_nw.CloseDataFile()
        binfile_nwsum.CloseDataFile()

    # data is loaded into arrays; build the output file name
    nwi = fn_nw.rfind('_nw_')
    ai = fn_nw.rfind('_a')
    outcsvfn = fn_nw[0:nwi] + \
        fn_nw[ai:len(fn_nw) - 4] + '_rm' + str(col_rem_alg) + '.csv'

    # The context manager flushes and closes the CSV on every exit path.
    with open(outcsvfn, 'w', newline='') as ugof:
        writer = csv.writer(ugof, delimiter=';')
        writer.writerow(['Topics',
                         'Alpha',
                         'Beta',
                         'Word Ratio',
                         'Sumprob',
                         'Energy',
                         'Shannon Entropy',
                         'Free Energy',
                         'Renyi Entropy'])

        # calculate quantities for the initial number of topics
        # (for the original topic solution)
        phi = CalcPhiMatrix(nw, nwsum, fn_beta)
        metrics = CalcAllParameters(phi)
        word_ratio, sumprob, energy, entropy_gs, free_energy, entropy_Renyi = metrics
        curr_topics = topics
        _write_metrics_row(writer, curr_topics, fn_alpha, fn_beta, metrics)

        # nothing to merge if the solution is already at the minimum size
        if min_topics >= topics:
            continue

        # remove one column per iteration and recalculate all quantities
        # NOTE: tqdm.tnrange is deprecated in favour of tqdm.notebook.trange.
        for k in tqdm.tnrange(topics - min_topics):
            # phi always matches the current nw / nwsum at this point, so the
            # selection algorithms can use it without recomputing it.
            if col_rem_alg == 1:
                k1, k2 = GetIndexesRandom(nw.shape[1])
            elif col_rem_alg == 2:
                k1, k2 = GetIndexesKLBMin(phi)
            else:
                k1, k2 = GetIndexesRenyiMin(phi)

            # merge column k2 into column k1 (nw) and element k2 into k1 (nwsum)
            nw[:, k1] += nw[:, k2]
            nwsum[k1] = nwsum[k1] + nwsum[k2]
            # remove column k2 from nw and element k2 from nwsum
            nw = np.delete(nw, k2, 1)
            nwsum = np.delete(nwsum, k2)

            # recalculation of matrix Phi and of all quantities
            phi = CalcPhiMatrix(nw, nwsum, fn_beta)
            metrics = CalcAllParameters(phi)
            word_ratio, sumprob, energy, entropy_gs, free_energy, entropy_Renyi = metrics

            # write data in the output csv file
            curr_topics = curr_topics - 1
            _write_metrics_row(writer, curr_topics, fn_alpha, fn_beta, metrics)

Renormalize Gibbs binary output data (NW and NWSUM)
Columns remove method: 2
Minimum topics count: 2
Found File Pairs:  1
Input Files: 
     ./pyRenormalize//20topicsnews_full_nw_100_a0,100_b0,100.bin
     ./pyRenormalize//20topicsnews_full_nwsum_100_a0,100_b0,100.bin
     Start topics count:  100
     Words count:  50948


HBox(children=(IntProgress(value=0, max=98), HTML(value='')))