## Read input file

Tests the file-reading logic used in `corpus.py`.

In [1]:
from gensim import corpora
from gensim.models.tfidfmodel import TfidfModel
from gensim.models import LogEntropyModel

import re
import json
import sys

sys.path.append("../")  # Add "../" to utils folder path
from utils import globals

In [2]:
# Input file to parse; its location is built from globals.DATA_PATH.
FILENAME = globals.DATA_PATH + 'output_1_2.txt'
# NOTE(review): absolute, machine-specific path -- loading will fail on other
# machines. Presumably this could be derived from globals.DATA_PATH; confirm.
DICTIONARY_PATH = "/home/dhuy237/thesis/code/bimetaReduce/data/R4_medium/dictionary.pkl"

In [3]:
# Test the line-cleaning expression on the first line of the input file.
with open(FILENAME) as f:
    content = f.readlines()

a = content[0]

# The pattern is a character class: it removes the individual characters
# n, u, l, tab, newline, '[', ']' and '"' (not the word "null" as a unit).
# A raw string is used so that "\[" and "\]" are not treated as (invalid)
# string escapes, which emit a DeprecationWarning/SyntaxWarning depending on
# the Python version. The set of removed characters is unchanged.
re.sub(r'[null\t\n\[\]"]', '', a).replace(' ', '').split(',')

['0',
 'AAACCCTCTTCCACGAACCCTCTTGAAAATCCCCCACATCCACAAAATAAATCAAATAAATTTCAACATTATCACCAAAAGGGTAAAAGGTTATTTAAAAAATAAAATAAATTTAAAAATTTAAATTAAATACCAAAAAAGCCAAATAACTTATTGTGATTCTTGAGCTTTCTTTAACTCTGCCTTCATATCTTGATAGACTTTAGTCCATTTTAATTTTCTTGGATTTCTTCCCATTCTGTAGCTTTTCTCACATTTGGATGAGCAGAAATATAATACAGTCCCATCTTTTTCTACGACCATTTTTCCTTTTCCTGGCTCAATTTCATAACCACAAAAGCTGCATGTTCTCCATTCTGGCATAGCTATCCCCCTTTAATAGTGTTTCAGTGATTTTAAAATAATTTAAGATTAAATTATTTATCTTCTTCTGTCTAATGGTCTTGCTTCTCTCTCTGTTTCTCTTAACATAATAATGTCTCCAACTTTAACTGGACCTTTAACGTTTCTAACTAAAACTCTTCCAGTATCTTTTCCACCTAAGATTTTACATCTAACTTGTATAATTCCTCCAGTAACCCCTGTTCTACCAATGACTTCAATAACTTCAGCAGCTACTGCTTCCTTATAAACAAATTCATCTTCCGATCCTCATCACCTAATATTAATGAAGGTTTAAAATTTATAAAAAAGTTAGTAGTAGTGTTTCATAATTTATATAATAATAACTATATACTATTGATTGATGGTTAAATAGCGTTCTAATAATTTACTGCTTCAAAACATTTACCTTTTCAATTAATACCTTTAACTCTTCAGCATCTCCTTCGTTG',
 '0',
 'AAAC',
 'AACC',
 'ACCC',
 'CCCT',
 'CCTC',
 'CTCT',
 'TCTT',
 'CTTC',
 'TTCC',
 'TCCA',
 'CCAC',
 'CACG',
 'ACGA',
 'CGAA',
 'GAAC',
 'AACC',
 'ACCC

In [4]:
# Parse every line of the input file into a list of tokens.
with open(FILENAME) as f:
    content = f.readlines()

# Character class (not the word "null"): removes the characters n, u, l,
# tab, newline, '[', ']' and '"'. Written as a raw string so "\[" and "\]"
# are not invalid string escapes, and compiled once instead of per line.
strip_chars = re.compile(r'[null\t\n\[\]"]')

# After cleaning, spaces are removed, the line is split on commas and the
# first three fields are dropped; the remaining fields form one document.
documents = [
    strip_chars.sub('', line).replace(' ', '').split(',')[3:]
    for line in content
]

print(documents[:2])

[['AAAC', 'AACC', 'ACCC', 'CCCT', 'CCTC', 'CTCT', 'TCTT', 'CTTC', 'TTCC', 'TCCA', 'CCAC', 'CACG', 'ACGA', 'CGAA', 'GAAC', 'AACC', 'ACCC', 'CCCT', 'CCTC', 'CTCT', 'TCTT', 'CTTG', 'TTGA', 'TGAA', 'GAAA', 'AAAA', 'AAAT', 'AATC', 'ATCC', 'TCCC', 'CCCC', 'CCCC', 'CCCA', 'CCAC', 'CACA', 'ACAT', 'CATC', 'ATCC', 'TCCA', 'CCAC', 'CACA', 'ACAA', 'CAAA', 'AAAA', 'AAAT', 'AATA', 'ATAA', 'TAAA', 'AAAT', 'AATC', 'ATCA', 'TCAA', 'CAAA', 'AAAT', 'AATA', 'ATAA', 'TAAA', 'AAAT', 'AATT', 'ATTT', 'TTTC', 'TTCA', 'TCAA', 'CAAC', 'AACA', 'ACAT', 'CATT', 'ATTA', 'TTAT', 'TATC', 'ATCA', 'TCAC', 'CACC', 'ACCA', 'CCAA', 'CAAA', 'AAAA', 'AAAG', 'AAGG', 'AGGG', 'GGGT', 'GGTA', 'GTAA', 'TAAA', 'AAAA', 'AAAG', 'AAGG', 'AGGT', 'GGTT', 'GTTA', 'TTAT', 'TATT', 'ATTT', 'TTTA', 'TTAA', 'TAAA', 'AAAA', 'AAAA', 'AAAA', 'AAAT', 'AATA', 'ATAA', 'TAAA', 'AAAA', 'AAAT', 'AATA', 'ATAA', 'TAAA', 'AAAT', 'AATT', 'ATTT', 'TTTA', 'TTAA', 'TAAA', 'AAAA', 'AAAA', 'AAAT', 'AATT', 'ATTT', 'TTTA', 'TTAA', 'TAAA', 'AAAT', 'AATT', 'ATTA'

In [5]:
def create_corpus(dictionary, documents,
                  is_tfidf=False,
                  smartirs=None,
                  is_log_entropy=False,
                  is_normalize=True):
    """Convert tokenised documents to bag-of-words and optionally re-weight.

    Args:
        dictionary: Object providing ``doc2bow(tokens, allow_update=...)``;
            it is called with ``allow_update=True`` for every document.
        documents: Iterable of token lists.
        is_tfidf: If true, fit a ``TfidfModel`` on the bag-of-words corpus
            and return the corpus transformed by it.
        smartirs: Passed through to ``TfidfModel`` as ``smartirs``.
        is_log_entropy: If true (and ``is_tfidf`` is false), fit a
            ``LogEntropyModel`` and return the corpus transformed by it.
        is_normalize: Passed to ``LogEntropyModel`` as ``normalize``.

    Returns:
        The list of bag-of-words vectors, or the result of indexing the
        fitted model with that list when a weighting scheme is selected.
    """
    bow_corpus = []
    for tokens in documents:
        bow_corpus.append(dictionary.doc2bow(tokens, allow_update=True))

    # TF-IDF takes precedence when both weighting flags are set.
    if is_tfidf:
        tfidf_model = TfidfModel(corpus=bow_corpus, smartirs=smartirs)
        return tfidf_model[bow_corpus]

    if is_log_entropy:
        entropy_model = LogEntropyModel(bow_corpus, normalize=is_normalize)
        return entropy_model[bow_corpus]

    return bow_corpus

In [6]:
# Load the gensim Dictionary stored at DICTIONARY_PATH.
dictionary = corpora.Dictionary.load(DICTIONARY_PATH)

In [7]:
# Build the corpus from the parsed documents; the TF-IDF switch and its
# smartirs setting are read from the shared globals module.
corpus = create_corpus(
    dictionary=dictionary,
    documents=documents,
    is_tfidf=globals.IS_TFIDF,
    smartirs=globals.SMARTIRS,
)

In [8]:
# Display the first two entries of the corpus.
corpus[:2]

[[(0, 21),
  (1, 4),
  (2, 5),
  (3, 18),
  (4, 4),
  (5, 4),
  (6, 1),
  (7, 10),
  (8, 2),
  (9, 2),
  (10, 3),
  (11, 1),
  (12, 18),
  (13, 2),
  (14, 4),
  (15, 14),
  (16, 3),
  (18, 1),
  (19, 6),
  (20, 5),
  (21, 3),
  (23, 5),
  (24, 2),
  (27, 1),
  (28, 3),
  (29, 3),
  (30, 3),
  (31, 1),
  (32, 1),
  (34, 2),
  (35, 3),
  (36, 1),
  (37, 1),
  (38, 5),
  (41, 1),
  (42, 4),
  (43, 2),
  (44, 3),
  (45, 19),
  (46, 4),
  (47, 4),
  (48, 5),
  (49, 3),
  (50, 4),
  (52, 3),
  (54, 2),
  (55, 6),
  (56, 5),
  (57, 3),
  (58, 8),
  (59, 2),
  (61, 4),
  (62, 3),
  (63, 1),
  (64, 1),
  (65, 3),
  (67, 5),
  (68, 6),
  (69, 1),
  (70, 5),
  (71, 5),
  (72, 2),
  (73, 3),
  (74, 5),
  (76, 1),
  (79, 2),
  (80, 4),
  (81, 1),
  (82, 1),
  (91, 7),
  (92, 3),
  (94, 3),
  (95, 4),
  (97, 4),
  (98, 3),
  (99, 14),
  (100, 2),
  (101, 1),
  (103, 2),
  (105, 2),
  (106, 1),
  (107, 1),
  (110, 1),
  (114, 2),
  (116, 1),
  (117, 1),
  (120, 1),
  (121, 2),
  (124, 2),
  (125, 3),

In [9]:
# Test: pair each corpus entry with its index, converting every element of
# the entry into a list, then show the first two results.
a = [
    [doc_index, [list(pair) for pair in bow]]
    for doc_index, bow in enumerate(corpus)
]
print(a[0], "\n")
print(a[1])

[0, [[0, 21], [1, 4], [2, 5], [3, 18], [4, 4], [5, 4], [6, 1], [7, 10], [8, 2], [9, 2], [10, 3], [11, 1], [12, 18], [13, 2], [14, 4], [15, 14], [16, 3], [18, 1], [19, 6], [20, 5], [21, 3], [23, 5], [24, 2], [27, 1], [28, 3], [29, 3], [30, 3], [31, 1], [32, 1], [34, 2], [35, 3], [36, 1], [37, 1], [38, 5], [41, 1], [42, 4], [43, 2], [44, 3], [45, 19], [46, 4], [47, 4], [48, 5], [49, 3], [50, 4], [52, 3], [54, 2], [55, 6], [56, 5], [57, 3], [58, 8], [59, 2], [61, 4], [62, 3], [63, 1], [64, 1], [65, 3], [67, 5], [68, 6], [69, 1], [70, 5], [71, 5], [72, 2], [73, 3], [74, 5], [76, 1], [79, 2], [80, 4], [81, 1], [82, 1], [91, 7], [92, 3], [94, 3], [95, 4], [97, 4], [98, 3], [99, 14], [100, 2], [101, 1], [103, 2], [105, 2], [106, 1], [107, 1], [110, 1], [114, 2], [116, 1], [117, 1], [120, 1], [121, 2], [124, 2], [125, 3], [126, 16], [127, 2], [128, 1], [129, 8], [130, 6], [131, 8], [133, 2], [134, 1], [135, 18], [136, 6], [137, 2], [138, 1], [139, 7], [140, 2], [141, 18], [142, 4], [143, 4], [

In [10]:
# Build [index, entry] records for the whole corpus, with every element of
# an entry converted to a list.
result = [
    [position, list(map(list, entry))]
    for position, entry in enumerate(corpus)
]

In [11]:
# Write one line per record: the literal "null", a tab, then the record
# encoded as JSON.
with open(globals.DATA_PATH+'output_1_3_2.txt', 'w+') as out_file:
    out_file.writelines("null\t%s\n" % json.dumps(record) for record in result)