In [None]:
!pip install fastText

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastText
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 5.7 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.1-py3-none-any.whl (216 kB)
Building wheels for collected packages: fastText
  Building wheel for fastText (setup.py) ... [?25l[?25hdone
  Created wheel for fastText: filename=fasttext-0.9.2-cp38-cp38-linux_x86_64.whl size=3133868 sha256=f0d8cf525b937db0dc362ba3721b368f2c9c946a466bfd8898cb62384309f718
  Stored in directory: /root/.cache/pip/wheels/93/61/2a/c54711a91c418ba06ba195b1d78ff24fcaad8592f2a694ac94
Successfully built fastText
Installing collected packages: pybind11, fastText
Successfully installed fastText-0.9.2 pybind11-2.10.1


In [None]:
from tqdm import tqdm
import csv
import pickle
import re
import random
import fasttext
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
import gensim
import argparse

In [None]:
class SemanticVec:
    """Turn tokenised templates into TF-IDF-weighted sums of fastText word vectors.

    The pipeline implemented by :meth:`run` is:

    1. load a pickled mapping ``template id -> list of words`` from ``t2wFN``;
    2. drop the words listed in ``self.stop_words`` (:meth:`preprocess`);
    3. embed every remaining word with the fastText model at ``w2vmodel``
       (:meth:`word2vec`);
    4. fit TF-IDF weights over the filtered templates (:meth:`calTFIDF`);
    5. for each template, sum ``tfidf_weight(word) * vector(word)``.

    Args:
        w2vmodel: Path handed to ``fasttext.load_model``.
        t2wFN: Path of the pickle file holding the template-to-words mapping.
        embed_dim: Length of the word vectors; also the length of every
            returned sentence vector.
    """

    def __init__(self, w2vmodel, t2wFN, embed_dim):
        self.embed_dim = embed_dim
        self.w2vmodel = w2vmodel
        self.t2wFN = t2wFN
        # Words removed from every template before embedding and TF-IDF fitting.
        self.stop_words = ['a', 'the']

    def csr_todense(self, n_row, n_col, Ap, Aj, Ax):
        """Expand a CSR-encoded matrix into a dense ``(n_row, n_col)`` array.

        Args:
            n_row: Number of rows of the matrix.
            n_col: Number of columns of the matrix.
            Ap: Row pointer array; the entries of row ``i`` are stored at
                positions ``Ap[i] .. Ap[i + 1] - 1`` of ``Aj`` / ``Ax``.
            Aj: Column index of each stored entry.
            Ax: Value of each stored entry.

        Returns:
            A float ``numpy.ndarray``; entries that share the same
            ``(row, column)`` position are summed.
        """
        result = np.zeros([n_row, n_col])
        for i in range(n_row):
            for j in range(Ap[i], Ap[i + 1]):
                # Address the single cell (row i, column Aj[j]). Indexing with
                # the column alone would add the value to an entire row.
                result[i, Aj[j]] += Ax[j]
        return result

    def run(self):
        """Execute the whole pipeline.

        Returns:
            dict mapping each template id to a ``float64`` vector of length
            ``embed_dim``. Words that are missing from the TF-IDF vocabulary
            contribute nothing to the sum.
        """
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # t2wFN at files you trust.
        with open(self.t2wFN, 'rb') as fh:
            t2w = pickle.load(fh)

        self.preprocess(t2w)
        if not self.t2w_filter:
            # Nothing to embed: skip loading the embedding model entirely.
            self.vecs = dict()
            return dict()

        self.word2vec()
        weight, vectorizer = self.calTFIDF()
        vocabulary = vectorizer.vocabulary_

        senvec = dict()
        print('Semantic Vectorization...')
        # Row r of `weight` belongs to the r-th template because both the
        # corpus (calTFIDF) and self.vecs (word2vec) are built by iterating
        # self.t2w_filter in the same order.
        for row, (idx, wvs) in enumerate(tqdm(self.vecs.items())):
            tfidf_row = weight[row]
            # Pair vectors with the *filtered* words they were computed from;
            # the unfiltered list is shifted whenever a stop word was dropped.
            words = self.t2w_filter[idx]
            v = np.zeros(self.embed_dim)
            for word, wv in zip(words, wvs):
                # NOTE(review): the vectorizer is created with default
                # settings, so its vocabulary keys may be normalised
                # differently from the raw word (case, punctuation); such
                # words get no weight here - confirm this is intended.
                col = vocabulary.get(word)
                if col is None:
                    continue
                # `is None` rather than a type check so both Python and NumPy
                # integer indices are accepted.
                v = v + tfidf_row[col] * np.asarray(wv)
            senvec[idx] = v.astype('float64')
        return senvec

    def preprocess(self, t2w):
        """Store a stop-word-free copy of ``t2w`` in ``self.t2w_filter``.

        Args:
            t2w: Mapping ``template id -> iterable of words``.
        """
        print('Preprocessing...')
        self.t2w_filter = {
            t: [word for word in words if word not in self.stop_words]
            for t, words in tqdm(t2w.items())
        }

    def word2vec(self):
        """Embed every filtered word and store the result in ``self.vecs``.

        ``self.vecs`` maps each template id to a list holding one vector per
        word, in the same order as ``self.t2w_filter[id]``.
        """
        print('Word Embedding...')
        # pre-trained on Common Crawl Corpus dataset using the FastText algorithm
        model = fasttext.load_model(self.w2vmodel)
        self.vecs = {
            idx: [model[word] for word in words]
            for idx, words in tqdm(self.t2w_filter.items())
        }

    def calTFIDF(self):
        """Fit TF-IDF over the filtered templates.

        Each template is joined with single spaces into one document.

        Returns:
            ``(weight, vectorizer)`` where ``weight`` is the dense TF-IDF
            array with one row per template (in ``self.t2w_filter`` order) and
            ``vectorizer`` is the fitted ``CountVectorizer`` whose
            ``vocabulary_`` maps terms to column indices of ``weight``.
        """
        print('Calculating TFIDF weight...')
        corpus = [' '.join(words) for words in self.t2w_filter.values()]
        vectorizer = CountVectorizer()
        counts = vectorizer.fit_transform(corpus)
        tfidf = TfidfTransformer().fit_transform(counts)
        weight = tfidf.toarray()
        return weight, vectorizer

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
!unzip crawl-300d-2M-subword.zip

--2022-12-08 19:33:28--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5828358084 (5.4G) [application/zip]
Saving to: ‘crawl-300d-2M-subword.zip’


2022-12-08 19:43:23 (9.36 MB/s) - ‘crawl-300d-2M-subword.zip’ saved [5828358084/5828358084]

Archive:  crawl-300d-2M-subword.zip
  inflating: crawl-300d-2M-subword.vec  
  inflating: crawl-300d-2M-subword.bin  


In [None]:
# Build the sentence vectors for the parsed templates and persist them as a pickle.
# `ratio` is only used by the alternative HDFS paths kept in the comments below.
ratio = 1
w2vmodelPath = '/content/crawl-300d-2M-subword.bin'
t2wPath = '/content/sample_data/templates.pkl'
# t2wPath = '../data/hdfs_' + str(ratio) + '/my_hdfs_' + str(ratio) + '_template2words.pkl'
sentence2vecPath = '/content/sample_data/sentence2vec.pkl'
# sentence2vecPath = '../data/hdfs_' + str(ratio) + '/sentence2vec.pkl'
embed_dim = 300  # must equal the vector length of the model at w2vmodelPath

model = SemanticVec(w2vmodelPath, t2wPath, embed_dim)
vecs = model.run()

# Context manager guarantees the pickle is flushed and the handle is closed.
with open(sentence2vecPath, 'wb') as out_file:
    pickle.dump(vecs, out_file)


Preprocessing...


100%|██████████| 112/112 [00:00<00:00, 29716.73it/s]

0
['instruction', 'cache', 'parity', 'error', 'corrected']
1
['<*>', 'double-hummer', 'alignment', 'exceptions']
2
['CE', 'sym', '<*>', ',', 'at', '<*>', ',', 'mask', '<*>']
3
['ciod', ':', 'failed', 'to', 'read', 'message', 'prefix', 'on', 'control', 'stream', '(CioStream', 'socket', 'to', '<*>']
4
['generating', '<*>']
5
['force', 'load/store', 'alignment...............0']
6
['ciod', ':', 'cpu', '<*>', 'at', 'treeaddr', '<*>', 'sent', 'unrecognized', 'message', '0xffffffff']
7
['ciod', ':', 'LOGIN', '<*>', 'failed', ':', 'No', 'such', 'file', 'or', 'directory']
8
['<*>', 'ddr', 'errors(s)', 'detected', 'and', 'corrected', 'on', 'rank', '<*>', ',', 'symbol', '<*>', ',', 'bit', '<*>']
9
['data', 'TLB', 'error', 'interrupt']
10
['ciod', ':', 'Message', 'code', '<*>', 'is', 'not', '<*>', 'or', '<*>']
11
['data', 'storage', 'interrupt']
12
['instruction', 'address', '<*>', '<*>']
13
['data', 'address', '<*>', '<*>']
14
['machine', 'check', '<*>', '<*>']
15
['program', 'interrupt', ':', 'i


100%|██████████| 112/112 [00:00<00:00, 4864.52it/s]


Calculating TFIDF weight...
Semantic Vectorization...


100%|██████████| 112/112 [00:00<00:00, 1731.47it/s]

float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64





In [None]:
pip install bert-serving-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-serving-client
  Downloading bert_serving_client-1.10.0-py2.py3-none-any.whl (28 kB)
Installing collected packages: bert-serving-client
Successfully installed bert-serving-client-1.10.0
