# LDA
John Rodriguez, 2020-02-27

In [1]:
logan_path = '../../../'
import sys
sys.path.append(logan_path)
from parlogan.obs import OBs_between
from parlogan.info import error_counting
from parlogan.db.es import EmptyElasticQuery
from parlogan.color import paranal
import pandas as pd

In [41]:
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaMulticore
from sklearn.model_selection import train_test_split
from numpy.random import seed
seed(2020)

## Getting the OBs

In [36]:
end = pd.to_datetime('2020-02-24 00:00')
start = end - pd.Timedelta('15 day')
instr = 'GRAVITY'
OBs = OBs_between(instr, start, end)

In [37]:
OBs

Unnamed: 0,pauses,OBS.NAME,OBS.ID,ACK ABORT,Aborted,START,END,Seconds,End_with_error
0,0,M05_HD18071_MED_SPLIT,200453530,False,True,2020-02-09 00:11:11.199000+00:00,2020-02-09 00:12:29.870000+00:00,43,True
1,0,M05_HD18071_MED_SPLIT,200453530,False,True,2020-02-09 00:13:35.005000+00:00,2020-02-09 00:13:46.716000+00:00,4,True
2,0,M05_HD18071_MED_SPLIT,200453530,False,False,2020-02-09 00:23:48.533000+00:00,2020-02-09 00:46:42.482000+00:00,1374,False
3,0,M05_HD28625_MED_SPLIT,200454133,False,False,2020-02-09 00:52:23.640000+00:00,2020-02-09 01:05:07.438000+00:00,764,False
4,0,B02_HD62902_MED_SPLIT,200452405,True,True,2020-02-09 02:22:34.039000+00:00,2020-02-09 02:51:11.212000+00:00,1717,False
...,...,...,...,...,...,...,...,...,...
220,0,Calibration,-1,False,True,2020-02-23 21:06:03.834000+00:00,2020-02-23 21:06:41.436000+00:00,29,True
221,0,Calibration,-1,False,True,2020-02-23 21:08:25.213000+00:00,2020-02-23 21:08:30.847000+00:00,4,True
222,0,Calibration,-1,False,True,2020-02-23 21:18:17.111000+00:00,2020-02-23 21:19:37.022000+00:00,70,True
223,0,Calibration,-1,False,True,2020-02-23 21:27:59.139000+00:00,2020-02-23 21:29:59.235000+00:00,41,True


## Getting the error documents

In [38]:
docs = []
for index, row in OBs.iterrows():
    try:
        errors = error_counting(
            system = instr,
            start = row['START'],
            end = row['END'],
            group_by_time = False,
            group_keys = ['errkey']
        )
        errors_document = ' '.join(errors['errkey'].to_list())
        docs.append(errors_document)
    except EmptyElasticQuery:
        continue
    
documents = pd.DataFrame(docs, columns = ['errors'])

In [40]:
documents.shape

(214, 1)

In [67]:
processed_docs = documents['errors'].str.split()
docs_train, docs_test = train_test_split(processed_docs, test_size=0.2)
docs_train.reset_index(drop = True, inplace = True)
docs_test.reset_index(drop = True, inplace = True)

In [68]:
docs_train

0      [ccsERR_DB_INV_NAME, eccsERR_INFOATTR, eccsERR...
1      [ccsERR_DB_INV_NAME, ccsERR_MSG_TIMEOUT, eccsE...
2      [ccsERR_DB_INV_NAME, ccsERR_MSG_TIMEOUT, eccsE...
3      [ccsERR_DB_INV_NAME, ccsERR_DB_QUALITY, eccsER...
4      [ccsERR_DB_INV_NAME, eccsERR_INFOATTR, eccsERR...
                             ...                        
166    [evhERR_CMD_ERR_REPLY, gvmonERR_CMD_EXEC, gvmo...
167       [ic0lcuERR_SEM_CMD_LOCKED, lcctooERR_SEM_TAKE]
168    [ccsERR_DB_INV_NAME, eccsERR_INFOATTR, eccsERR...
169    [ccsERR_DB_INV_NAME, eccsERR_INFOATTR, eccsERR...
170    [bossERR_INVALID_STATE, bossERR_SETUP_FAILURE,...
Name: errors, Length: 171, dtype: object

## Dictionary

In [69]:
dictionary = Dictionary(processed_docs)

In [70]:
for k, v in dictionary.iteritems():
    print(k, v)

0 bossERR_INVALID_STATE
1 bossERR_SETUP_FAILURE
2 ccsERR_DB_INV_NAME
3 eccsERR_INFOATTR
4 eccsERR_NOASSIGN
5 eccsERR_READATTR
6 eccsERR_WRITEATTR
7 evhERR_CMD_ERR_REPLY
8 evhERR_ERROR_HANDLING
9 ic0fbERR_GEN
10 ic0lcuERR_SEM_CMD_LOCKED
11 issprsERR_COMMAND_FAILED
12 issprsERR_NOTALLOWED_IN_STATE
13 issprsERR_WAIT_READY
14 lccERR_INV_NAME
15 lcctooERR_SEM_TAKE
16 ccsERR_CMD_PARAM_RANGEVAL
17 eccsERR_MSG
18 seqERR_ADD
19 seqERR_NO_EVT_HANDLE
20 bossERR_EXP_EVT
21 bossERR_GENERAL
22 bossERR_PREP_ARCH
23 ccsERR_DB_LCU
24 ccsERR_DB_QUALITY
25 ccsERR_MSG_TIMEOUT
26 ccsERR_REMOTE_LINK
27 ccsERR_SCAN_MSG_SEND
28 gvoERR_ADD_FITS_KEYW
29 gvoERR_FILE_DOES_NOT_EXIST
30 lccERR_ACK
31 lccERR_FULL
32 lccERR_INTERNAL
33 seqERR_DB_READ_SYMBOLIC
34 ccsERR_PROC_INFO
35 gvacqERR_READ_DBLIST
36 gvdlERR_LCU_REPLY
37 bossERR_EXP_ABORTED
38 ipclERR_SEM_TIMEOUT
39 ipclERR_SEM_WAIT
40 seqERR_REPLY_TIMEOUT
41 gvacqERR_RTDCORE_LIB
42 gvttpERR_CONF_MAP
43 dlabmonERR_TCPCLIENT_CONNECT
44 gvmonERR_CMD_EXEC
45 gvmonE

In [12]:
#dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

## Corpus

In [71]:
bow_corpus = [dictionary.doc2bow(doc) for doc in docs_train]

In [72]:
bow_corpus

[[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (9, 1), (14, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (14, 1), (25, 1), (40, 1)],
 [(2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (10, 1),
  (15, 1),
  (25, 1),
  (38, 1),
  (39, 1),
  (53, 1),
  (58, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (8, 1), (14, 1), (24, 1), (36, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (53, 1)],
 [(9, 1), (10, 1), (15, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (10, 1), (14, 1), (15, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (10, 1), (14, 1), (15, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (10, 1), (14, 1), (15, 1), (40, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (25, 1), (53, 1)],
 [(10, 1), (15, 1), (34, 1), (35, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (14, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (14, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (9, 1), (14, 1)],
 [(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8

In [73]:
tfidf = TfidfModel(bow_corpus)

In [74]:
corpus_tfidf = tfidf[bow_corpus]

## LDA using Bag of words

### Training

In [86]:
lda_model = LdaMulticore(bow_corpus, num_topics = 5, id2word = dictionary, passes=2, workers=2)

In [87]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}/n'.format(idx, topic))

Topic: 0 
Words: 0.174*"lcctooERR_SEM_TAKE" + 0.174*"ic0lcuERR_SEM_CMD_LOCKED" + 0.062*"eccsERR_READATTR" + 0.062*"eccsERR_WRITEATTR" + 0.062*"eccsERR_NOASSIGN" + 0.061*"ccsERR_DB_INV_NAME" + 0.061*"eccsERR_INFOATTR" + 0.041*"lccERR_NOT_FOUND" + 0.034*"seqERR_ADD" + 0.031*"evhERR_ERROR_HANDLING"/n
Topic: 1 
Words: 0.093*"ic0fbERR_GEN" + 0.066*"ic0lcuERR_SEM_CMD_LOCKED" + 0.065*"lcctooERR_SEM_TAKE" + 0.061*"evhERR_CMD_ERR_REPLY" + 0.053*"bossERR_INVALID_STATE" + 0.052*"bossERR_SETUP_FAILURE" + 0.047*"ccsERR_PROC_INFO" + 0.042*"lccERR_INV_NAME" + 0.042*"eccsERR_WRITEATTR" + 0.042*"ccsERR_DB_INV_NAME"/n
Topic: 2 
Words: 0.125*"eccsERR_NOASSIGN" + 0.125*"eccsERR_INFOATTR" + 0.125*"ccsERR_DB_INV_NAME" + 0.124*"eccsERR_READATTR" + 0.124*"eccsERR_WRITEATTR" + 0.115*"lccERR_INV_NAME" + 0.053*"gvdlERR_LCU_REPLY" + 0.041*"lcctooERR_SEM_TAKE" + 0.041*"ic0lcuERR_SEM_CMD_LOCKED" + 0.022*"seqERR_REPLY_TIMEOUT"/n
Topic: 3 
Words: 0.082*"ipclERR_SEM_TIMEOUT" + 0.081*"ipclERR_SEM_WAIT" + 0.069*"ccsERR_

### Evaluation

In [88]:
doc_to_eval = 10
print(docs_train[doc_to_eval])
for index, score in sorted(lda_model[bow_corpus[doc_to_eval]], key=lambda tup: -1*tup[1]):
    print(f"\nScore: {score}\t \nTopic: {lda_model.print_topic(index, 10)}")

['ccsERR_PROC_INFO', 'gvacqERR_READ_DBLIST', 'ic0lcuERR_SEM_CMD_LOCKED', 'lcctooERR_SEM_TAKE']

Score: 0.8372372984886169	 
Topic: 0.093*"ic0fbERR_GEN" + 0.066*"ic0lcuERR_SEM_CMD_LOCKED" + 0.065*"lcctooERR_SEM_TAKE" + 0.061*"evhERR_CMD_ERR_REPLY" + 0.053*"bossERR_INVALID_STATE" + 0.052*"bossERR_SETUP_FAILURE" + 0.047*"ccsERR_PROC_INFO" + 0.042*"lccERR_INV_NAME" + 0.042*"eccsERR_WRITEATTR" + 0.042*"ccsERR_DB_INV_NAME"

Score: 0.04193763807415962	 
Topic: 0.174*"lcctooERR_SEM_TAKE" + 0.174*"ic0lcuERR_SEM_CMD_LOCKED" + 0.062*"eccsERR_READATTR" + 0.062*"eccsERR_WRITEATTR" + 0.062*"eccsERR_NOASSIGN" + 0.061*"ccsERR_DB_INV_NAME" + 0.061*"eccsERR_INFOATTR" + 0.041*"lccERR_NOT_FOUND" + 0.034*"seqERR_ADD" + 0.031*"evhERR_ERROR_HANDLING"

Score: 0.040440674871206284	 
Topic: 0.082*"ipclERR_SEM_TIMEOUT" + 0.081*"ipclERR_SEM_WAIT" + 0.069*"ccsERR_DB_INV_NAME" + 0.069*"eccsERR_INFOATTR" + 0.066*"eccsERR_READATTR" + 0.066*"eccsERR_WRITEATTR" + 0.066*"eccsERR_NOASSIGN" + 0.064*"ccsERR_MSG_TIMEOUT" + 

### Testing

In [89]:
doc_to_test = 10
bow_vector = dictionary.doc2bow(docs_test[doc_to_test])
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}\n".format(score, lda_model.print_topic(index, 5)))

Score: 0.5622594952583313	 Topic: 0.082*"ipclERR_SEM_TIMEOUT" + 0.081*"ipclERR_SEM_WAIT" + 0.069*"ccsERR_DB_INV_NAME" + 0.069*"eccsERR_INFOATTR" + 0.066*"eccsERR_READATTR"

Score: 0.41174831986427307	 Topic: 0.129*"lccERR_INV_NAME" + 0.127*"eccsERR_WRITEATTR" + 0.127*"eccsERR_READATTR" + 0.127*"eccsERR_INFOATTR" + 0.127*"eccsERR_NOASSIGN"



## LDA using TF-IDF

### Training

In [79]:
lda_model_tfidf = LdaMulticore(corpus_tfidf, num_topics=5, id2word = dictionary, passes=2, workers=4)

In [81]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.081*"bossERR_INVALID_STATE" + 0.081*"bossERR_SETUP_FAILURE" + 0.067*"issprsERR_COMMAND_FAILED" + 0.045*"evhERR_CMD_ERR_REPLY" + 0.033*"issprsERR_WAIT_READY" + 0.032*"ic0senERR_COOLING" + 0.032*"ic0senERR_COM_TIMEOUT" + 0.030*"lcctooERR_SEM_TAKE" + 0.030*"ic0lcuERR_SEM_CMD_LOCKED" + 0.029*"ic0fbERR_GEN"

Topic: 1 Word: 0.119*"lccERR_INV_NAME" + 0.096*"eccsERR_INFOATTR" + 0.096*"eccsERR_WRITEATTR" + 0.096*"ccsERR_DB_INV_NAME" + 0.096*"eccsERR_READATTR" + 0.095*"eccsERR_NOASSIGN" + 0.063*"ccsERR_MSG_TIMEOUT" + 0.057*"seqERR_REPLY_TIMEOUT" + 0.040*"ipclERR_SEM_WAIT" + 0.040*"ipclERR_SEM_TIMEOUT"

Topic: 2 Word: 0.207*"ic0lcuERR_SEM_CMD_LOCKED" + 0.207*"lcctooERR_SEM_TAKE" + 0.054*"lccERR_NOT_FOUND" + 0.051*"gvdlERR_LCU_REPLY" + 0.045*"lccERR_INV_NAME" + 0.044*"eccsERR_READATTR" + 0.044*"eccsERR_WRITEATTR" + 0.044*"ccsERR_DB_INV_NAME" + 0.044*"eccsERR_NOASSIGN" + 0.044*"eccsERR_INFOATTR"

Topic: 3 Word: 0.140*"ic0fbERR_GEN" + 0.062*"evhERR_ERROR_HANDLING" + 0.060*"lccERR_IN

### Evaluation

In [83]:
doc_to_eval = 10
print(docs_train[doc_to_eval])
for index, score in sorted(lda_model_tfidf[bow_corpus[doc_to_eval]], key=lambda tup: -1*tup[1]):
    print(f"\nScore: {score}\t \nTopic: {lda_model.print_topic(index, 10)}")

['ccsERR_PROC_INFO', 'gvacqERR_READ_DBLIST', 'ic0lcuERR_SEM_CMD_LOCKED', 'lcctooERR_SEM_TAKE']

Score: 0.8390069603919983	 
Topic: 0.282*"gvdlERR_LCU_REPLY" + 0.072*"bossERR_CMD_FAILED" + 0.052*"bossERR_SUBSYSTEM_REPLY" + 0.052*"ic0fbERR_DEVICE_OP" + 0.052*"ic0fbERR_GEN" + 0.052*"evhERR_CMD_ERR_REPLY" + 0.034*"ccsERR_DB_QUALITY" + 0.024*"ccsERR_MSG_TIMEOUT" + 0.024*"bossERR_START_PREPROC" + 0.024*"issifERR_READ_REMOTE"

Score: 0.04080135002732277	 
Topic: 0.120*"ccsERR_PROC_INFO" + 0.096*"ic0lcuERR_SEM_CMD_LOCKED" + 0.096*"lcctooERR_SEM_TAKE" + 0.092*"gvacqERR_READ_DBLIST" + 0.052*"ic0fbERR_GEN" + 0.041*"ipclERR_SEM_WAIT" + 0.041*"ipclERR_SEM_TIMEOUT" + 0.032*"issprsERR_COMMAND_FAILED" + 0.031*"bossERR_SUBSYSTEM_REPLY" + 0.031*"ic0ERR_GENERAL"

Score: 0.04010041803121567	 
Topic: 0.131*"eccsERR_NOASSIGN" + 0.131*"eccsERR_INFOATTR" + 0.131*"eccsERR_READATTR" + 0.131*"ccsERR_DB_INV_NAME" + 0.131*"eccsERR_WRITEATTR" + 0.116*"lccERR_INV_NAME" + 0.039*"lcctooERR_SEM_TAKE" + 0.039*"ic0lcuERR

### Testing

In [91]:
doc_to_test = 10
bow_vector = dictionary.doc2bow(docs_test[doc_to_test])
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}\n".format(score, lda_model.print_topic(index, 5)))

Score: 0.5379564166069031	 Topic: 0.129*"lccERR_INV_NAME" + 0.127*"eccsERR_WRITEATTR" + 0.127*"eccsERR_READATTR" + 0.127*"eccsERR_INFOATTR" + 0.127*"eccsERR_NOASSIGN"

Score: 0.21034331619739532	 Topic: 0.093*"ic0fbERR_GEN" + 0.066*"ic0lcuERR_SEM_CMD_LOCKED" + 0.065*"lcctooERR_SEM_TAKE" + 0.061*"evhERR_CMD_ERR_REPLY" + 0.053*"bossERR_INVALID_STATE"

Score: 0.14819219708442688	 Topic: 0.082*"ipclERR_SEM_TIMEOUT" + 0.081*"ipclERR_SEM_WAIT" + 0.069*"ccsERR_DB_INV_NAME" + 0.069*"eccsERR_INFOATTR" + 0.066*"eccsERR_READATTR"

Score: 0.09489299356937408	 Topic: 0.125*"eccsERR_NOASSIGN" + 0.125*"eccsERR_INFOATTR" + 0.125*"ccsERR_DB_INV_NAME" + 0.124*"eccsERR_READATTR" + 0.124*"eccsERR_WRITEATTR"

