<a href="https://colab.research.google.com/github/joe-mcnealPW/jiraConnector/blob/main/discovery_health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install nltk
!pip install pyLDAvis

In [None]:
!pip install pyLDAvis==3.2.2


In [50]:
import numpy as np
import json
import glob
import pandas as pd

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
import nltk
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Helper Function Setup

In [8]:
def load_data(file_path):
  with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)
  return (data)

def write_data(file_path, data):
  with open(file_path, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4)

In [69]:
def unique_by_sh_msg(full_list):
    # initialize a null list
    unique_list = []
  
    # traverse for all elements
    for item in full_list:
      short_message = item["short_message"]

      if short_message not in unique_list:
        unique_list.append(short_message)
    
    return unique_list

# Data Cleanup

In [72]:
data = load_data("/content/primary_data/discovery_logs.json")

#df = pd.DataFrame(data=data)
#short_messages = df.loc[:,"short_message"]

unique_data = unique_by_sh_msg(data)
print(len(data))
print(len(unique_data))
  

24073
1940


# Generate Bag of Words

In [74]:
def lemmatization(logs, allowed_postags=["NOUN","ADJ", "VERB", "ADV", "PROPN"]):
  nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
  texts_out = []

  for log in logs:
    doc = nlp(log)
    new_text = []

    for token in doc:
      if token.pos_ in allowed_postags:
        new_text.append(token.lemma_)
      final = " ".join(new_text)
      texts_out.append(final)
  return (texts_out)

In [75]:
lemmatized_texts = lemmatization(unique_data)

In [76]:
print(lemmatized_texts[0:100])

['', 'host', 'host /partition_internal', 'host /partition_internal', 'host /partition_internal odiprod.trinet.com', 'host /partition_internal odiprod.trinet.com', 'host /partition_internal odiprod.trinet.com', 'host /partition_internal odiprod.trinet.com code]<a', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b target="_blank"><u>/partition_internal', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b target="_blank"><u>/partition_internal', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa

In [14]:
def gen_words(texts):
  final = []
  for text in texts: 
    new = gensim.utils.simple_preprocess(text)
    final.append(new)
  
  return final

In [77]:
data_words = gen_words(lemmatized_texts)

print(data_words[1000:5000])

[['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip', 'mon'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip', 'mon', 'sep'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip', 'mon', 'sep'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip', 'mon', 'sep'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip', 'mon', 'sep'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip', 'mon', 'sep'], ['mon', 'sep', 'so', 'warn', 'pid', 'tid', 'ah', 'module', 'already', 'load', 'skip', 'mon', 'sep'], ['mon', 'sep', 'so', 'warn', 'pid', 't

In [89]:
print(len(data_words))

67842


In [97]:
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
  return (bigram[doc] for doc in texts)

def make_trigrams(texts):
  return (trigram[bigram[doc]] for doc in texts)

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

In [100]:
phrases = bigram_phrases.export_phrases
print(b)

print(len(list(data_bigrams)))

<bound method Phrases.export_phrases of <gensim.models.phrases.Phrases object at 0x7f0dbe2af150>>
0


In [79]:
id2word = corpora.Dictionary(data_words)

In [80]:
corpus = []

for text in data_words:
  new = id2word.doc2bow(text)
  corpus.append(new)

print(corpus[0:100])

[[], [(0, 1)], [(0, 1)], [(0, 1)], [(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1

In [81]:
print(len(corpus))

67842


In [92]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto'
                                            )

In [93]:
pyLDAvis.enable_notebook()
print(pyLDAvis)
vis = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

<module 'pyLDAvis' from '/usr/local/lib/python3.7/dist-packages/pyLDAvis/__init__.py'>


  default_term_info = default_term_info.sort_values(
