<a href="https://colab.research.google.com/github/joe-mcnealPW/jiraConnector/blob/main/discovery_health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install nltk
!pip install pyLDAvis

In [None]:
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
#import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
nltk.download("stopwords")

# Helper Function Setup

In [None]:
def load_data(file_path):
  with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)
  return (data)

def write_data(file_path, data):
  with open(file_path, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4)

# Data Cleanup

In [None]:
data = load_data("/content/primary_data/discovery_logs.json")

print(data[0]['short_message'])
print(stopwords)

# Generate Bag of Words

In [76]:
def lemmatization(logs, allowed_postags=["NOUN","ADJ", "VERB", "ADV", "PROPN"]):
  nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
  texts_out = []

  for log in logs:
    short_message = log['short_message']
    doc = nlp(short_message)
    new_text = []

    for token in doc:
      if token.pos_ in allowed_postags:
        new_text.append(token.lemma_)
      final = " ".join(new_text)
      texts_out.append(final)
  return (texts_out)

lemmatized_texts = lemmatization(data)

print(lemmatized_texts[0])




In [74]:
def unique(full_list):
    # initialize a null list
    unique_list = []
  
    # traverse for all elements
    for item in full_list:
        # check if exists in unique_list or not
        if item not in unique_list:
            unique_list.append(item)
    
    return unique_list



In [78]:
print(lemmatized_texts[0:100])

['', 'host', 'host /partition_internal', 'host /partition_internal', 'host /partition_internal odiprod.trinet.com', 'host /partition_internal odiprod.trinet.com', 'host /partition_internal odiprod.trinet.com', 'host /partition_internal odiprod.trinet.com code]<a', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b target="_blank"><u>/partition_internal', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa5ed144b02b target="_blank"><u>/partition_internal', 'host /partition_internal odiprod.trinet.com code]<a href="/nav_to.do?uri=%2fcmdb_ci.do%3fsys_id%3dcb8600d71353db840af6fa

In [81]:
def gen_words(texts):
  final = []
  for text in texts: 
    new = gensim.utils.simple_preprocess(text)
    final.append(new)
  
  return final

In [83]:
data_words = gen_words(lemmatized_texts)

print(data_words[1000:5000])

[['add', 'target', 'blacklist', 'valid'], ['add', 'target', 'blacklist', 'valid', 'credential'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type', 'ssh'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type', 'ssh', 'password'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type', 'ssh', 'password'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type', 'ssh', 'password', 'ssh'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type', 'ssh', 'password', 'ssh', 'private'], ['add', 'target', 'blacklist', 'valid', 'credential', 'find', 'type', 'ssh', 'password', 'ssh', 'private', 'key'], ['add', 'target', 'blacklist', 'valid', 'credential', 'fi

In [84]:
id2word = corpora.Dictionary(data_words)

In [85]:
corpus = []

for text in data_words:
  new = id2word.doc2bow(text)
  corpus.append(new)

print(corpus[0:100])

[[], [(0, 1)], [(0, 1)], [(0, 1)], [(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]


In [89]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=30,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto'
                                            )