# AI 전문가 교육과정 실습 2 - part 2

***
### NLP응용: 토픽 추출
Applied Natrual Language Processing: Topic Modeling

강사: 차미영 교수 (카이스트 전산학부)    
조교: 신민기, 정현규 (카이스트 전산학부)

# Gensim LDA, Visualization

In [1]:
!pip install pyLDAvis==3.2.2



In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# spacy for lemmatization
import spacy
import en_core_web_sm

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  from imp import reload


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AI_15\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AI_15\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AI_15\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Topic modeling with Gensim LDA

Gensim: Gensim is a free open-source Python library for representing documents as semantic vectors, as efficiently (computer-wise) and painlessly (human-wise) as possible. Gensim is designed to process raw, unstructured digital texts (“plain text”) using unsupervised machine learning algorithms.

More detailed information: https://radimrehurek.com/gensim/index.html

### Load data

In [4]:
dataset = fetch_20newsgroups(shuffle=True,
                            random_state=32,
                            remove=('headers', 'footers', 'qutes'))

In [5]:
news_df = pd.DataFrame({'News': dataset.data,
                       'Target': dataset.target})
news_df['Target_name'] = news_df['Target'].apply(lambda x: dataset.target_names[x])
news_df

Unnamed: 0,News,Target,Target_name
0,The real question here in my opinion is what M...,4,comp.sys.mac.hardware
1,Please could someone in the US give me the cur...,4,comp.sys.mac.hardware
2,Can somebody please help me with information a...,12,sci.electronics
3,In article <2077@rwing.UUCP> pat@rwing.UUCP (P...,16,talk.politics.guns
4,"From article <1pq6i2$a1f@news.ysu.edu>, by ak2...",7,rec.autos
...,...,...,...
11309,In article <1qvs9t$q3f@usenet.INS.CWRU.Edu> Ch...,4,comp.sys.mac.hardware
11310,"Hi,\nI've got a Multi I/O card (IDE controller...",3,comp.sys.ibm.pc.hardware
11311,\n As a person who has rarely even SEEN Do...,10,rec.sport.hockey
11312,>> So they should sue the newspaper I got it f...,1,comp.graphics


### Preprocessing

In [6]:
data = news_df.News.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [7]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
    
data_words = list(sent_to_words(data))
print(data_words[:1])

[['the', 'real', 'question', 'here', 'in', 'my', 'opinion', 'is', 'what', 'motorola', 'processors', 'running', 'system', 'on', 'mac', 'are', 'comparable', 'to', 'what', 'intel', 'processors', 'running', 'windows', 'on', 'pc', 'recall', 'there', 'being', 'conversation', 'here', 'that', 'running', 'windows', 'benchmarks', 'at', 'about', 'the', 'same', 'speed', 'as', 'mhz', 'in', 'system', 'dont', 'know', 'if', 'that', 'is', 'true', 'but', 'would', 'love', 'to', 'hear', 'if', 'anyone', 'has', 'any', 'technical', 'data', 'on', 'this', 'david']]


### Adding bigrams + trigrams

In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100) #

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[data_words[0]]])

['the', 'real', 'question', 'here', 'in', 'my', 'opinion', 'is', 'what', 'motorola', 'processors', 'running', 'system', 'on', 'mac', 'are', 'comparable', 'to', 'what', 'intel', 'processors', 'running', 'windows', 'on', 'pc', 'recall', 'there', 'being', 'conversation', 'here', 'that', 'running', 'windows', 'benchmarks', 'at', 'about', 'the', 'same', 'speed', 'as', 'mhz', 'in', 'system', 'dont', 'know', 'if', 'that', 'is', 'true', 'but', 'would', 'love', 'to', 'hear', 'if', 'anyone', 'has', 'any', 'technical', 'data', 'on', 'this', 'david']


In [9]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['real', 'question', 'opinion', 'processor', 'run', 'system', 'comparable', 'intel', 'processor', 'run', 'pc', 'recall', 'conversation', 'run', 'benchmark', 'speed', 'mhz', 'system', 'know', 'true', 'love', 'hear', 'technical', 'datum']]


### Create the dictionary and the corpus

In [11]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized

corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 3), (16, 1), (17, 2), (18, 1), (19, 1)]]


In [12]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('benchmark', 1),
  ('comparable', 1),
  ('conversation', 1),
  ('datum', 1),
  ('hear', 1),
  ('intel', 1),
  ('know', 1),
  ('love', 1),
  ('mhz', 1),
  ('opinion', 1),
  ('pc', 1),
  ('processor', 2),
  ('question', 1),
  ('real', 1),
  ('recall', 1),
  ('run', 3),
  ('speed', 1),
  ('system', 2),
  ('technical', 1),
  ('true', 1)]]

### Build LDA model

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=2,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.071*"internal" + 0.047*"creation" + 0.047*"mb" + 0.034*"brand" + '
  '0.032*"external" + 0.031*"justify" + 0.031*"split" + 0.029*"category" + '
  '0.028*"status" + 0.022*"usenet"'),
 (1,
  '0.029*"build" + 0.026*"return" + 0.026*"number" + 0.024*"player" + '
  '0.022*"chip" + 0.020*"point" + 0.016*"line" + 0.012*"date" + 0.012*"round" '
  '+ 0.011*"charge"'),
 (2,
  '0.077*"ground" + 0.044*"fall" + 0.039*"land" + 0.034*"center" + '
  '0.030*"city" + 0.030*"input" + 0.029*"obvious" + 0.027*"russian" + '
  '0.023*"edge" + 0.020*"item"'),
 (3,
  '0.053*"mhz" + 0.045*"cell" + 0.044*"investigation" + 0.027*"processor" + '
  '0.023*"sheet" + 0.019*"socket" + 0.019*"app" + 0.017*"intel" + '
  '0.016*"diamond" + 0.013*"wipe"'),
 (4,
  '0.026*"believe" + 0.025*"say" + 0.024*"argument" + 0.022*"christian" + '
  '0.022*"claim" + 0.019*"true" + 0.017*"evidence" + 0.016*"man" + '
  '0.015*"life" + 0.015*"conclusion"'),
 (5,
  '0.142*"key" + 0.076*"image" + 0.048*"compile" + 0.042*"avoid" 

### Evaluate the LDA model

In [15]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -13.528916135797832


### Visualization

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

### Choose the number of topics

In [None]:
from tqdm import tqdm

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        model =  gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=2,
                                           alpha='auto',
                                           per_word_topics=True)
        
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=35, step=6)

In [None]:
limit=35; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

### Finding the dominant topic in each sentence

In [None]:
optimal_model = model_list[1]

In [None]:
optimal_model.per_word_topics = False

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

In [None]:
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [None]:
df_dominant_topic.head(50)