In [40]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Configure the root logger: "time : level : message" format, ERROR records and above only
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Install a filter that ignores DeprecationWarning messages
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


## 引入 NLTK stopwords

In [41]:
# English stop words from NLTK, plus a few extra tokens to filter out
from nltk.corpus import stopwords

EXTRA_STOP_WORDS = ['from', 'subject', 're', 'edu', 'use']
stop_words = stopwords.words('english') + EXTRA_STOP_WORDS

# Show the combined list and how many entries it has
print(stop_words)
print(len(stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [42]:
# Toy corpus: three English paragraphs (they read like research-paper abstracts),
# one document per list element. Later cells clean, tokenize and model these.
data = [
    'Convolutional neural network (CNN) delivers impressive achievements in computer vision and machine learning field. However, CNN incurs high computational complexity, especially for vision quality applications because of large image resolution. In this paper, we propose an iterative architecture-aware pruning algorithm with adaptive magnitude threshold while cooperating with quality-metric measurement simultaneously. We show the performance improvement applied on vision quality applications and provide comprehensive analysis with flexible pruning configuration. With the proposed method, the Multiply-Accumulate (MAC) of state-of-the-art low-light imaging (SID) and super-resolution (EDSR) are reduced by 58% and 37% without quality drop, respectively. The memory bandwidth (BW) requirements of convolutional layer can be also reduced by 20% to 40%.',
    'Traditional image signal processing (ISP) pipeline consists of a set of individual image processing components onboard a camera to reconstruct a high-quality sRGB image from the sensor raw data. Due to the hand-crafted nature of the ISP components, traditional ISP pipeline has limited reconstruction quality under challenging scenes. Recently, the convolutional neural networks (CNNs) have demonstrated their competitiveness in solving many individual image processing problems, such as image denoising, demosaicking, white balance and contrast enhancement. However, it remains a question whether a CNN model can address the multiple tasks inside an ISP pipeline simultaneously. We make a good attempt along this line and propose a novel framework, which we call CameraNet, for effective and general ISP pipeline learning. The CameraNet is composed of two CNN modules to account for two sets of relatively uncorrelated subtasks in an ISP pipeline: restoration and enhancement. To train the two-stage CameraNet model, we specify two groundtruths that can be easily created in the common workflow of photography. CameraNet is trained to progressively address the restoration and the enhancement subtasks with its two modules. Experiments show that the proposed CameraNet achieves consistently compelling reconstruction quality on three benchmark datasets and outperforms traditional ISP pipelines.',
    'We present ViLBERT (short for Vision-and-Language BERT), a model for learning task-agnostic joint representations of image content and natural language. We extend the popular BERT architecture to a multi-modal two-stream model, pro-cessing both visual and textual inputs in separate streams that interact through co-attentional transformer layers. We pretrain our model through two proxy tasks on the large, automatically collected Conceptual Captions dataset and then transfer it to multiple established vision-and-language tasks -- visual question answering, visual commonsense reasoning, referring expressions, and caption-based image retrieval -- by making only minor additions to the base architecture. We observe significant improvements across tasks compared to existing task-specific models -- achieving state-of-the-art on all four tasks. Our work represents a shift away from learning groundings between vision and language only as part of task training and towards treating visual grounding as a pretrainable and transferable capability.'
]

## 做 Tokenization

In [43]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. Per the gensim
    documentation, ``deacc`` strips accent marks from tokens; punctuation is
    dropped by the tokenizer itself.

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

In [44]:
## data pre-processing

# Remove single quotes / apostrophes (the original pattern "\'" is just the
# character '), so e.g. "don't" becomes "dont" before tokenization.
data = [sent.replace("'", "") for sent in data]

# Collapse every run of whitespace (spaces, tabs, newlines) into one space.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
data = [re.sub(r'\s+', ' ', sent) for sent in data]


In [45]:
# Pretty-print only the first cleaned document as a sanity check
pprint(data[:1])

['Convolutional neural network (CNN) delivers impressive achievements in '
 'computer vision and machine learning field. However, CNN incurs high '
 'computational complexity, especially for vision quality applications because '
 'of large image resolution. In this paper, we propose an iterative '
 'architecture-aware pruning algorithm with adaptive magnitude threshold while '
 'cooperating with quality-metric measurement simultaneously. We show the '
 'performance improvement applied on vision quality applications and provide '
 'comprehensive analysis with flexible pruning configuration. With the '
 'proposed method, the Multiply-Accumulate (MAC) of state-of-the-art low-light '
 'imaging (SID) and super-resolution (EDSR) are reduced by 58% and 37% without '
 'quality drop, respectively. The memory bandwidth (BW) requirements of '
 'convolutional layer can be also reduced by 20% to 40%.']


In [46]:
# sent_to_words is a generator; materialize it into one token list per document
data_words = list(sent_to_words(data))
# Bare last expression: the notebook displays the whole nested token list
data_words

[['convolutional',
  'neural',
  'network',
  'cnn',
  'delivers',
  'impressive',
  'achievements',
  'in',
  'computer',
  'vision',
  'and',
  'machine',
  'learning',
  'field',
  'however',
  'cnn',
  'incurs',
  'high',
  'computational',
  'complexity',
  'especially',
  'for',
  'vision',
  'quality',
  'applications',
  'because',
  'of',
  'large',
  'image',
  'resolution',
  'in',
  'this',
  'paper',
  'we',
  'propose',
  'an',
  'iterative',
  'architecture',
  'aware',
  'pruning',
  'algorithm',
  'with',
  'adaptive',
  'magnitude',
  'threshold',
  'while',
  'cooperating',
  'with',
  'quality',
  'metric',
  'measurement',
  'simultaneously',
  'we',
  'show',
  'the',
  'performance',
  'improvement',
  'applied',
  'on',
  'vision',
  'quality',
  'applications',
  'and',
  'provide',
  'comprehensive',
  'analysis',
  'with',
  'flexible',
  'pruning',
  'configuration',
  'with',
  'the',
  'proposed',
  'method',
  'the',
  'multiply',
  'accumulate',
  'mac',

In [47]:
# Build the bigram phrase model from the tokenized documents.
# The trigram model is currently disabled (commented out); note that
# make_trigrams() still references trigram_mod.
# NOTE(review): the model is fit on data_words, which still contains stop words,
# but is later applied to the stop-word-filtered tokens — confirm this is intended.
bigram = gensim.models.Phrases(data_words, min_count=1, threshold=10) # higher threshold fewer phrases.
# trigram = gensim.models.Phrases(bigram[data_words], threshold=10)

# Wrap the trained model in a Phraser (per the original author, the faster way
# to apply it); indexing it with a token list returns the merged tokens
bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

# Show the bigram-merged tokens of the first document
# (trigram variant kept for reference while the trigram model is disabled)
# print(trigram_mod[bigram_mod[data_words[0]]])
print(bigram_mod[data_words[0]])

['convolutional_neural', 'network', 'cnn', 'delivers', 'impressive', 'achievements', 'in', 'computer', 'vision_and', 'machine', 'learning', 'field', 'however', 'cnn', 'incurs', 'high', 'computational', 'complexity', 'especially', 'for_vision', 'quality_applications', 'because', 'of', 'large', 'image', 'resolution', 'in', 'this', 'paper', 'we', 'propose', 'an', 'iterative', 'architecture', 'aware', 'pruning', 'algorithm', 'with', 'adaptive', 'magnitude', 'threshold', 'while', 'cooperating', 'with', 'quality', 'metric', 'measurement', 'simultaneously_we', 'show', 'the', 'performance', 'improvement', 'applied', 'on', 'vision_quality', 'applications', 'and', 'provide', 'comprehensive', 'analysis', 'with', 'flexible', 'pruning', 'configuration', 'with', 'the_proposed', 'method', 'the', 'multiply', 'accumulate', 'mac', 'of', 'state_of', 'the_art', 'low', 'light', 'imaging', 'sid', 'and', 'super', 'resolution', 'edsr', 'are', 'reduced_by', 'and', 'without', 'quality', 'drop', 'respectively', 

In [48]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, exactly as before, so both raw
    strings and token lists are accepted.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one list of remaining tokens per document.
    """
    # The notebook-level ``stop_words`` is a list; build a set once per call so
    # each membership test is O(1) instead of a linear scan.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every document through the notebook-level ``bigram_mod``.

    Args:
        texts: iterable of token lists.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    NOTE(review): in the notebook cells visible here, the lines that build
    ``trigram_mod`` are commented out, so calling this with any documents
    raises NameError until that model is created.

    Args:
        texts: iterable of token lists.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for each document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(trigram_mod[bigram_mod[tokens]])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list is joined with single spaces and passed to the
    notebook-level spaCy pipeline ``nlp``, which must exist before this
    function is called.

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: collection of tags compared against ``token.pos_``;
            tokens with any other tag are dropped. The default is an
            immutable tuple instead of a shared mutable list.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per document.

    Tag names: https://spacy.io/api/annotation (link from the original docstring).
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [49]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline by its package name. The 'en' shortcut
# link is a spaCy 2.x feature and no longer loads on spaCy 3+, while the full
# package name works on both. The parser and NER components are disabled
# because only POS tags and lemmas are used below.
# Install once with: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['convolutional_neural', 'network', 'cnn', 'deliver', 'impressive', 'achievement', 'computer', 'vision', 'machine', 'learning', 'field', 'however', 'cnn', 'incur', 'high', 'computational', 'complexity', 'especially', 'vision_quality', 'application', 'large', 'image', 'resolution', 'paper', 'propose', 'iterative', 'architecture', 'aware', 'prune', 'algorithm', 'adaptive', 'magnitude', 'threshold', 'cooperate', 'quality', 'metric', 'measurement', 'simultaneously', 'show', 'performance', 'improvement', 'apply', 'vision_quality', 'application', 'provide', 'comprehensive', 'analysis', 'flexible', 'prune', 'configuration', 'propose', 'method', 'multiply', 'accumulate', 'mac', 'state', 'art', 'low', 'light', 'image', 'sid', 'super', 'resolution', 'edsr', 'reduce', 'quality', 'drop', 'respectively', 'memory', 'bandwidth', 'bw', 'requirement', 'convolutional', 'layer', 'also', 'reduce']]


## LDA 的輸入

使用 gensim 的 LDA 方法時，最重要的兩個輸入是 `corpus` 和 `id2word`。`id2word` 是一個 Dictionary 物件，細節可以參考 [corpora.dictionary – Construct word<->id mappings](https://radimrehurek.com/gensim/corpora/dictionary.html) 的說明。透過 `corpora.Dictionary` 來建立 word 和 id 之間的關係。`corpus` 則是用 `id2word.doc2bow` 把每篇文件轉換成 `(word id, 出現次數)` 的 bag-of-words 表示。

In [52]:
# Build the token <-> integer id mapping from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# Encode every document as a bag-of-words list of (token id, count) pairs
corpus = list(map(id2word.doc2bow, data_lemmatized))

# Show the encoded first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 2), (54, 2), (55, 2), (56, 1), (57, 2), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2)]]


In [None]:
# Build LDA model
# Fits gensim's LdaModel on the bag-of-words corpus, using id2word to map ids
# back to tokens and a fixed random_state.
# NOTE(review): 5 topics are requested although `data` holds only 3 documents,
# and chunksize=100 is larger than the corpus — confirm these values are intended.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the keywords of the topics (the model above is built with num_topics=5, not 10)
pprint(lda_model.print_topics())
# Index the model with the whole corpus; doc_lda is not used by the later cells shown here
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# LdaModel.log_perplexity() returns the per-word likelihood bound, a log-scale
# value where higher (closer to 0) is better; it is not the perplexity itself.
# Perplexity is 2 ** (-bound), and for that number lower is better.
# Both values are computed on the same corpus the model was trained on.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute Coherence Score (c_v measure, using the lemmatized texts and the dictionary)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
# Turn on pyLDAvis notebook rendering
pyLDAvis.enable_notebook()
# Prepare the visualization from the trained model, the corpus and the dictionary
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# pyLDAvis.gensim_models — confirm against the installed version
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook displays the prepared visualization
vis