# 영어 문장에서의 차원 축소

## 주제

차원 축소 기법들에 대해 간략하게 설명한 후,
자연어 처리 기법을 통해 영어 문장을 받아들이고 분석한 후 특성을 추출해 데이터화한 뒤,
이를 기반으로 간단한 알고리즘만을 사용하였을 때와 여러 다양한 차원 축소 기법을 동원하였을 때의 각 단어가 문장 전체의 의미에 미치는 영향 추출 성능 및 효율성 비교

최종적으로, 이러한 기법들을 동원하여 문장을 입력하면 해당 문장을 분석하여 각 단어의 색상(또는 배경 색상)을 해당 단어가 문장의 의미에 미치는 영향 수준에 맞게 칠하는 프로그램 작성.

## 구현 과정

1. 자연어 처리 부분은 nltk에서 기본으로 제공하는 함수들을 이용하고, 간단한 word2id 알고리즘을 구현하여 sparse matrix를 제작.
2. 필요에 따라 차원 축소 방안을 선택적으로 적용할 수 있도록 구현.
3. 차원 축소를 사용하지 않을 때와 서로 다른 차원 축소 방안을 사용할 때, 여러 개의 차원 축소 방안을 중첩하여 사용할 때의 결과의 변화와 성능 상의 차이를 탐구.
4. 위 과정에서 구현한 코드를 재사용하여, 주제에서 언급한 프로그램 제작. Python의 tkinter를 사용할 예정. 문장 입력 시 실시간으로 하이라이트해 주는 기능 추가.


In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import time
import tensorflow as tf
import colorsys
import cv2
import wordcloud
from PIL import Image, ImageTk
from tkinter import *
import tkinter
import tkinter.ttk
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, SimpleRNN, LSTM

# Download required NLTK DLCs
# (each call fetches the named resource; the recorded cell output shows packages
# that are already present being reported as up-to-date)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize stop-words
# NOTE: this rebinds the name `stopwords` from the imported nltk.corpus reader to a
# plain set of English stop-words; the functions below use it for membership tests.
stopwords=set(stopwords.words('english'))

# Prepare the data using TensorFlow
def prepare_tf(corpus: list[str]):
    """Convert raw documents into equally long integer sequences.

    Each document is lower-cased, tokenized with NLTK, stripped of stop-words
    and re-joined, then encoded with the Keras ``Tokenizer``.

    Returns a tuple ``(padded_sequences, sequence_count, word_index)`` where
    ``word_index`` is the tokenizer's word -> integer mapping.
    """
    # Remove stop-words from every document and rebuild it as a single string
    cleaned = []
    for doc in corpus:
        kept = [token for token in word_tokenize(doc.lower()) if token not in stopwords]
        cleaned.append(' '.join(kept))

    # Fit the Keras tokenizer on the cleaned text and encode it
    keras_tokenizer = Tokenizer()
    keras_tokenizer.fit_on_texts(cleaned)
    encoded = keras_tokenizer.texts_to_sequences(cleaned)

    # Pad every sequence up to the length of the longest one
    longest = max(map(len, encoded))
    padded = pad_sequences(encoded, maxlen=longest)
    return padded, len(encoded), keras_tokenizer.word_index


# Prepare the data using NLTK
def prepare_nltk(corpus: list[str],window_size:int=2)->tuple[list[list[str]],dict[str,int],np.ndarray]:
    """Tokenize the corpus with NLTK and build its co-occurrence matrix.

    Args:
        corpus: Documents (sentences) to analyze.
        window_size: Number of neighbours on each side that count as context.

    Returns:
        ``(tokenized_corpus, vocab2idx, co_matrix)``: the lower-cased token
        lists (stop-words still included), the vocabulary -> row/column index
        mapping (stop-words excluded), and the co-occurrence matrix produced
        by ``build_co_matrix``.
    """
    # Tokenize corpus (using nltk word_tokenize)
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

    # Enumerate the vocabulary of the corpus (stop-words excluded) and map each word to an int.
    # The vocabulary is sorted so the indices are reproducible: iterating a set of
    # strings directly yields a different order from one interpreter run to the next.
    vocab = sorted({word for doc in tokenized_corpus for word in doc if word not in stopwords})
    vocab2idx = {word: i for i, word in enumerate(vocab)}

    return tokenized_corpus,vocab2idx,build_co_matrix(tokenized_corpus,vocab2idx,window_size)


# Build up the co-occurrence matrix from the corpus and word2index mapping
def build_co_matrix(tokenized_corpus: list[list[str]], word2idx: dict[str,int], window_size: int = 2)->np.ndarray:
    """Count how often vocabulary words appear within a window of each other.

    Args:
        tokenized_corpus: One token list per document.
        word2idx: Vocabulary word -> row/column index. Tokens missing from this
            mapping (e.g. stop-words left out of the vocabulary) are ignored
            both as centre word and as context word.
        window_size: Number of positions to look at on each side of a word.

    Returns:
        A ``(len(word2idx), len(word2idx))`` float matrix where entry
        ``[a, b]`` is the number of times word ``b`` occurred within
        ``window_size`` positions of word ``a``.
    """
    size = len(word2idx)
    co = np.zeros((size, size))
    for doc in tokenized_corpus:
        # Resolve every token once; None marks a token outside the vocabulary
        ids = [word2idx.get(token) for token in doc]
        for i, center in enumerate(ids):
            # Skip out-of-vocabulary tokens instead of failing with KeyError
            if center is None:
                continue

            # Scan [i-ws, i+ws] (clipped to the document) for context words
            lo = max(0, i - window_size)
            hi = min(len(ids), i + window_size + 1)
            for j in range(lo, hi):
                if j != i and ids[j] is not None:
                    co[center, ids[j]] += 1
    return co


# PPMI (Positive Pointwise Mutual Information)
def ppmi(co_matrix: np.ndarray)->np.ndarray:
    """Turn a co-occurrence matrix into a positive PMI matrix.

    Every entry becomes ``max(log(c_ij * total / (s_i * s_j)), 0)`` where
    ``total`` is the sum of all counts and ``s`` holds the column sums of
    ``co_matrix`` (used for both the row and the column marginal). A small
    epsilon keeps the division and the logarithm finite for zero counts.
    """
    eps = 1e-8
    grand_total = np.sum(co_matrix)
    column_totals = np.sum(co_matrix, axis=0)

    # Product of the marginals for every (i, j) pair
    expected = column_totals[:, None] * column_totals[None, :]
    ratio = (co_matrix * grand_total) / (expected + eps)
    pmi = np.log(ratio + eps)

    # Negative associations are clipped to zero
    return np.maximum(pmi, 0)


# Analyze the input sentence with the given embeddings to calculate the weight of each word (vocabulary entry) in the sentence.
def analyze(sentence:str, embeddings:np.ndarray, word2idx:dict[str,int]):
    """Score every known word of ``sentence`` against the sentence as a whole.

    Args:
        sentence: Text to analyze; it is lower-cased and tokenized with NLTK.
        embeddings: Array indexed by the values of ``word2idx``; each row is
            used as that word's vector.
        word2idx: Word -> row index into ``embeddings``.

    Returns:
        A dict mapping each token found in ``word2idx`` to the dot product of
        its vector with the mean vector of all such tokens (repeated tokens
        contribute to the mean once per occurrence). Empty when the sentence
        contains no known word.
    """
    # Tokenize the input sentence and keep only words that have an embedding
    tokens = word_tokenize(sentence.lower())
    known = [word for word in tokens if word in word2idx]

    # Without any known word there is nothing to average; returning early avoids
    # numpy's "mean of empty slice" warning and a NaN sentence vector.
    if not known:
        return {}

    # Mean vector over all known words of the sentence
    sentence_vector = np.mean([embeddings[word2idx[word]] for word in known], axis=0)

    # Weight of a word = projection of its vector onto the sentence vector
    return {word: np.dot(embeddings[word2idx[word]], sentence_vector) for word in known}


# Dimension Reduction using the sklearn.decomposition.* classes (which support 'fit_transform' and 'inverse_transform')
def dr_with_sklearn(cls, embeddings: np.ndarray)->np.ndarray:
    """Project ``embeddings`` through a reducer and map them back.

    ``cls`` is a reducer instance offering ``fit_transform`` and
    ``inverse_transform``. The data is standardized first, reduced,
    reconstructed in the standardized space and finally un-standardized, so
    the result has gone through the same steps in reverse order.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(embeddings)

    # Reduce, then reconstruct from the reduced representation
    reduced = cls.fit_transform(standardized)
    reconstructed = cls.inverse_transform(reduced)

    # Undo the standardization
    return scaler.inverse_transform(reconstructed)


# Dimension Reduction using SVD (Singular Value Decomposition)
def dr_with_svd(embeddings: np.ndarray, n_components:int=2)->np.ndarray:
    """Reconstruct ``embeddings`` from its leading right-singular vectors.

    The data is standardized, projected onto the first ``n_components`` rows
    of ``Vt`` from ``np.linalg.svd``, mapped back with the same rows, and then
    un-standardized.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(embeddings)

    # Only Vt is needed; U and the singular values are discarded
    right_vectors = np.linalg.svd(standardized)[2]
    basis = right_vectors[:n_components, :]

    # Project onto the leading components, then map back to the original space
    projected = np.dot(standardized, basis.T)
    reconstructed = np.dot(projected, basis)

    return scaler.inverse_transform(reconstructed)


# Feature Extraction and Embedding using CNN
def dr_with_cnn(tokenized_data, seqcount, word2idx, embedding_dim=50):
    """Learn word embeddings with a small 1-D convolutional model.

    Args:
        tokenized_data: Padded integer sequences to train on.
        seqcount: Number of sequences (used to build the training labels).
        word2idx: Word -> integer index; the embedding table gets
            ``len(word2idx) + 1`` rows.
        embedding_dim: Width of each embedding vector.

    Returns:
        Whatever ``dr_with_model`` returns for the trained model (the weights
        of its first layer, i.e. the embedding layer).
    """
    model = Sequential()
    model.add(Embedding(input_dim=len(word2idx) + 1, output_dim=embedding_dim))
    # 'same' padding keeps the convolution valid for inputs shorter than the
    # kernel (fewer than 5 tokens), which would otherwise yield a non-positive
    # output length and make training fail.
    model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return dr_with_model(model, tokenized_data, seqcount)


# Feature Extraction and Embedding using Simple RNN
def dr_with_rnn(tokenized_data, seqcount, word2idx, embedding_dim=50):
    """Learn word embeddings with an Embedding -> SimpleRNN -> Dense model.

    The embedding table has ``len(word2idx) + 1`` rows of width
    ``embedding_dim``. The compiled model is handed to ``dr_with_model``,
    whose result is returned.
    """
    vocab_rows = len(word2idx) + 1
    network = Sequential([
        Embedding(input_dim=vocab_rows, output_dim=embedding_dim),
        SimpleRNN(50, return_sequences=False),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return dr_with_model(network, tokenized_data, seqcount)


# Feature Extraction and Embedding using LSTM
def dr_with_lstm(tokenized_data, seqcount, word2idx, embedding_dim=50):
    """Learn word embeddings with an Embedding -> LSTM -> Dense model.

    The embedding table has ``len(word2idx) + 1`` rows of width
    ``embedding_dim``. The compiled model is handed to ``dr_with_model``,
    whose result is returned.
    """
    vocab_rows = len(word2idx) + 1
    network = Sequential([
        Embedding(input_dim=vocab_rows, output_dim=embedding_dim),
        LSTM(50, return_sequences=False),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return dr_with_model(network, tokenized_data, seqcount)


# Feature Extraction and Embedding using TensorFlow sequential model
def dr_with_model(model, tokenized_data, seqcount):
    """Train an already compiled model and return its first layer's weights.

    The model is fitted for 60 silent epochs against an all-ones label vector
    of length ``seqcount`` (placeholder targets). The first weight array of
    ``model.layers[0]`` is returned; the builders in this file put the
    embedding layer there.
    """
    # Placeholder targets: one label "1" per sequence
    dummy_targets = np.array([1] * seqcount)
    model.fit(tokenized_data, dummy_targets, epochs=60, verbose=0)

    # The word embeddings are the first weight array of the first layer
    first_layer = model.layers[0]
    return first_layer.get_weights()[0]


def process(corpus: list[str], mode: str, options: dict):
    """Compute per-word weights for a corpus with the selected technique.

    Args:
        corpus: List of documents (sentences). They are prepared individually
            and joined with spaces for the final analysis.
        mode: Case-insensitive technique name. ``'pca'``, ``'svd'``,
            ``'tsvd'`` and ``'ipca'`` work on the (optionally PPMI-weighted)
            co-occurrence matrix; ``'cnn'``, ``'rnn'`` and ``'lstm'`` train a
            Keras model and use its embedding weights.
        options: Optional settings read with defaults: ``window_size`` (2),
            ``ppmi`` (True) and ``n_components`` (2) for the matrix-based
            modes, ``embedding_dim`` (50) for the model-based modes.

    Returns:
        The word -> weight dict produced by ``analyze``.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """
    mode = mode.lower()

    # One table per family keeps the list of supported modes and the dispatch in sync
    sklearn_reducers = {'pca': PCA, 'tsvd': TruncatedSVD, 'ipca': IncrementalPCA}
    model_builders = {'cnn': dr_with_cnn, 'rnn': dr_with_rnn, 'lstm': dr_with_lstm}

    if mode == 'svd' or mode in sklearn_reducers:
        # Co-occurrence statistics from the NLTK-tokenized corpus
        _, word2idx, embeddings = prepare_nltk(corpus, options.get('window_size', 2))

        # Apply PPMI
        if options.get('ppmi', True):
            embeddings = ppmi(embeddings)

        # Perform dimension reduction
        n_components = options.get('n_components', 2)
        if mode == 'svd':
            embeddings = dr_with_svd(embeddings, n_components)
        else:
            reducer = sklearn_reducers[mode](n_components=n_components)
            embeddings = dr_with_sklearn(reducer, embeddings)
    elif mode in model_builders:
        # Integer sequences for the Keras models
        tokenized_data, seqcount, word2idx = prepare_tf(corpus)
        embedding_dim = options.get('embedding_dim', 50)
        embeddings = model_builders[mode](tokenized_data, seqcount, word2idx, embedding_dim)
    else:
        raise ValueError(f'Unknown mode: {mode}')

    return analyze(' '.join(corpus), embeddings, word2idx)

# Main application window
tk=Tk()
tk.title('Word Effect Calculator')

# Drop-down for the processing mode; the values are the mode names process() accepts
# (process() lower-cases them). 'SVD' is pre-selected.
modebox=tkinter.ttk.Combobox(tk, height=15, values=['PCA','SVD','TSVD','IPCA','CNN','RNN','LSTM'])
modebox.pack(anchor='n',fill='both')
modebox.set('SVD')

# Text box holding the paragraph to analyze; highlights are applied to it as tags
para_input=Text(tk)
para_input.pack(anchor='n',expand=True,fill='both')

# Canvas on which the word-cloud image is shown
wc_canvas=Canvas(tk,width=600,height=400)
wc_canvas.pack(anchor='center', fill='both')
# Canvas item id of the word-cloud image; stays None until text_updated() creates it
canvas_img = None #wc_canvas.create_image(0,0,anchor='nw')

# Re-entrancy guard set by text_updated() while it resets the text widget's modified flag
undirty=False
# Weights of the previous run, kept so text_updated() can print the MSE between consecutive results
prev_result=None

def mse(a1,a2):
    """Return the mean squared error between two word -> weight dicts.

    A word missing from one dict is treated as having weight 0 there. The sum
    of squared differences is divided by the number of distinct words across
    both dicts, so the result is symmetric in its arguments and is a true
    mean even when the key sets differ. Two empty dicts give 0.0.
    """
    words = a1.keys() | a2.keys()
    if not words:
        # Nothing to compare; avoids a division by zero
        return 0.0
    squared_error = sum((a1.get(w, 0) - a2.get(w, 0)) ** 2 for w in words)
    return squared_error / len(words)


def text_updated():
    """Re-analyze the text box and refresh the highlights and the word cloud.

    Bound to text modification, F5 and mode selection. Reads the whole text,
    runs ``process`` with the selected mode, prints timings (and the MSE
    against the previous result), colours every analyzed word in the text box
    according to its normalized weight and redraws the word cloud.
    Texts shorter than 10 characters are ignored.
    """
    import re  # local import: used only for locating words in the text

    global undirty
    global canvas_img
    global prev_result
    if undirty:
        return

    # Reset the widget's modified flag so the next edit is reported again.
    # The guard flag suppresses re-entry while the flag is being reset
    # (presumably because the reset can itself trigger <<Modified>>).
    undirty=True
    try:
        tk.call(para_input, 'edit', 'modified', 0)
    finally:
        undirty=False

    text=para_input.get('1.0',END)
    if len(text)<10:
        return
    t=time.perf_counter_ns()
    result=process(text.split('. '), modebox.get(), {})
    print(f'Took {(time.perf_counter_ns()-t)}ns to process.')
    # Only compare non-empty results; there is nothing to average otherwise
    if prev_result is not None and result:
        print(f'MSE: {mse(result, prev_result):.20f}')
    prev_result=result

    t=time.perf_counter_ns()
    # Highlight tags get their own prefix so they can be told apart from
    # built-in tags such as 'sel' and removed before each refresh; otherwise
    # highlights of earlier runs would stay on text that has since changed.
    tag_prefix='hl_'
    for tag in para_input.tag_names():
        if tag.startswith(tag_prefix):
            para_input.tag_delete(tag)

    if not result:
        # No analyzable word: nothing to highlight or draw
        return

    # Real bounds of the weights (they may be negative, so no fixed sentinels)
    min_w=min(result.values())
    max_w=max(result.values())
    span=max_w-min_w

    # The analyzed words are lower-case, so search a lower-cased copy of the text.
    # NOTE(review): assumes lower-casing keeps character offsets unchanged, which
    # holds for typical English text - confirm if other scripts are expected.
    lowered=text.lower()

    # Shorter words first, so tags of longer words are created later
    for word, weight in sorted(result.items(), key=lambda item: len(item[0])):
        if span>0:
            weight_normalized=(weight-min_w)/span
        else:
            # All weights equal: clamp into [0, 1] so the colour stays valid
            weight_normalized=min(max(weight,0.0),1.0)

        # Match the word wherever it is not glued to other word characters, so
        # occurrences next to punctuation or at a line end are found as well
        tag=tag_prefix+word
        pattern=re.compile(r'(?<!\w)'+re.escape(word)+r'(?!\w)')
        for match in pattern.finditer(lowered):
            para_input.tag_add(tag, f'1.0 + {match.start()} chars', f'1.0 + {match.end()} chars')

        # Hue and saturation both follow the normalized weight
        rgb=colorsys.hsv_to_rgb(weight_normalized, max(0.4,weight_normalized), 1.0)
        color_hex='#%02x%02x%02x' % tuple(int(channel*255) for channel in rgb)
        para_input.tag_config(tag, background=color_hex)
    print(f'Took {(time.perf_counter_ns()-t)}ns to highlight.')

    t=time.perf_counter_ns()
    # The word cloud scales words by frequency, so only positive weights are drawn
    frequencies={word: weight for word, weight in result.items() if weight>0}
    if frequencies:
        wc=wordcloud.WordCloud(background_color='white', width=600, height=400)
        wc.generate_from_frequencies(frequencies)
        # Keep a reference on the canvas so the image is not garbage-collected
        wc_canvas.image_=ImageTk.PhotoImage(wc.to_image())

        if canvas_img is None:
            canvas_img = wc_canvas.create_image(0,0,anchor='nw', image=wc_canvas.image_)
        else:
            wc_canvas.itemconfig(canvas_img, image=wc_canvas.image_)
    print(f'Took {(time.perf_counter_ns()-t)}ns to draw Word Cloud.')
    
    
# Re-run the analysis when the text is modified, when F5 is pressed, or when a
# mode is picked. bind_all registers each binding for the whole application
# rather than only for the widget it is called on.
para_input.bind_all('<<Modified>>', lambda event: text_updated())
para_input.bind_all('<F5>', lambda event: text_updated())
modebox.bind_all('<<ComboboxSelected>>', lambda event: text_updated())

# Manual "Update" button, currently disabled
#but=Button(tk,text='Update',command=text_updated)
#but.pack()
# Hand control to the Tk event loop
tk.mainloop()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\eric\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eric\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Took 5670400ns to process.
Took 5433700ns to highlight.
Took 159225900ns to draw Word Cloud.
Took 7888300ns to process.
MSE: 0.00000000000000000000
Took 1056400ns to highlight.
Took 160918700ns to draw Word Cloud.
Took 88074300ns to process.
MSE: 0.00000000000000000000
Took 2116600ns to highlight.
Took 154287100ns to draw Word Cloud.
Took 45199700ns to process.
MSE: 0.00000000000000000000
Took 1320300ns to highlight.
Took 139347200ns to draw Word Cloud.
Took 33361400ns to process.
MSE: 0.00000000000000000000
Took 1325400ns to highlight.
Took 139010900ns to draw Word Cloud.
Took 44353200ns to process.
MSE: 0.00000000000000000000
Took 1274800ns to highlight.
Took 141827100ns to draw Word Cloud.
Took 50184500ns to process.
MSE: 0.00000000000000000000
Took 1695400ns to highlight.
Took 144728300ns to draw Word Cloud.
Took 34387500ns to process.
MSE: 0.00000000000000000000
Took 1380500ns to highlight.
Took 140223200ns to draw Word Cloud.
Took 68426600ns to process.
MSE: 0.0000000000000000000