In [None]:
import nltk

## Fetch every NLTK resource this notebook relies on, so that Restart & Run All works on a fresh environment:
##   punkt / punkt_tab                      -> word_tokenize, sent_tokenize
##   stopwords                              -> stopwords.words('english')
##   wordnet / omw-1.4                      -> WordNetLemmatizer
##   averaged_perceptron_tagger(_eng)       -> pos_tag
##   vader_lexicon                          -> SentimentIntensityAnalyzer
## NOTE(review): 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the ids used by newer NLTK releases;
## drop them if your NLTK version only needs the older names.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
):
    nltk.download(resource)

In [None]:
## 2023 여름 이후 gensim 패키지 import 오류 해결용
pip install gensim==3.4.0
pip install smart_open==1.9.0
pip install -U pyopenssl cryptography

In [None]:
## Made by JongHyun Kim

## Base model made on July, 2022
## Repaired on Apr, 2023 
### Apr 2023 (Fixed: several sentences combined in one str were recognized as a single sentence and did not work properly --> added sentence tokenization in the __init__ function)
### Apr 2023 (Each list generated by the LDA now also contains the index number of the topic that has the maximum importance weight)

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import _stop_words
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.decomposition import TruncatedSVD
import gensim
from gensim import corpora

import warnings
warnings.filterwarnings("ignore")
import numpy as np

## Added on May 30, 2023 - for Spherical Kmeans
from scipy.sparse import csr_matrix
from soyclustering import SphericalKMeans
import gap_statistic
from soyclustering import proportion_keywords

## For LLM embedding models
import ollama ## LLAMA3

## Read the OpenAI key from the OPENAI_API_KEY environment variable instead of hardcoding it in the notebook.
## Falls back to an empty string (the previous hardcoded value) when the variable is not set.
import os

OPENAI_APIKEY = os.environ.get("OPENAI_API_KEY", "")
from openai import OpenAI
client = OpenAI(api_key=OPENAI_APIKEY)




## Need to add Autoencoder


## Stop-word list: union of the NLTK and the scikit-learn English stop words
nltk_stopwords = set(stopwords.words('english'))
sklearn_stopwords = set(_stop_words.ENGLISH_STOP_WORDS)
my_stopwords = nltk_stopwords | sklearn_stopwords

## Shared lemmatizer / stemmer instances used by the word tokenizers below
lem = WordNetLemmatizer()
ps = PorterStemmer()

## you can tag the 'part of speech' with this to the word
def get_pos(w):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``pos_tag`` and the first letter of the
    resulting tag is mapped to a WordNet constant. Any tag that does not start
    with V, N, J or R falls back to ``wordnet.NOUN``.
    """
    first_letter = pos_tag([w])[0][1][0].upper()
    wordnet_by_letter = {
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    return wordnet_by_letter.get(first_letter, wordnet.NOUN)

## Stemming based word tokenizer
def token_ws(r):
    """Tokenize text ``r`` into lower-cased, stop-word-free Porter stems.

    Tokens that are not purely alphabetic are dropped before lower-casing;
    stop words are removed after lower-casing.
    """
    stems = []
    for token in word_tokenize(r):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in my_stopwords:
            continue
        stems.append(ps.stem(lowered))
    return stems
  
## Lemmatizing based word tokenizer
def token_wl(r):
    """Tokenize text ``r`` into lower-cased, stop-word-free WordNet lemmas.

    Tokens that are not purely alphabetic are dropped before lower-casing;
    stop words are removed after lower-casing. Each remaining word is
    lemmatized with the part of speech reported by ``get_pos``.
    """
    lemmas = []
    for token in word_tokenize(r):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in my_stopwords:
            continue
        lemmas.append(lem.lemmatize(lowered, get_pos(lowered)))
    return lemmas


def llama3_embedding(text, model = 'llama3'):
    """Return the embedding that ``ollama.embeddings`` produces for ``text`` with ``model``."""
    return ollama.embeddings(model = model, prompt = text)['embedding']

def get_openai_embedding(text, model="text-embedding-3-large"):
    """Return the embedding of ``text`` from the module-level OpenAI ``client``.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input = [single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

## Natural language processor
class Nlp:
    """Small NLP toolbox (vectorizing, similarity, clustering, sentiment, topics) over a list of texts.

    ``self.texts`` is always a list of documents:

    * a list is stored as given,
    * a str is split into sentences with ``sent_tokenize`` (a single-sentence
      str becomes a one-element list),
    * any other iterable is converted with ``list()``.

    Wherever a ``tokenizer`` argument is requested you can pass ``'s'``
    (stemming tokenizer ``token_ws``), ``'l'`` (lemmatizing tokenizer
    ``token_wl``) or your own callable. Wherever a ``vec`` argument is
    requested you can pass ``'count'`` or ``'tfidf'``.
    """

    ## You can input texts in the form of list but if you input a str data, it will automatically transform it into the form of list
    def __init__(self, texts):
        self.texts = self._normalize_texts(texts)

    @staticmethod
    def _normalize_texts(texts):
        """Return ``texts`` as a list of documents (see the class docstring for the rules)."""
        if isinstance(texts, list):
            return texts
        if isinstance(texts, str):
            try:
                sentences = sent_tokenize(texts)
            except Exception:
                ## Best effort: if sentence splitting is unavailable, keep the whole string as one document
                sentences = []
            ## A bare str must never be stored: the vectorizers reject it and iterating it yields characters
            return sentences if sentences else [texts]
        try:
            return list(texts)
        except TypeError:
            ## Not iterable: treat the object as a single document
            return [texts]

    def _docs(self):
        """Return the current documents as a list, even if ``self.texts`` was reassigned to a str."""
        return self._normalize_texts(self.texts)

    @staticmethod
    def _resolve_tokenizer(tokenizer, allow_none=False):
        """Map the codes 's' / 'l' to ``token_ws`` / ``token_wl``; pass callables through.

        Raises:
            ValueError: if ``tokenizer`` is neither a known code nor a callable
                (nor ``None`` when ``allow_none`` is true).
        """
        if callable(tokenizer):
            return tokenizer
        if tokenizer is None and allow_none:
            return None
        if tokenizer == 's':
            return token_ws
        if tokenizer == 'l':
            return token_wl
        raise ValueError(
            "tokenizer must be 's' (stemming), 'l' (lemmatizing) or a callable, got %r" % (tokenizer,)
        )

    @staticmethod
    def _build_vectorizer(tokenizer, vec):
        """Create the (unfitted) CountVectorizer or TfidfVectorizer selected by ``vec``.

        Raises:
            ValueError: if ``vec`` is not 'count' or 'tfidf', or the tokenizer is invalid.
        """
        ## None is passed through so the scikit-learn default tokenization stays available
        my_tokenizer = Nlp._resolve_tokenizer(tokenizer, allow_none=True)
        if vec == 'count':
            return CountVectorizer(tokenizer=my_tokenizer)
        if vec == 'tfidf':
            return TfidfVectorizer(tokenizer=my_tokenizer)
        raise ValueError("vec must be 'count' or 'tfidf', got %r" % (vec,))

    def _vectorize(self, tokenizer, vec):
        """Fit the selected vectorizer on the documents; return ``(vectorizer, document-term matrix)``."""
        vectoring = self._build_vectorizer(tokenizer, vec)
        vec_reviews = vectoring.fit_transform(self._docs())
        return vectoring, vec_reviews

    ## Vectorizer : Choose which word tokenizer you'll use by typing 's' or 'l'. Choose the vectorizing tool by 'count' or 'tfidf'
    def Vectorizer(self, tokenizer, vec):
        """Return the document-term matrix as a DataFrame (one row per text, one column per term)."""
        vectoring, vec_reviews = self._vectorize(tokenizer, vec)
        vec_terms = vectoring.get_feature_names_out()
        return pd.DataFrame(vec_reviews.toarray(), columns=vec_terms)

    ## Cosine similarity. you can input the same things you input in the Vectorizer
    ## This shows how similar each text are to each other
    def Cos_sim(self, tokenizer, vec):
        """Plot a heatmap of the pairwise cosine similarity between the texts. Returns None."""
        _, vec_reviews = self._vectorize(tokenizer, vec)

        ## After going through the preprocessing, this visualize the similarity with Heatmap plot
        sim = cosine_similarity(vec_reviews)
        plt.figure(figsize=(8,8))
        sns.heatmap(sim, vmin = 0, vmax = 1, center = 0, cmap='PiYG', annot = True)
        plt.show()

    ## This performs clustering of your text data. you input the tokenizer and the vectorizing tool
    def Cluster(self, tokenizer, vec, num=None):
        """Cluster the texts with K-means and return the term DataFrame plus a 'cluster' column.

        An elbow plot (inertia against k) is shown first. If ``num`` is None the
        number of clusters is then read from ``input()``; pass ``num`` to skip
        the prompt and make the run reproducible from code alone.
        """
        vectoring, vec_reviews = self._vectorize(tokenizer, vec)
        vec_terms = vectoring.get_feature_names_out()
        vec_df = pd.DataFrame(vec_reviews.toarray(), columns=vec_terms)

        ## Here we go through the 'Elbow method' and will show you the graph of it
        ## KMeans cannot build more clusters than there are texts, so k is capped at the number of texts
        n_samples = vec_reviews.shape[0]
        K = range(1, min(9, n_samples) + 1)
        ## Fixed seed so the elbow plot is the same on every run
        distance = [KMeans(n_clusters=k, random_state=5).fit(vec_reviews).inertia_ for k in K]

        plt.figure(figsize=(10,8))
        plt.plot(K, distance)
        plt.show()

        ## After showing the graph of 'Elbow method', you should input the number of cluster you will make
        if num is None:
            num = int(input())

        ## Then this tool will go through Kmeans method and show you the pandas dataframe with a new column 'cluster'
        km = KMeans(n_clusters=num, random_state=5)
        km.fit(vec_reviews)
        vec_df['cluster'] = km.labels_.tolist()
        return vec_df

    def Spherical_Kmeans(self, tokenizer, vec):
        """Cluster the texts with spherical K-means; the cluster count comes from the gap statistic.

        Returns:
            ``(vec_df, keywords)``: the term DataFrame with a 'cluster' column,
            and the result of ``proportion_keywords`` for the fitted centers.

        Raises:
            ValueError: if there are fewer than two texts (the candidate range
                of cluster counts would be empty).
        """
        vectoring, vec_reviews = self._vectorize(tokenizer, vec)
        vec_terms = vectoring.get_feature_names_out()
        vec_array = vec_reviews.toarray()
        vec_df = pd.DataFrame(vec_array, columns=vec_terms)

        if len(vec_array) < 2:
            raise ValueError("Spherical_Kmeans needs at least two texts")

        ## Gap statistics
        count_matrix = csr_matrix(vec_array)

        vec_float_array = vec_array.astype('float')
        optimalK = gap_statistic.OptimalK(n_jobs=-1, parallel_backend='joblib')
        num_clusters = optimalK(vec_float_array, cluster_array=np.arange(1, len(vec_float_array)))

        ## Spherical K-means based on num_clusters derived from gap statistics
        spherical_kmeans = SphericalKMeans(
            n_clusters= num_clusters,
            max_iter=100,
            verbose=1,
            init='similar_cut')
        labels = spherical_kmeans.fit_predict(count_matrix)

        ## Vocabulary ordered by column index, so position i names column i of the matrix
        vocabs = [vocab for vocab, idx in sorted(vectoring.vocabulary_.items(), key=lambda x:x[1])]
        centers = spherical_kmeans.cluster_centers_

        keywords = proportion_keywords(
            centers,
            labels=labels,
            index2word=vocabs)

        vec_df['cluster'] = spherical_kmeans.labels_.tolist()

        return vec_df, keywords

    ## Sentiment Analysis tool
    def SA(self):
        """Return a DataFrame with the VADER polarity scores of every text (one row per text)."""
        vader = SentimentIntensityAnalyzer()
        senti_list = [vader.polarity_scores(s) for s in self._docs()]
        return pd.DataFrame(senti_list)

    ## Topic modeling by LDA. Input the tokenizer to use, number of topics you will extract, and the number of words you want to see in each topic
    ## for this Topic_LDA, should set two variable (x, y = Variable_Name.Topic_LDA('l', 3, 3)). x will be the list of each Topics, y will be the ratio of topics for each text you input
    def Topic_LDA(self, tokenizer, topic_num, word_num):
        """Fit a gensim LDA model on the texts.

        Returns:
            ``(topics, per_text)``: ``topics`` is ``lda_model.print_topics(num_words=word_num)``;
            ``per_text`` holds, for every text, its list of ``(topic_id, weight)``
            pairs followed by the id of the topic with the largest weight
            (``None`` if the model reports no topic for that text).

        Raises:
            ValueError: if ``tokenizer`` is neither 's', 'l' nor a callable.
        """
        my_tokenizer = self._resolve_tokenizer(tokenizer)

        doc_1 = [my_tokenizer(d) for d in self._docs()]

        gensim_terms = corpora.Dictionary(doc_1)
        doc_matrix = [gensim_terms.doc2bow(w) for w in doc_1]
        lda = gensim.models.ldamodel.LdaModel
        lda_model = lda(doc_matrix, num_topics = topic_num, id2word = gensim_terms, random_state = 0)

        new_test_LDA_topics_each = []
        for bow in doc_matrix:
            each = list(lda_model[bow])
            ## Take the topic id from the pair itself: the model may leave out low-weight topics,
            ## so a pair's position in the list is not necessarily its topic id
            dominant_topic = max(each, key=lambda pair: pair[1])[0] if each else None
            each.append(dominant_topic)
            new_test_LDA_topics_each.append(each)

        return lda_model.print_topics(num_words = word_num), new_test_LDA_topics_each

    def Topic_SVD(self, tokenizer, vec ,component_num):
        """Extract ``component_num`` topics with truncated SVD.

        Returns:
            ``(svd_topic_words, svd_singular_values)``: for every component the
            terms sorted by decreasing component weight, and the array of
            singular values of the fitted model.
        """
        vectoring, vec_reviews = self._vectorize(tokenizer, vec)
        vec_terms = vectoring.get_feature_names_out()

        svd = TruncatedSVD(n_components = component_num , random_state = 0)
        svd.fit(vec_reviews)
        ## Column indices of each component, sorted by decreasing weight
        svd_topics = svd.components_.argsort()[:,::-1]

        ## NOTE(review): the [:-1] slice drops the lowest-weighted term of every component;
        ## kept as in the original, confirm whether all terms (or a top-N) were intended
        svd_topic_words = [[vec_terms[x] for x in component[:-1]] for component in svd_topics]

        return svd_topic_words, svd.singular_values_

## BERTopic will be added


In [None]:
# Sample TV reviews used as the demo corpus (one review per variable).
doc1 = "I'm extremely happy and I would reccomend this TV to anyone who is looking for a great TV at an even better price."
doc2 = "The price was reasonable and the Roku remote is easy."
doc3 = "I give it 5 stars for price and quality. "
doc4 = "I have no complaints about the image quality."
doc5 = "The menus are just so much more smooth and easy to use."
doc6 = "The remote is simple and easy to use."

In [None]:
doc = [doc1, doc2, doc3, doc4, doc5, doc6]

In [None]:
# The same reviews joined into one string, to exercise the str input path of Nlp.
doc_test = " ".join(doc)

In [None]:
# List input: Nlp stores the list as given.
review = Nlp(doc)

In [None]:
review.texts

In [None]:
# str input: Nlp.__init__ splits a multi-sentence string with sent_tokenize.
review_test = Nlp(doc_test)

In [None]:
review_test.texts

In [None]:
# Topic_LDA returns a pair: the topic descriptions and the per-text topic weights.
# 'l' is the code documented on Nlp.Vectorizer for the lemmatizing tokenizer.
review_test.Topic_LDA('l', 3, 5)

In [None]:
review.Topic_LDA('l', 3, 3)

In [None]:
# Number of columns (terms) in the TF-IDF DataFrame.
len(review.Vectorizer('l', 'tfidf').columns)

In [None]:
# Shows the cosine-similarity heatmap between the reviews.
review.Cos_sim('l', 'tfidf')

In [None]:
# Cluster shows the elbow plot and then reads the number of clusters from input().
review.Cluster('l', 'tfidf')

In [None]:
# NOTE: despite its name, svd_df receives the singular values that Topic_SVD returns second.
svd_list, svd_df = review.Topic_SVD('l', 'tfidf', 3)
svd_df

In [None]:
svd_list

In [None]:
# VADER polarity scores, one row per review.
review.SA()

In [None]:
# Edge case: a str input that contains only one sentence.
doc_2 = 'I am a dog'

In [None]:
esteem = Nlp(doc_2)

In [None]:
esteem.texts

In [None]:
esteem.Vectorizer('l', 'tfidf') ## fails whenever esteem.texts is a bare str: the scikit-learn vectorizers require an iterable of documents, not a single string

In [None]:
esteem.Topic_LDA('l', 3, 3)

In [None]:
esteem.Cos_sim('l', 'tfidf')

In [None]:
esteem.Cluster('l', 'tfidf')

In [None]:
esteem.SA()

In [None]:
# Second demo corpus: four short sentences.
text1 = 'Hi we are bada.'
text2 = 'Today we are going through NLP session.'
text3 = 'This is a very challenging course.'
text4 = 'Please concentrate and try to understand the structure.'

In [None]:
text = [text1, text2, text3, text4]

In [None]:
# The same sentences joined into one string, to compare str input with list input.
text_test = " ".join(text)

In [None]:
Text = Nlp(text)

In [None]:
Text_test = Nlp(text_test)

In [None]:
Text_test.texts

In [None]:
# TF-IDF matrix for the list input ...
Text.Vectorizer('l', 'tfidf')

In [None]:
# ... and for the joined-string input.
Text_test.Vectorizer('l', 'tfidf')

In [None]:
# Raw term counts instead of TF-IDF weights.
Text.Vectorizer('l', 'count')

In [None]:
Text.Cos_sim('l', 'tfidf')

In [None]:
# Cosine similarity on raw counts; the tokenizer code is the Latin letter 'l' (lemmatizing), not the Hangul vowel 'ㅣ'.
Text.Cos_sim('l', 'count')

In [None]:
# Cluster shows the elbow plot and then reads the number of clusters from input().
Text.Cluster('l', 'tfidf')

In [None]:
# Sentiment scores for the list input and for the joined-string input.
Text.SA()

In [None]:
Text_test.SA()

In [None]:
# LDA topics for the list input and for the joined-string input.
Text.Topic_LDA('l', 3, 3)

In [None]:
Text_test.Topic_LDA('l', 3, 3)