In [None]:
# download libraries
# ----------
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [1]:
###' ################################################################################
###'
###' IMPORT LIBRARIES
###'
###'

### pandas and numpy
import pandas as pd
import numpy as numpy
import spacy

### punctuation, stop words and English language model
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS
from spellchecker import SpellChecker
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import en_core_web_sm
nlp = en_core_web_sm.load()
import scattertext as st

### textblob
from textblob import TextBlob

### countvectorizer, tfidfvectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.manifold import TSNE

### tqdm
from tqdm import tqdm

### gensim
import gensim
from gensim import models

### PCA
import random
from adjustText import adjust_text

### plotting
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### kMeans and silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

### ignore warnings
import warnings
warnings.filterwarnings('ignore')

###time
from datetime import datetime

In [2]:
# Report the kernel's current working directory (shown as this cell's output).
import os
os.getcwd()

'C:\\Users\\OWNER\\TopicModeling'

# 1. Data preprocessing

In [3]:
# load the data
# ----------
# Each CSV gets a `Published` code (0 for articles_data_14.csv, 1 for
# articles_data_24.csv); rows whose abstract equals the placeholder text
# 'Abstract not available.' are then dropped.

df_14 = pd.read_csv("articles_data_14.csv").assign(Published=0)
df_14 = df_14.loc[df_14['Abstract'].ne('Abstract not available.')]

df_24 = pd.read_csv("articles_data_24.csv").assign(Published=1)
df_24 = df_24.loc[df_24['Abstract'].ne('Abstract not available.')]

# Disabled in the original notebook: combining both files into one frame.
#df_a = pd.concat([df_14, df_24], ignore_index=True)

In [4]:
import re

In [8]:
def _strip_markup(abstract):
    """Remove markup remnants from a single abstract string.

    Two passes, in this order:
      1. delete every non-greedy ``<...>`` run (anything between angle brackets
         on one line);
      2. delete what is left of ``<`` followed by digits, any ``*``, and ``†>``.
    """
    without_tags = re.sub(r'\<.*?\>', '', abstract)
    return re.sub(r'<\d+|\*|†>', '', without_tags)


def _select_and_clean(frame):
    """Keep Title/Abstract/Published, drop rows with missing values, clean abstracts.

    Returns a new frame; `frame` itself is not modified.
    """
    subset = frame[['Title', 'Abstract', 'Published']].dropna().copy()
    subset['Abstract'] = subset['Abstract'].apply(_strip_markup)
    return subset


df_14_S = _select_and_clean(df_14)
df_24_S = _select_and_clean(df_24)

# Combined frame used by the later cells (spell check, tokenization, counts,
# plots). ignore_index=True gives it a clean 0..n-1 index.
# NOTE(review): no other visible cell defines df_S; building it as the
# concatenation of the two cleaned frames is inferred from the disabled
# pd.concat in the loading cell — confirm this is the intended sample.
df_S = pd.concat([df_14_S, df_24_S], ignore_index=True)

In [6]:
spell = SpellChecker()


def correct_spelling(text):
    """Spell-correct `text` one whitespace-separated word at a time.

    `SpellChecker.correction` works on a single word, so passing a whole title
    or abstract to it does not yield a corrected sentence. Here each purely
    alphabetic token is corrected individually; tokens containing digits or
    punctuation are left untouched, and the original token is kept whenever
    the checker has no suggestion (it may return None).

    Non-string input (e.g. NaN) is returned unchanged. Runs of whitespace are
    collapsed to single spaces by the split/join round trip.
    """
    if not isinstance(text, str):
        return text

    corrected = []
    for word in text.split():
        suggestion = spell.correction(word) if word.isalpha() else None
        corrected.append(suggestion or word)
    return " ".join(corrected)


### text spell check
# NOTE: word-level correction of full abstracts can be slow on a large corpus.
df_S['Title_spell'] = df_S['Title'].map(correct_spelling)
df_S['Abstract_spell'] = df_S['Abstract'].map(correct_spelling)
df_S.head()

# 2. tokenize

In [7]:
# Stop words removed by rem_punc_stop: spaCy's English STOP_WORDS plus
# corpus-specific boilerplate terms. Built once instead of on every call.
# "purpose" and "aim" are separate entries: a missing comma previously fused
# them into the single, never-matching token "purposeaim".
CUSTOM_STOP_WORDS = STOP_WORDS | {
    "abstract", "available", "student", "research", "study", "impact", "effect",
    "result", "al", "et", "doi", "googlescholar", "google", "scholar", "textgoogle",
    "full", "crossref", "introduction", "background", "purpose", "aim", "objective",
    "use", "child",
}

# Translation table that deletes every character listed in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", punctuation)


def rem_punc_stop(text):
    """Lowercase, strip punctuation, lemmatize and filter one text.

    Parameters
    ----------
    text : str or None
        Raw text. Anything that is not a string (None, NaN, numbers) is
        treated as missing.

    Returns
    -------
    list of str
        Lemmas produced by the module-level spaCy pipeline `nlp`, keeping only
        purely alphabetic lemmas that are not in CUSTOM_STOP_WORDS. Returns an
        empty list for missing input.
    """
    # Missing values from pandas arrive as None or float NaN; neither has .lower().
    if not isinstance(text, str):
        return []

    # Lowercase first, then delete punctuation characters.
    punc_free = text.lower().translate(_PUNCT_TABLE)

    # Run the spaCy pipeline and collect one lemma per token.
    doc = nlp(punc_free)
    text_lemma = " ".join(token.lemma_ for token in doc)

    # The stop-word test is applied to lemmas; isalpha() drops numbers and
    # any other token that is not purely alphabetic.
    return [word for word in text_lemma.split()
            if word not in CUSTOM_STOP_WORDS and word.isalpha()]

In [None]:
# Tokenize Title and Abstract for each period frame, reading from the frame
# being written to. The original copy-pasted block filled df_24_S from
# df_14_S and never created df_24_S['Abstract_join'].
# For every source column this adds:
#   <col>_tokens : list of filtered lemmas returned by rem_punc_stop
#   <col>_join   : the same tokens joined with single spaces ("" if not a list)
for period_frame in (df_14_S, df_24_S):
    for source_col in ('Title', 'Abstract'):
        tokens_col = source_col + '_tokens'
        period_frame[tokens_col] = period_frame[source_col].map(rem_punc_stop)
        period_frame[source_col + '_join'] = period_frame[tokens_col].map(
            lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else "")


In [None]:
###' ################################################################################
###'
###' Tokenize the Title and Abstract columns of df_S
###'
###'

### For each source column, store the token list from rem_punc_stop in
### <col>_tokens and its space-joined form in <col>_join ("" when the value
### is not a list).
for source_col in ('Title', 'Abstract'):
    tokens_col = source_col + '_tokens'
    df_S[tokens_col] = df_S[source_col].map(rem_punc_stop)
    df_S[source_col + '_join'] = df_S[tokens_col].map(
        lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else "")


### Full-text tokenization is disabled:
#df_S['Full_tokens'] = df_S['Full Text'].map(lambda x: rem_punc_stop(x))
#df_S['Full_join'] = df_S['Full_tokens'].map(lambda text: ' '.join(text) if isinstance(text, list) else "")


df_S

In [None]:
#df_S.iloc[204:209]

# 3. Word Features

# 3.1. Word Counts

In [None]:
###' ################################################################################
###'
###' GENERATE TEXT FEATURES
###' whitespace-separated word counts of Title and Abstract
###'
###'


# <col>_count = number of whitespace-separated words in the raw text;
# rows where the text is missing are skipped by dropna().
for source_col in ('Title', 'Abstract'):
    present = df_S[source_col].dropna()
    df_S[source_col + '_count'] = present.map(lambda value: len(str(value).split()))
#df_S['Full_count'] = df_S['Full Text'].dropna().apply(lambda x: len(str(x).split()))


In [None]:
# Distribution of the number of words per title.
sns.displot(data=df_S, x="Title_count")
plt.show()

In [None]:
# Distribution of the number of words per abstract.
sns.displot(data=df_S, x="Abstract_count")
plt.show()

In [None]:
#sns.displot(df_S,              # specify data
#            x="Full_count") # x-axis feature
#plt.show()

In [None]:
#df_S = df_S[df_S['Abstract_count'] >= 2000]
#filtered_df = df_S[df_S['Abstract_count'] < 20]
#pd.set_option('display.max_colwidth', None)
#filtered_df
#len(df_S)

## 3.2. Word Frequency

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in titles
###' _ unigrams
###'
###'


### count unigrams, ignoring terms found in fewer than 5 titles
countvec = CountVectorizer(ngram_range=(1, 1), min_df=5)
ngrams = countvec.fit_transform(df_S['Title_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in abstracts
###' _ unigrams
###'
###'


### count unigrams, ignoring terms found in fewer than 5 abstracts
countvec = CountVectorizer(ngram_range=(1, 1), min_df=5)
ngrams = countvec.fit_transform(df_S['Abstract_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in full texts
###' _ unigrams
###'
###'

### NOTE(review): this cell reads df_S['Full_join'], but the only code that
### creates that column is commented out in the tokenization cell — confirm
### the column exists before running.

### count unigrams, ignoring terms found in fewer than 5 documents
countvec = CountVectorizer(ngram_range=(1, 1), min_df=5)
ngrams = countvec.fit_transform(df_S['Full_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in titles
###' _ bigrams, trigrams
###'
###'


### count 2- and 3-grams, ignoring terms found in fewer than 3 titles
countvec = CountVectorizer(ngram_range=(2, 3), min_df=3)
ngrams = countvec.fit_transform(df_S['Title_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in abstracts
###' _ bigrams, trigrams
###'
###'


### count 2- and 3-grams, ignoring terms found in fewer than 5 abstracts
countvec = CountVectorizer(ngram_range=(2, 3), min_df=5)
ngrams = countvec.fit_transform(df_S['Abstract_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in full texts
###' _ bigrams, trigrams
###'
###'

### NOTE(review): this cell reads df_S['Full_join'], but the only code that
### creates that column is commented out in the tokenization cell — confirm
### the column exists before running.

### count 2- and 3-grams, ignoring terms found in fewer than 5 documents
countvec = CountVectorizer(ngram_range=(2, 3), min_df=5)
ngrams = countvec.fit_transform(df_S['Full_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in abstracts
###' _ trigrams, 4-grams
###'
###'


### count 3- and 4-grams, ignoring terms found in fewer than 3 abstracts
countvec = CountVectorizer(ngram_range=(3, 4), min_df=3)
ngrams = countvec.fit_transform(df_S['Abstract_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : n-gram frequencies in full texts
###' _ trigrams, 4-grams
###'
###'

### NOTE(review): this cell reads df_S['Full_join'], but the only code that
### creates that column is commented out in the tokenization cell — confirm
### the column exists before running.

### count 3- and 4-grams, ignoring terms found in fewer than 5 documents
countvec = CountVectorizer(ngram_range=(3, 4), min_df=5)
ngrams = countvec.fit_transform(df_S['Full_join'])

### document-term table: one row per document, one column per n-gram
dictionary_dataframe = pd.DataFrame(ngrams.todense(),
                                    columns=countvec.get_feature_names_out())

### column totals as a two-column frame, most frequent n-gram first
df_ngram = dictionary_dataframe.sum().reset_index()
df_ngram.columns = ['ngrams', 'freq']
df_ngram = df_ngram.sort_values(by='freq', ascending=False).reset_index(drop=True)

### ten most frequent n-grams
feature_names = df_ngram['ngrams'].head(10).values
print(feature_names)

### bar chart of the 25 most frequent n-grams
sns.barplot(data=df_ngram.iloc[:25], x="ngrams", y="freq")
plt.xticks(rotation=90)
plt.show()

# 더 넓은 period 설정해서 추출한 논문 random 추출해서 3가지 비교

## frequency 추출은 2단어가 제일 나은듯 함
## title이 제일 별로인거 같고 < 초록 < ? 전문 인듯?

## 한데 topic modeling 끝까지 해봐야 할듯

## keyword로 clustering하고 각 클러스터별 topic modeling?

## topic modeling, 키워드, clustering 차이 확인
## ? 까먹음

### 아닌듯 키워드는 초록으로, topic modeling은 전문으로?

연구문제

키워드 추출하는 방법
1. 각 논문 별 word frequency top N개 추출 -> word embedding -> PCA

- unsupervised
2. title/abstract/fulltext 바탕으로 elbow K보고 clustering 나눠서 -> 클러스터별로 title/abstract/fulltext 바탕으로 topic modeling 결과 + 출판 년도 확인  
3. overlap되는 부분이 있을 때는 어떤 clustering 방법이 최선???  spectral / hierarchical 찾아보기, clustering에도 앙상블 있나?
4. 학회지 / 년도 나눠서 cluster 결과 확인 가능

- supervised
5. 기간 나눠서 -> 기간 별 topic modeling 결과 확인
6. 기간 나눠서 -> 기간을 잘 구별하는 topic이 뭔지 neural network/ensemble 돌리기?


open source에는 word2vec 써서 단어-단어=? 보여주는 창 있어도 괜찮을 듯

## 3.2. Word Clouds

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : Word Cloud
###' title tokens
###'
###'

### Flatten every document's token list into one space-separated string
### (non-list entries contribute an empty string).
TO_text = ' '.join(
    ' '.join(tokens) if isinstance(tokens, list) else ""
    for tokens in df_S['Title_tokens']
)

### Build the cloud; random_state fixes the layout between runs.
TO_wordcloud = WordCloud(random_state=41, background_color="white").generate(TO_text)

### Show the rendered image without axes.
plt.imshow(TO_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : Word Cloud
###' abstract tokens
###'
###'

### Flatten every document's token list into one space-separated string
### (non-list entries contribute an empty string).
TO_text = ' '.join(
    ' '.join(tokens) if isinstance(tokens, list) else ""
    for tokens in df_S['Abstract_tokens']
)

### Build the cloud; random_state fixes the layout between runs.
TO_wordcloud = WordCloud(random_state=41, background_color="white").generate(TO_text)

### Show the rendered image without axes.
plt.imshow(TO_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
###' ################################################################################
###'
###' VISUALIZATION : Word Cloud
###' full-text tokens
###'
###'

### NOTE(review): this cell reads df_S['Full_tokens'], but the only code that
### creates that column is commented out in the tokenization cell — confirm
### the column exists before running.

### Flatten every document's token list into one space-separated string
### (non-list entries contribute an empty string).
TO_text = ' '.join(
    ' '.join(tokens) if isinstance(tokens, list) else ""
    for tokens in df_S['Full_tokens']
)

### Build the cloud; random_state fixes the layout between runs.
TO_wordcloud = WordCloud(random_state=41, background_color="white").generate(TO_text)

### Show the rendered image without axes.
plt.imshow(TO_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# 3.3. Keywords with Word Frequency

In [None]:
# Display the working frame df_S as this cell's output.
df_S

In [None]:
# Per-document keywords: for every row of df_S, take the (up to) ten most
# frequent 2- and 3-grams of its 'Full_join' text and store them as one
# comma-separated string in key_freq (same order as the rows of df_S).
# NOTE(review): 'Full_join' is only created by code that is commented out in
# the tokenization cell — confirm the intended text column.
key_freq = []

for _, row in df_S.iterrows():

    countvec = CountVectorizer(min_df=1, ngram_range=(2, 3))
    try:
        ngrams = countvec.fit_transform([row['Full_join']])
    except ValueError:
        # scikit-learn raises ValueError (e.g. "empty vocabulary") when the
        # document yields no n-gram; record an empty keyword string and keep
        # key_freq aligned with df_S.
        key_freq.append("")
        continue

    # One count per n-gram for this single document, most frequent first.
    counts = pd.Series(ngrams.toarray().ravel(),
                       index=countvec.get_feature_names_out())
    top_ngrams = counts.sort_values(ascending=False).head(10).index
    key_freq.append(", ".join(top_ngrams))


# Give KEY1 the same index as df_S so the join pairs each document with its
# own keywords even when df_S's index is not 0..n-1.
KEY1 = pd.DataFrame(key_freq, columns=['key_freq'], index=df_S.index)
KEY_F1 = df_S.join(KEY1)

# Check the result
KEY_F1.head(5)

In [None]:
# For the first five documents, count how often each of that document's
# extracted keywords (from key_freq) occurs in its 'Full_join' text.
# NOTE(review): 'Full_join' is only created by code that is commented out in
# the tokenization cell — confirm the intended text column.
top_5_docs = df_S.head(5)
top_5_freq_counts = []

for position, (idx, row) in enumerate(top_5_docs.iterrows()):
    # key_freq is a plain list filled in df_S row order, so it must be read by
    # row position; the index label idx is not necessarily a valid list offset.
    # The entry is a comma-separated string, so split it back into n-grams.
    ngram_list = [gram for gram in key_freq[position].split(", ") if gram]
    if not ngram_list:
        # No keywords were extracted for this document.
        continue

    # Restrict the vectorizer to exactly these n-grams.
    countvec = CountVectorizer(ngram_range=(2, 3), vocabulary=ngram_list)

    # Count the n-grams in this document.
    ngrams = countvec.fit_transform([row['Full_join']])

    # One-row frame of counts, tagged with the document's index label.
    ngram_counts = pd.DataFrame(ngrams.toarray(), columns=countvec.get_feature_names_out())
    ngram_counts['document'] = idx
    top_5_freq_counts.append(ngram_counts)

# Combine the per-document counts into one frame (pd.concat rejects an empty list).
if top_5_freq_counts:
    freq_counts_df = pd.concat(top_5_freq_counts).reset_index(drop=True)
else:
    freq_counts_df = pd.DataFrame()

# Check the result
freq_counts_df