In [1]:
###' ################################################################################
###'
###' IMPORT LIBRARIES
###'
###'

### pandas and numpy
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt

### punctuation, stop words and English language model
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS
from spellchecker import SpellChecker
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import en_core_web_sm
nlp = en_core_web_sm.load()
import scattertext as st

### textblob
from textblob import TextBlob

### countvectorizer, tfidfvectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### gensim
import gensim
from gensim import models

### plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### kMeans and silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

### ignore warnings
import warnings
warnings.filterwarnings('ignore')

###time
from datetime import datetime

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [9]:
###' ################################################################################
###'
###' Data Load 
###'
###'

### data load
# Read the raw export; the preview of the cleaned frame is shown at the end of the cell.
df = pd.read_csv("data/7_124(2020-2024).csv")

# Keep only records that have both a title and an abstract.
# The explicit .copy() makes df_clean an independent frame, so the column
# assignments below never target an object pandas may treat as derived from
# `df` (the situation that raises SettingWithCopyWarning, which this
# notebook's global warnings filter would otherwise hide).
df_clean = df.dropna(subset=['Title', 'Abstract']).copy()

# NOTE(review): SpellChecker.correction is documented as a single-word API, but
# it is applied here to whole titles/abstracts. The preview produced by this
# cell shows the *_spell columns identical to the originals, so the step looks
# like a no-op -- confirm, and consider per-word correction if real spell
# fixing is wanted. Behavior is intentionally left unchanged here.
spell = SpellChecker()
df_clean['Title_spell'] = df_clean['Title'].map(spell.correction)
df_clean['Abstract_spell'] = df_clean['Abstract'].map(spell.correction)

# Last expression of the cell: rich preview of the cleaned frame.
df_clean.head(5)

Unnamed: 0,Authors,Author full names,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,...,Abstract,Author Keywords,Index Keywords,Document Type,Publication Stage,Open Access,Source,EID,Title_spell,Abstract_spell
0,Domingue B.W.; Kanopka K.; Stenhaug B.; Soland...,"Domingue, Benjamin W. (37103720900); Kanopka, ...",37103720900; 57207731590; 57214128465; 5596026...,Variation in Respondent Speed and its Implicat...,2021,Journal of Educational Measurement,58.0,3.0,,335.0,...,The more frequent collection of response time ...,,,Article,Final,All Open Access; Green Open Access,Scopus,2-s2.0-85108352505,Variation in Respondent Speed and its Implicat...,The more frequent collection of response time ...
1,Baldwin P.; Yaneva V.; Mee J.; Clauser B.E.; H...,"Baldwin, Peter (34867881100); Yaneva, Victoria...",34867881100; 57003253500; 23498092200; 7003595...,Using Natural Language Processing to Predict I...,2021,Journal of Educational Measurement,58.0,1.0,,4.0,...,"In this article, it is shown how item text can...",,,Article,Final,All Open Access; Green Open Access,Scopus,2-s2.0-85102484910,Using Natural Language Processing to Predict I...,"In this article, it is shown how item text can..."
2,Lim H.; Choe E.M.; Han K.T.,"Lim, Hwanggyu (57216896756); Choe, Edison M. (...",57216896756; 57197771137; 19638651100,A Residual-Based Differential Item Functioning...,2022,Journal of Educational Measurement,59.0,1.0,,80.0,...,Differential item functioning (DIF) of test it...,,,Article,Final,,Scopus,2-s2.0-85127265688,A Residual-Based Differential Item Functioning...,Differential item functioning (DIF) of test it...
3,Liao X.; Bolt D.M.; Kim J.-S.,"Liao, Xiangyi (57222607143); Bolt, Daniel M. (...",57222607143; 57223443138; 8849255200,Curvilinearity in the Reference Composite and ...,2024,Journal of Educational Measurement,,,,,...,Item difficulty and dimensionality often corre...,,,Article,Article in press,All Open Access; Hybrid Gold Open Access,Scopus,2-s2.0-85195021316,Curvilinearity in the Reference Composite and ...,Item difficulty and dimensionality often corre...
4,Kim S.Y.; Lee W.-C.,"Kim, Stella Y. (57207794797); Lee, Won-Chan (5...",57207794797; 57203094500,Several Variations of Simple-Structure MIRT Eq...,2023,Journal of Educational Measurement,60.0,1.0,,76.0,...,The current study proposed several variants of...,,,Article,Final,All Open Access; Hybrid Gold Open Access,Scopus,2-s2.0-85135020339,Several Variations of Simple-Structure MIRT Eq...,The current study proposed several variants of...


In [None]:
###' ################################################################################
###'
###' Function for Deleting Punctuation and Stop Words
###'
###'

### define function
# Project-specific stop words added on top of spaCy's English list:
# calendar terms plus a few social-media artefacts.
_EXTRA_STOP_WORDS = frozenset({
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    "january", "february", "march", "april", "may", "june", "july", "august", "september",
    "october", "november", "december", "time", "today", "yesterday", "tomorrow",
    "day", "week", "month", "year", "amp", "RT", "QT",
})


def rem_punc_stop(text):
    """Tokenize ``text`` into lemmas with punctuation, links, mentions and stop words removed.

    Steps: drop whitespace-separated tokens that start with "@" or "http",
    strip every character in ``string.punctuation``, run the spaCy pipeline
    ``nlp``, take each token's lemma, and discard stop words.

    Parameters
    ----------
    text : str or None
        Raw text. Any non-string value (``None``, or the float NaN pandas uses
        for missing text) is treated as missing.

    Returns
    -------
    list of str or None
        The remaining lemmas in document order, or ``None`` when ``text`` is
        not a string.

    Notes
    -----
    As a side effect, ``nlp.Defaults.stop_words`` is extended in place with
    ``_EXTRA_STOP_WORDS`` on every call (idempotent).
    """
    # Missing values: a bare `is None` check lets NaN through, which would then
    # fail when iterated character by character.
    if not isinstance(text, str):
        return None

    # Keep the original side effect on the shared spaCy defaults.
    nlp.Defaults.stop_words |= _EXTRA_STOP_WORDS

    # Build the filter set explicitly so it does not depend on STOP_WORDS and
    # nlp.Defaults.stop_words being the very same set object.
    stop_words = set(STOP_WORDS) | _EXTRA_STOP_WORDS

    punc = set(punctuation)

    # Drop mentions and links BEFORE stripping punctuation: "@" is itself a
    # punctuation character, so once it is removed a mention can no longer be
    # recognised and would leak through as an ordinary word.
    kept = [tok for tok in text.split() if not tok.startswith(("@", "http"))]

    # remove the punctuation
    punc_free = "".join(ch for ch in " ".join(kept) if ch not in punc)

    # lemmatize with spaCy, then re-split on whitespace
    doc = nlp(punc_free)
    lemmas = " ".join(token.lemma_ for token in doc).split()

    return [
        word
        for word in lemmas
        # links that were glued to leading punctuation, e.g. "(http://..."
        if not word.startswith("http")
        # exact match keeps the upper-case entries ("RT", "QT") working
        and word not in stop_words
        # lower-cased match catches lemmas that retain capitalization
        # (e.g. "Monday"), which the all-lowercase entries would otherwise miss
        and word.lower() not in stop_words
    ]

In [None]:
###' ################################################################################
###'
###' Apply the Function and Tokenize Text Column
###'
###'

# Tokenize every spell-checked abstract (one token list per row)
df_clean['tokens'] = df_clean['Abstract_spell'].map(rem_punc_stop)

# Flatten the per-document token lists into a single corpus string;
# rows whose tokens are not a list contribute an empty string
per_doc_text = [
    ' '.join(tokens) if isinstance(tokens, list) else ""
    for tokens in df_clean['tokens']
]
TO_text = ' '.join(per_doc_text)

# Build the word cloud from the corpus string (fixed seed for a stable layout)
wordcloud_options = dict(background_color="white", random_state=41)
TO_wordcloud = WordCloud(**wordcloud_options).generate(TO_text)

# Render the cloud as an image without axes
plt.imshow(TO_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()