In [46]:
# Display the installed scikit-learn version for reference
import sklearn
sklearn.__version__

'1.2.2'

In [5]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): no category or module filter is given, so warnings raised
# by the libraries used below are hidden too — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [24]:
import pandas as pd


## Load the dataframe from train.csv, keep only the title and abstract
## columns, and rename them to the names used in the rest of the notebook
df = (
    pd.read_csv('train.csv')[['TITLE', 'ABSTRACT']]
    .rename(columns={"TITLE": "title", "ABSTRACT": "text"})
)

# Defining Cleaning Steps

In [18]:
## TOKENISATION WITH GENSIM

# Import function to convert a document into a list of tokens
# This lowercases, tokenizes, and optionally de-accents the text
# The output is a list of tokens (unicode strings)
from gensim.utils import simple_preprocess

# Function to tokenize the text
def tokenize(text):
    """Turn one raw document into a list of tokens.

    Args:
        text: The document to tokenize. It is passed through ``str()``
            first, so non-string values are tokenized via their string form.

    Returns:
        The token list produced by gensim's ``simple_preprocess`` with
        de-accenting enabled (``deacc=True``).
    """
    return simple_preprocess(str(text), deacc=True)

In [19]:
## STOPWORD REMOVAL

import nltk # Library with access to corpora and lexical resources
# Fetch the 'stopwords' corpus; this call runs on every execution of the cell
# (the recorded output shows it reporting "already up-to-date" when cached)
nltk.download('stopwords')
# Corpus reader that exposes the downloaded stopword lists
from nltk.corpus import stopwords
# English stopwords, stored in the module-level name that
# remove_stopword() reads when filtering tokens
stop_words = stopwords.words('english')

# Function to remove stopwords
def remove_stopword(text, stopword_list=None):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: Iterable of tokens (strings) for one document.
        stopword_list: Optional iterable of words to discard. When omitted,
            the module-level ``stop_words`` collection is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # A set gives constant-time membership checks; scanning the original
    # list for every token costs time proportional to the stopword count.
    blocked = frozenset(stopword_list)
    return [token for token in text if token not in blocked]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ivan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
import spacy # Library for NLP
# Load the 'en_core_web_sm' spaCy pipeline with the 'parser' and 'ner'
# components disabled; lemmatize() below only reads each token's
# part-of-speech tag and lemma
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Function to lemmatize the tokens
def lemmatize(tokens, allowed_postags=["NOUN", "ADJ", "VERB"]):
    """Lemmatize a token list, keeping only selected parts of speech.

    The tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, so the tokens that come back are
    the ones spaCy produces from the joined string.

    Args:
        tokens: Iterable of string tokens for one document.
        allowed_postags: Part-of-speech tags whose tokens are kept. The
            default list is only read here, never modified.

    Returns:
        A list with the lemma of every spaCy token whose ``pos_`` is in
        ``allowed_postags``, in document order.
    """
    doc = nlp(" ".join(tokens))
    return [word.lemma_ for word in doc if word.pos_ in allowed_postags]

# LDA using Scikit-Learn

Cleaning Steps:

1. Tokenisation
2. Stopword removal
3. Lemmatisation

In [32]:
# Stage 1: split every abstract into a list of tokens
df['tokenised_text'] = df['text'].map(tokenize)

# Stage 2: drop the stopwords from each token list
df['removed_stopwords'] = df['tokenised_text'].map(remove_stopword)

# Stage 3: keep the lemmas of the tokens with an allowed part of speech
df['lemmatise'] = df['removed_stopwords'].map(lemmatize)

# Stage 4: join each lemma list back into one space-separated string
df['clean_text'] = df['lemmatise'].map(" ".join)

In [33]:
# Preview the first rows: the original columns next to the output of each cleaning stage
df.head()

Unnamed: 0,title,text,tokenised_text,removed_stopwords,lemmatise,clean_text
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,"[predictive, models, allow, subject, specific,...","[predictive, models, allow, subject, specific,...","[predictive, model, allow, subject, specific, ...",predictive model allow subject specific infere...
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,"[rotation, invariance, and, translation, invar...","[rotation, invariance, translation, invariance...","[rotation, invariance, translation, invariance...",rotation invariance translation invariance gre...
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,"[we, introduce, and, develop, the, notion, of,...","[introduce, develop, notion, spherical, polyha...","[introduce, develop, notion, spherical, polyha...",introduce develop notion spherical polyharmoni...
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,"[the, stochastic, landau, lifshitz, gilbert, l...","[stochastic, landau, lifshitz, gilbert, llg, e...","[stochastic, landau, equation, couple, maxwell...",stochastic landau equation couple maxwell equa...
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,"[fourier, transform, infra, red, ftir, spectra...","[fourier, transform, infra, red, ftir, spectra...","[fouri, spectra, sample, plant, specie, use, e...",fouri spectra sample plant specie use explore ...


## Preprocessing Data

In [49]:
# Import function to convert a collection of text documents to a 
# matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Configure the term-count vectorizer:
#   - strip accents, lowercase, and drop scikit-learn's English stopwords
#   - keep only purely alphabetic tokens of three or more letters
#   - max_df is a float (share of documents) and min_df an int (document
#     count): terms above the former or below the latter are ignored
tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.5,
    min_df=10,
)

# Learn the vocabulary from the cleaned text and build the
# document-term count matrix in one call
tf_doc_term_matrix = tf_vectorizer.fit_transform(df['clean_text'])

In [41]:
# Import function to convert a collection of raw documents to a 
# matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer by passing the parameters of
# the term freq vectorizer, so both vectorizers are configured from the
# same settings. This requires tf_vectorizer to be defined first and
# TfidfVectorizer to accept every key returned by get_params().
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())

# Fit the vectorizer and create the document term matrix for TF-IDF
# from the same cleaned text column used for the term freq matrix
tfidf_doc_term_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

## Creating Model

In [42]:
# Import the module that provides the Latent Dirichlet Allocation algorithm
from sklearn.decomposition import LatentDirichletAllocation

# LDA model for term freq
# Initialize the LDA model with 10 topics; random_state is fixed so the
# fit is reproducible
tf_lda_model = LatentDirichletAllocation(n_components=10, random_state=0)
# Fit the model on the term-count matrix
tf_lda_model.fit(tf_doc_term_matrix)

# LDA model for TF-IDF
# Initialize the LDA model with the same hyperparameters as above, so the
# two fits differ only in their input matrix
tfidf_lda_model = LatentDirichletAllocation(n_components=10, random_state=0)
# Fit the model on the TF-IDF weighted matrix
# NOTE(review): LDA is usually described as a model of term counts, and
# the TF-IDF weights fed here are not counts — confirm this is intended.
tfidf_lda_model.fit(tfidf_doc_term_matrix)

In [51]:
import pyLDAvis # Module for interactive topic model visualization

# Function to prepare the LDA model for visualization
from pyLDAvis.lda_model import prepare

# Visualise the term freq model: prepare() is given the fitted LDA model,
# the document-term matrix it was fitted on, and the vectorizer that
# produced that matrix
pyLDAvis_tf = prepare(tf_lda_model, tf_doc_term_matrix, tf_vectorizer)
# Show the prepared visualisation as the cell output
pyLDAvis.display(pyLDAvis_tf)

In [52]:
# Visualise the TF-IDF model: prepare() is given the LDA model fitted on
# the TF-IDF matrix, that matrix, and the TF-IDF vectorizer
pyLDAvis_tfidf = prepare(
    tfidf_lda_model, tfidf_doc_term_matrix, tfidf_vectorizer
)
# Show the prepared visualisation as the cell output
pyLDAvis.display(pyLDAvis_tfidf)

In [55]:
# Keep the topic_info attribute of the TF-IDF visualisation for export.
# NOTE(review): presumably a pandas DataFrame, since to_csv() is called
# on it in the export cell — confirm.
df2 = pyLDAvis_tfidf.topic_info

In [57]:
## Write the topic_info table of the TF-IDF visualisation (df2) to
## output.csv, leaving out the index column
df2.to_csv('output.csv', index=False)