In [None]:
!python -m pip install --user --upgrade pip
!pip install tmtoolkit  #installing the tmtoolkit
!pip install pyLDAvis
!pip install statsmodels --upgrade
!pip install pandas --upgrade

!pip install sentence-transformers 

In [2]:
import warnings # import the warnings
warnings.filterwarnings("ignore") #ignoring the warnings


import tmtoolkit
import numpy as np # importing the numpy
import pandas as pd #importing the pandas
import re, nltk, gensim # importing re, nltk, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation # importing lda using sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer #importing Count and Tfidf
from sentence_transformers import SentenceTransformer #importing senetence transfomer



# Plotting tools
import pyLDAvis #importing pyldavis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt #importing the matplot
%matplotlib inline

#performing preprocessing on data
import string  #importing the string 
from nltk.corpus import wordnet #wordnet
from nltk.tokenize import word_tokenize #importing the word_tokenize
from nltk.stem import WordNetLemmatizer #importing the WordNetLemmatizer
# Fetch the NLTK resources that the preprocessing helpers below rely on.
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('punkt')  # tokenizer data used by word_tokenize
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer
nltk.download('stopwords')  # word lists read through stopwords.words


  from collections import Iterable


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the raw CSV. The file has no header row, so the two columns are named
# explicitly; it is decoded as latin-1.
df = pd.read_csv(
    'all-data (1).csv',
    encoding='latin-1',
    header=None,
    names=['sentiment', 'News'],
)

#convert to lowercase, strip and remove punctuations
def preprocess(text):
    """Normalise one raw document into lowercase, letters-only text.

    Steps, in order: lowercase and strip; drop ``<...>`` tags; drop e-mail
    addresses; drop dotted IPv4-like sequences (matched only when every octet
    is followed by ``.`` or the end of the text); replace ASCII punctuation
    with spaces; drop any remaining non-word characters; replace digits with
    spaces; collapse whitespace runs to a single space.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as an empty document.

    Returns
    -------
    str
        The cleaned text. Because digits are blanked after the last strip,
        the result can carry a single leading or trailing space.
    """
    # Missing values have nothing to clean; avoid AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower().strip()

    # All patterns are raw strings: '\.' and '\s' in ordinary string literals
    # are invalid escape sequences (a warning today, an error in the future).
    text = re.sub(r'<.*?>', '', text)                                  # markup tags
    text = re.sub(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', text)         # e-mail addresses
    text = re.sub(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', text)  # IPv4-like
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)   # ASCII punctuation
    text = re.sub(r'\s+', ' ', text)
    # Kept for parity with the original pipeline; brackets were already
    # replaced by the punctuation step, so this no longer matches anything.
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    # Text is already lowercase here, so only the strip is still needed.
    text = re.sub(r'[^\w\s]', '', text.strip())                        # non-ASCII symbols
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text


  
# STOPWORD REMOVAL
def stopword(string):
    """Remove English stop words from a whitespace-separated string.

    Parameters
    ----------
    string : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    str
        The tokens that are not in NLTK's English stop-word list, joined by
        single spaces in their original order.
    """
    # Load the stop-word list once per call and use a set for O(1) lookups;
    # previously stopwords.words('english') was re-evaluated for every token
    # and searched linearly.
    english_stopwords = frozenset(stopwords.words('english'))
    kept = [token for token in string.split() if token not in english_stopwords]
    return ' '.join(kept)



#LEMMATIZATION
# Initialize the lemmatizer


# NOTE: `stopwords` is referenced inside stopword() defined above, so this
# import must have executed before stopword() is first called.
from nltk.corpus import stopwords 
# Shared lemmatizer instance used by lemmatizer() below.
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    """Map an NLTK part-of-speech tag to the corresponding WordNet constant.

    The decision is made on the tag's first letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)
# Tokenize the sentence
def lemmatizer(string):
    """Lemmatize every token of `string` using its part-of-speech tag.

    The text is tokenized, each token is POS-tagged, the tag is translated
    with get_wordnet_pos(), and the resulting lemmas are joined with spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos)))
    return " ".join(lemmas)


def finalpreprocess(string):
    """Run the full cleaning pipeline on one document.

    Applies preprocess(), then stopword(), then lemmatizer(), and returns
    the lemmatized text.
    """
    normalised = preprocess(string)
    without_stopwords = stopword(normalised)
    return lemmatizer(without_stopwords)


In [6]:
# Clean every news item and keep the result alongside the raw text.
df['Clean_text'] = df['News'].apply(finalpreprocess)
# Show the cleaned column as the cell output.
df['Clean_text']

0       accord gran company plan move production russi...
1       technopolis plan develop stage area less squar...
2       international electronic industry company elco...
3       new production plant company would increase ca...
4       accord company update strategy year basware ta...
                              ...                        
4841    london marketwatch share price end lower londo...
4842    rinkuskiai beer sale fell per cent million lit...
4843    operating profit fell eur mn eur mn include ve...
4844    net sale paper segment decrease eur mn second ...
4845    sale finland decreased january sale outside fi...
Name: Clean_text, Length: 4846, dtype: object

In [7]:
from collections import OrderedDict

# Keep only the last row for each distinct cleaned text and renumber the index.
df = df.drop_duplicates(subset=['Clean_text'], keep='last').reset_index(drop=True)

# Remove repeated words inside each document while preserving first-seen order.
df['Desired'] = (
    df['Clean_text']
    .str.split()
    .apply(lambda tokens: OrderedDict.fromkeys(tokens).keys())
    .str.join(' ')
)

In [8]:
# Drop single-character tokens, then collapse whitespace runs.
# regex=True is passed explicitly: both patterns are regular expressions, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# turn these two replacements into silent no-ops.
df['Desired'] = (
    df['Desired']
    .str.replace(r'\b\w\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

# LDA with CountVectorizer

In [9]:
def C_V(X): #define function for CountVectorizer
    """Vectorize documents into a bag-of-words document-term matrix.

    Parameters
    ----------
    X : iterable of str
        The documents to vectorize.

    Returns
    -------
    vectorizer : CountVectorizer
        The fitted vectorizer.
    data_vectorized : sparse matrix
        Document-term count matrix produced by ``fit_transform(X)``.
    vocab : list of str
        The terms of the fitted vocabulary.
    """
    vectorizer = CountVectorizer(analyzer='word',
                                 min_df=10,                        # keep terms found in at least 10 documents
                                 stop_words='english',             # remove stop words
                                 lowercase=True,                   # convert all words to lowercase
                                 token_pattern='[a-zA-Z0-9]{3,}',  # tokens of 3 or more alphanumeric chars
                                )

    data_vectorized = vectorizer.fit_transform(X)  # fit on the text and convert it into count vectors

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older releases. A plain list is
    # returned in both cases so the return type does not depend on the version.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()

    return vectorizer,data_vectorized,vocab

In [10]:
import tmtoolkit #importing the tmtoolkit
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
def topic_model_coherence_generator(topic_num_start=2,
                                    topic_num_end=26,
                                    norm_corpus='',
                                    cv_matrix='',
                                    cv=''):
    """Fit one LDA model per topic count and score each with c_v coherence.

    Parameters
    ----------
    topic_num_start, topic_num_end : int
        Topic counts ``range(topic_num_start, topic_num_end)`` are evaluated
        (the end value is exclusive).
    norm_corpus : iterable of str
        The documents; each is split on whitespace to obtain its tokens.
    cv_matrix : matrix
        Document-term matrix the LDA models are fitted on.
    cv : vectorizer
        Vectorizer that is (re)fitted on `norm_corpus` to supply the
        document-term matrix and vocabulary for the coherence metric.

    Returns
    -------
    models : list
        The fitted LatentDirichletAllocation models, one per topic count.
    coherence_scores : list
        Mean coherence over the topics of each model, in the same order.
    """
    models = []            # fitted model for each topic count
    coherence_scores = []  # mean coherence for each topic count

    topic_counts = range(topic_num_start, topic_num_end)
    if len(topic_counts) == 0:
        return models, coherence_scores

    norm_corpus_tokens = [doc.split() for doc in norm_corpus]

    # These inputs do not depend on the topic count, so compute them once
    # instead of re-fitting the vectorizer on every loop iteration.
    dtm = cv.fit_transform(norm_corpus)
    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    vocab = np.array(list(feature_names))

    for n_components in topic_counts:
        print(n_components)  # progress indicator
        cur_lda = LatentDirichletAllocation(n_components=n_components,
                                            random_state=0)
        cur_lda.fit(cv_matrix)  # the transformed output was never used
        cur_coherence_score = metric_coherence_gensim(
            measure='c_v',                           # coherence measure
            top_n=5,                                 # top 5 words per topic
            topic_word_distrib=cur_lda.components_,  # topic-word weights of the model
            dtm=dtm,
            vocab=vocab,
            texts=norm_corpus_tokens)
        models.append(cur_lda)
        coherence_scores.append(np.mean(cur_coherence_score))
    return models, coherence_scores

In [11]:
def LDA_topic_modelling_CV(X, topic_num_start=2, topic_num_end=26):
    """Pick the best topic count by coherence and fit/visualize a final LDA model.

    Documents are vectorized with C_V(), every topic count in
    ``range(topic_num_start, topic_num_end)`` is scored with
    topic_model_coherence_generator(), and a final online LDA model is fitted
    with the best-scoring count.

    Parameters
    ----------
    X : iterable of str
        The documents to model.
    topic_num_start, topic_num_end : int, optional
        Bounds of the topic-count sweep (end exclusive). The same values are
        forwarded to the coherence generator so the two cannot drift apart.

    Returns
    -------
    n_topics : int
        Topic count with the highest coherence (rounded to 4 decimals); on a
        tie the smallest count wins.
    score : float
        That coherence score rounded to 2 decimals.
    panel
        The prepared pyLDAvis visualization of the final model.

    Raises
    ------
    ValueError
        If the topic-count range is empty.
    """
    topic_counts = range(topic_num_start, topic_num_end)
    if len(topic_counts) == 0:
        raise ValueError("topic_num_start must be smaller than topic_num_end")

    vectorizer, data_vectorized, _vocab = C_V(X)

    _models, coherence_scores = topic_model_coherence_generator(
        topic_num_start=topic_num_start,
        topic_num_end=topic_num_end,
        norm_corpus=X,              # text
        cv=vectorizer,              # CountVectorizer
        cv_matrix=data_vectorized)  # data from CountVectorizer

    # Select the best score directly; NaN scores are ignored.
    rounded_scores = np.round(coherence_scores, 4)
    best_idx = int(np.nanargmax(rounded_scores))
    n_topics = topic_counts[best_idx]
    score = round(rounded_scores[best_idx], 2)

    # Instantiate LDA Model with parameters
    lda_model = LatentDirichletAllocation(n_components=n_topics,   # Number of topics
                                          max_iter=10,             # Max learning iterations
                                          learning_method='online',
                                          random_state=100,        # Random state
                                          batch_size=128,          # n docs in each learning iter
                                          evaluate_every=-1,       # compute perplexity every n iters, default: Don't
                                          n_jobs=-1,               # Use all available CPUs
                                          )
    lda_model.fit(data_vectorized)  # the document-topic matrix was never used

    pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

    return n_topics,score,panel




In [12]:
%%time
# Sweeps topic counts 2..25 on the de-duplicated text using CountVectorizer
# features; the recorded run took about 4.5 minutes of wall time.
n_topics,score,panel=LDA_topic_modelling_CV(df['Desired'])

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
CPU times: user 4min 17s, sys: 1.23 s, total: 4min 18s
Wall time: 4min 36s


In [13]:
n_topics,score

(10, 0.63)

In [14]:
panel

# LDA with TfidfVectorizer

In [15]:
def tfidf_V(X): #define function for TfidfVectorizer
    """Vectorize documents into a TF-IDF weighted document-term matrix.

    Parameters
    ----------
    X : iterable of str
        The documents to vectorize.

    Returns
    -------
    vectorizer : TfidfVectorizer
        The fitted vectorizer.
    data_vectorized : sparse matrix
        TF-IDF document-term matrix produced by ``fit_transform(X)``.
    vocab : list of str
        The terms of the fitted vocabulary.
    """
    vectorizer = TfidfVectorizer(analyzer='word',
                                 min_df=10,                        # keep terms found in at least 10 documents
                                 stop_words='english',             # remove stop words
                                 lowercase=True,                   # convert all words to lowercase
                                 token_pattern='[a-zA-Z0-9]{3,}',  # tokens of 3 or more alphanumeric chars
                                )

    data_vectorized = vectorizer.fit_transform(X)  # fit on the text and convert it into TF-IDF vectors

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older releases. A plain list is
    # returned in both cases so the return type does not depend on the version.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()

    return vectorizer,data_vectorized,vocab

In [16]:
import tmtoolkit #importing the tmtoolkit
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
def topic_model_coherence_generator(topic_num_start=2,
                                    topic_num_end=26,
                                    norm_corpus='',
                                    cv_matrix='',
                                    cv=''):
    """Fit one LDA model per topic count and score each with c_v coherence.

    NOTE: this re-declares the function of the same name defined earlier in
    the notebook and replaces it once this cell runs. Unlike the earlier
    definition it does not print the topic count while iterating.

    Parameters
    ----------
    topic_num_start, topic_num_end : int
        Topic counts ``range(topic_num_start, topic_num_end)`` are evaluated
        (the end value is exclusive).
    norm_corpus : iterable of str
        The documents; each is split on whitespace to obtain its tokens.
    cv_matrix : matrix
        Document-term matrix the LDA models are fitted on.
    cv : vectorizer
        Vectorizer that is (re)fitted on `norm_corpus` to supply the
        document-term matrix and vocabulary for the coherence metric.

    Returns
    -------
    models : list
        The fitted LatentDirichletAllocation models, one per topic count.
    coherence_scores : list
        Mean coherence over the topics of each model, in the same order.
    """
    models = []            # fitted model for each topic count
    coherence_scores = []  # mean coherence for each topic count

    topic_counts = range(topic_num_start, topic_num_end)
    if len(topic_counts) == 0:
        return models, coherence_scores

    norm_corpus_tokens = [doc.split() for doc in norm_corpus]

    # These inputs do not depend on the topic count, so compute them once
    # instead of re-fitting the vectorizer on every loop iteration.
    dtm = cv.fit_transform(norm_corpus)
    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    vocab = np.array(list(feature_names))

    for n_components in topic_counts:
        cur_lda = LatentDirichletAllocation(n_components=n_components,
                                            random_state=0)
        cur_lda.fit(cv_matrix)  # the transformed output was never used
        cur_coherence_score = metric_coherence_gensim(
            measure='c_v',                           # coherence measure
            top_n=5,                                 # top 5 words per topic
            topic_word_distrib=cur_lda.components_,  # topic-word weights of the model
            dtm=dtm,
            vocab=vocab,
            texts=norm_corpus_tokens)
        models.append(cur_lda)
        coherence_scores.append(np.mean(cur_coherence_score))
    return models, coherence_scores

In [17]:
def LDA_topic_modelling_TF(X, topic_num_start=2, topic_num_end=26):
    """Pick the best topic count by coherence and fit/visualize a final LDA model.

    Documents are vectorized with tfidf_V(), every topic count in
    ``range(topic_num_start, topic_num_end)`` is scored with
    topic_model_coherence_generator(), and a final online LDA model is fitted
    with the best-scoring count.

    Parameters
    ----------
    X : iterable of str
        The documents to model.
    topic_num_start, topic_num_end : int, optional
        Bounds of the topic-count sweep (end exclusive). The same values are
        forwarded to the coherence generator so the two cannot drift apart.

    Returns
    -------
    n_topics : int
        Topic count with the highest coherence (rounded to 4 decimals); on a
        tie the smallest count wins.
    score : float
        That coherence score rounded to 2 decimals.
    panel
        The prepared pyLDAvis visualization of the final model.

    Raises
    ------
    ValueError
        If the topic-count range is empty.
    """
    topic_counts = range(topic_num_start, topic_num_end)
    if len(topic_counts) == 0:
        raise ValueError("topic_num_start must be smaller than topic_num_end")

    vectorizer, data_vectorized, _vocab = tfidf_V(X)

    _models, coherence_scores = topic_model_coherence_generator(
        topic_num_start=topic_num_start,
        topic_num_end=topic_num_end,
        norm_corpus=X,              # text
        cv=vectorizer,              # TfidfVectorizer
        cv_matrix=data_vectorized)  # data from TfidfVectorizer

    # Select the best score directly; NaN scores are ignored.
    rounded_scores = np.round(coherence_scores, 4)
    best_idx = int(np.nanargmax(rounded_scores))
    n_topics = topic_counts[best_idx]
    score = round(rounded_scores[best_idx], 2)

    # Instantiate LDA Model with parameters
    lda_model = LatentDirichletAllocation(n_components=n_topics,   # Number of topics
                                          max_iter=10,             # Max learning iterations
                                          learning_method='online',
                                          random_state=100,        # Random state
                                          batch_size=128,          # n docs in each learning iter
                                          evaluate_every=-1,       # compute perplexity every n iters, default: Don't
                                          n_jobs=-1,               # Use all available CPUs
                                          )
    lda_model.fit(data_vectorized)  # the document-topic matrix was never used

    pyLDAvis.enable_notebook()  # visualizing the topics
    panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

    return n_topics,score,panel




In [18]:
%%time
# Same sweep using TF-IDF features. This reuses the names n_topics, score and
# panel, so the CountVectorizer results from the earlier run are overwritten.
# The recorded run took a little over 3 minutes of wall time.
n_topics,score,panel=LDA_topic_modelling_TF(df['Desired'])

CPU times: user 3min 4s, sys: 1.31 s, total: 3min 5s
Wall time: 3min 13s


In [19]:
n_topics,score

(7, 0.58)

In [20]:
panel