In [1]:
import pandas as pd
# Import libraries for text preprocessing
import re
import nltk

# You only need to download these resources once. After you run this 
# the first time--or if you know you already have these installed--
# you can comment these two lines out (with a #)
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kinga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kinga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [104]:
#file that combines the metadata downoladed, the generated text and users data from the survey
df = pd.read_csv("text-metadata-survey.csv")
df = df.rename(columns={'text' :"col"})

In [15]:
#filtering for a specific characteristic of the users I am interested in
df = df[df['Concerned'] > 0]

In [111]:
df = df.drop_duplicates(subset=["col"])
df = df.dropna(subset=["col"])
df = df.dropna(subset=["video_id"])

In [20]:
df = df[df['language'] =='en']

addapted from https://nicharuc.github.io/topic_modeling/

### Preprocessing

In [121]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents([comment.split() for comment in df.col])
# Filter only those that occur at least 50 times
finder.apply_freq_filter(10)
bigram_scores = finder.score_ngrams(bigram_measures.pmi)

In [122]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.TrigramCollocationFinder.from_documents([comment.split() for comment in df.col])
# Filter only those that occur at least 50 times
finder.apply_freq_filter(10)
trigram_scores = finder.score_ngrams(trigram_measures.pmi)

In [116]:
bigram_pmi = pd.DataFrame(bigram_scores)
bigram_pmi.columns = ['bigram', 'pmi']
bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)

In [117]:
trigram_pmi = pd.DataFrame(trigram_scores)
trigram_pmi.columns = ['trigram', 'pmi']
trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)

In [73]:
# Filter for bigrams with only noun-type structures
def bigram_filter(bigram):
    tag = nltk.pos_tag(bigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['NN']:
        return False
    if bigram[0] in stop_word_list or bigram[1] in stop_word_list:
        return False
    if 'n' in bigram or 't' in bigram:
        return False
    if 'PRON' in bigram:
        return False
    return True

In [74]:
# Filter for trigrams with only noun-type structures
def trigram_filter(trigram):
    tag = nltk.pos_tag(trigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['JJ','NN']:
        return False
    if trigram[0] in stop_word_list or trigram[-1] in stop_word_list or trigram[1] in stop_word_list:
        return False
    if 'n' in trigram or 't' in trigram:
         return False
    if 'PRON' in trigram:
        return False
    return True 

In [118]:
titok_stopwords = ['fyp', "foryou", "foryoupage", "greenscreen", "dlaciebie", "fy", "duet", "stitch", "reply"]

path = "C:/Users/Kinga/Desktop/Research Master/Newsflow/Code/"
my_file = open(path + "polish.stopwords.txt", "r", encoding='utf-8')
polish_stopwords = my_file.read().split('\n')
stop_word_list = list(set(stopwords.words("dutch") + stopwords.words("english") + polish_stopwords + titok_stopwords))

In [123]:
# Can set pmi threshold to whatever makes sense - eyeball through and select threshold where n-grams stop making sense
# choose top 500 ngrams in this case ranked by PMI that have noun like structures
filtered_bigram = bigram_pmi[bigram_pmi.apply(lambda bigram:\
                                              bigram_filter(bigram['bigram'])\
                                              and bigram.pmi > 5, axis = 1)][:500]

filtered_trigram = trigram_pmi[trigram_pmi.apply(lambda trigram: \
                                                 trigram_filter(trigram['trigram'])\
                                                 and trigram.pmi > 5, axis = 1)][:500]


bigrams = [' '.join(x) for x in filtered_bigram.bigram.values if len(x[0]) > 2 or len(x[1]) > 2]
trigrams = [' '.join(x) for x in filtered_trigram.trigram.values if len(x[0]) > 2 or len(x[1]) > 2 and len(x[2]) > 2]

In [79]:
# Concatenate n-grams
def replace_ngram(x):
    for gram in trigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    for gram in bigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    return x

In [80]:
rl = df[["col"]]

In [81]:
rl.col = rl.col.map(lambda x: replace_ngram(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [82]:
# Pre-process dataset to get a cleaned and normalised text corpus
corpus = []
df['word_count'] = rl.col.apply(lambda x: len(str(x).split(" ")))
ds_count = len(df['word_count'])
for i in range(0, ds_count):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', str(rl.col.iloc[i]))
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
    # Remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    # Convert to list from string
    text = text.split()
    
    # Stemming
    ps=PorterStemmer()
    
    # Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if not word in  
            stop_word_list] 
    corpus.append(text)

rl.col = corpus

In [84]:
# Filter for only nouns
def noun_only(x):
    pos_comment = nltk.pos_tag(x)
    filtered = [word[0] for word in pos_comment if word[1] in ['NN']]
    # to filter both noun and verbs
    #filtered = [word[0] for word in pos_comment if word[1] in ['NN','VB', 'VBD', 'VBG', 'VBN', 'VBZ']]
    return filtered

In [85]:
corpus = rl.col.map(noun_only)

----

In [86]:
text = []
for d in corpus:
    text.append(' '.join(d))

### Model

https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [87]:
import bitermplus as btm

In [88]:
import bitermplus as btm
import numpy as np
import pandas as pd

In [89]:

# IMPORTING DATA

texts = text

# PREPROCESSING
# Obtaining terms frequency in a sparse matrix and corpus vocabulary
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
tf = np.array(X.sum(axis=0)).ravel()
# Vectorizing documents
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
docs_lens = list(map(len, docs_vec))
# Generating biterms
biterms = btm.get_biterms(docs_vec)

# INITIALIZING AND RUNNING MODEL
model = btm.BTM(
    X, vocabulary, seed=12321, T= 10, M=10, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=60)
p_zd = model.transform(docs_vec)

# METRICS
perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 10)
coherence = btm.coherence(model.matrix_topics_words_, X, M=10)
# or
perplexity = model.perplexity_
coherence = model.coherence_

# LABELS
model.labels_
# or
btm.get_docs_top_topic(texts, model.matrix_docs_topics_)

100%|██████████| 60/60 [00:00<00:00, 2314.65it/s]
100%|██████████| 55/55 [00:00<00:00, 18384.34it/s]


Unnamed: 0,documents,label
0,mariupol odessa news russianplane law russiavs...,6
1,childhood damage,3
2,russian,1
3,moment everything oldcat bond,3
4,war height internationalwomensday christianity,5
5,senator kennedy embarrassment shame johnkenned...,3
6,president zelenskiy government consider sex pa...,9
7,sound,1
8,plu mm year anatomy anatomylesson traditionala...,7
9,russianinvasion slavaukraine foodinsecurity wh...,7


### Visualization

In [90]:
import tmplot as tmp

In [91]:
#visualsisation
tmp.report(model=model, docs=texts)

VBox(children=(VBox(children=(HBox(children=(HTML(value='<b>Select a topic</b>:'), Dropdown(options=((0, 0), (…