In [1]:
import json
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import umap
import hdbscan
import nltk
# Fetch the NLTK 'punkt' data package (skipped by NLTK when already up to date).
# NOTE(review): nothing in the cells visible here uses punkt directly — confirm it is still needed.
nltk.download('punkt')
# Set pandas' display width (in characters) to 1000 for printed frames.
pd.set_option('display.width', 1000)

# Select the GPU before any CUDA-backed library initialises: devices are
# enumerated in PCI bus order and only device index 3 is exposed to this kernel.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=3


[nltk_data] Downloading package punkt to /home/iharsawi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Original

In [2]:
# Load JSON files.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# decoding does not depend on the locale of the machine running the notebook.
with open('transcripts_BEFORE_COVID.json', 'r', encoding='utf-8') as file:
    before_covid_data = json.load(file)

with open('transcripts_AFTER_COVID.json', 'r', encoding='utf-8') as file:
    after_covid_data = json.load(file)

# Convert to DataFrame; nested keys are flattened into dotted column names
# such as 'channel.ideology'.
before_covid_df = pd.json_normalize(before_covid_data)
after_covid_df = pd.json_normalize(after_covid_data)

print(before_covid_df.shape)
print(after_covid_df.shape)

# Extract necessary columns: the transcript text and its ideology label.
texts_before = before_covid_df[['transcripts', 'channel.ideology']]
texts_after = after_covid_df[['transcripts', 'channel.ideology']]

# Get unique ideologies. This is derived from the pre-COVID frame only, so a
# label that occurs solely in the post-COVID data would not be listed.
ideologies = texts_before['channel.ideology'].unique()

(5714, 21)
(5726, 21)


## Load Summarized

In [3]:
# Load JSON files.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# decoding does not depend on the locale of the machine running the notebook.
with open('summarized_BEFORE_COVID.json', 'r', encoding='utf-8') as file:
    before_covid_data_summarized = json.load(file)

with open('summarized_AFTER_COVID.json', 'r', encoding='utf-8') as file:
    after_covid_data_summarized = json.load(file)

# Convert to DataFrame; nested keys are flattened into dotted column names
# such as 'channel.ideology'.
before_covid_df_summarized = pd.json_normalize(before_covid_data_summarized)
after_covid_df_summarized = pd.json_normalize(after_covid_data_summarized)

print(before_covid_df_summarized.shape)
print(after_covid_df_summarized.shape)

# Extract necessary columns: the summary text and its ideology label.
texts_before_summarized = before_covid_df_summarized[['summary', 'channel.ideology']]
texts_after_summarized = after_covid_df_summarized[['summary', 'channel.ideology']]

(5714, 22)
(5726, 22)


## Number of Tokens

In [4]:
from transformers import AutoTokenizer

# Initialize the tokenizer. In this cell it is only used to count tokens
# (via tokenizer.encode), so the "sequence length is longer than the specified
# maximum" warning printed below is expected: no text is fed through a model here.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Function to calculate average token size
def get_average_token_size(texts, tokenizer):
    """Return the mean number of tokens per text.

    Every text is encoded with ``tokenizer.encode(text, add_special_tokens=True)``,
    so any special tokens the tokenizer adds are part of the count.

    Args:
        texts: Sized iterable of strings (``len()`` is called on it).
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        Total token count divided by the number of texts, or ``0`` when
        ``texts`` is empty.
    """
    n_texts = len(texts)
    token_total = sum(
        len(tokenizer.encode(doc, add_special_tokens=True)) for doc in texts
    )
    if n_texts > 0:
        return token_total / n_texts
    return 0

# Calculate average token size for texts_before and texts_after
# (raw transcripts first, then the summarized versions of the same corpora).
average_token_size_before = get_average_token_size(texts_before['transcripts'], tokenizer)
average_token_size_after = get_average_token_size(texts_after['transcripts'], tokenizer)
average_token_size_before_summarized = get_average_token_size(texts_before_summarized['summary'], tokenizer)
average_token_size_after_summarized = get_average_token_size(texts_after_summarized['summary'], tokenizer)

# Report the four averages, one line per corpus.
print(f'Average token size for texts_before: {average_token_size_before}')
print(f'Average token size for texts_after: {average_token_size_after}')
print(f'Average token size for texts_before_summarized: {average_token_size_before_summarized}')
print(f'Average token size for texts_after_summarized: {average_token_size_after_summarized}')

# Function to calculate max token size
def get_max_token_size(texts, tokenizer):
    """Return the token count of the longest text.

    Every text is encoded with ``tokenizer.encode(text, add_special_tokens=True)``,
    so any special tokens the tokenizer adds are part of the count.

    Args:
        texts: Iterable of strings.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        The largest encoded length, or ``0`` when ``texts`` is empty.
    """
    encoded_lengths = (
        len(tokenizer.encode(doc, add_special_tokens=True)) for doc in texts
    )
    return max(encoded_lengths, default=0)

# Calculate max token size for texts_before and texts_after
# (raw transcripts first, then the summarized versions of the same corpora).
# Note: each corpus is tokenized again here, i.e. a second full pass after the
# average-size computation earlier in this cell.
max_token_size_before = get_max_token_size(texts_before['transcripts'], tokenizer)
max_token_size_after = get_max_token_size(texts_after['transcripts'], tokenizer)
max_token_size_before_summarized = get_max_token_size(texts_before_summarized['summary'], tokenizer)
max_token_size_after_summarized = get_max_token_size(texts_after_summarized['summary'], tokenizer)

# Report the four maxima, one line per corpus.
print(f'Max token size for texts_before: {max_token_size_before}')
print(f'Max token size for texts_after: {max_token_size_after}')
print(f'Max token size for texts_before_summarized: {max_token_size_before_summarized}')
print(f'Max token size for texts_after_summarized: {max_token_size_after_summarized}')

Token indices sequence length is longer than the specified maximum sequence length for this model (2065 > 512). Running this sequence through the model will result in indexing errors


Average token size for texts_before: 1819.2051102555129
Average token size for texts_after: 1841.8499825358017
Average token size for texts_before_summarized: 247.6641582079104
Average token size for texts_after_summarized: 246.74886482710443
Max token size for texts_before: 5605
Max token size for texts_after: 6016
Max token size for texts_before_summarized: 452
Max token size for texts_after_summarized: 461


## Filter

In [5]:
# Narrow all four corpora to the rows labelled "BLACK". The names are rebound
# to the filtered frames, so the unfiltered selections are no longer reachable
# through them after this cell runs.
texts_before = texts_before.loc[texts_before['channel.ideology'].eq("BLACK")]
texts_after = texts_after.loc[texts_after['channel.ideology'].eq("BLACK")]
texts_before_summarized = texts_before_summarized.loc[
    texts_before_summarized['channel.ideology'].eq("BLACK")
]
texts_after_summarized = texts_after_summarized.loc[
    texts_after_summarized['channel.ideology'].eq("BLACK")
]

# List of ideologies to model in the cells below (replaces the array derived
# from the data when the transcripts were loaded).
ideologies = ["BLACK"]

## BASELINE

In [6]:
# Function to train BERTopic with specific configurations
def train_bertopic_for_ideology(texts, ideology):
    """Fit a BERTopic model with library defaults on one ideology's transcripts.

    Args:
        texts: DataFrame with 'transcripts' and 'channel.ideology' columns.
        ideology: Value of 'channel.ideology' whose rows are used as documents.

    Returns:
        Tuple ``(topic_model, topics, probs)``; ``topics`` and ``probs`` are
        the two values returned by ``topic_model.fit_transform``.
    """
    selected = texts['channel.ideology'] == ideology
    ideology_texts = texts.loc[selected, 'transcripts'].tolist()
    print(f"Training model for ideology: {ideology} with {len(ideology_texts)} texts.")

    # Baseline: no component is overridden.
    topic_model = BERTopic()
    fit_output = topic_model.fit_transform(ideology_texts)
    topics, probs = fit_output
    return topic_model, topics, probs

# Dictionaries to store results per ideology: {ideology: (model, topics, probs)}
models_before = {}
models_after = {}

for ideology in ideologies:
    try:
        print("Before COVID")
        model_before, topics_before, probs_before = train_bertopic_for_ideology(
            texts_before, ideology)
        models_before[ideology] = (model_before, topics_before, probs_before)

        print("After COVID")
        model_after, topics_after, probs_after = train_bertopic_for_ideology(
            texts_after, ideology)
        models_after[ideology] = (model_after, topics_after, probs_after)
    except Exception as e:
        # Best effort: report the failure and continue with the next ideology.
        # An ideology whose training failed has no (or only one) stored entry;
        # the comparison loop below skips it.
        print(f"Error occurred for ideology {ideology}: {e}")

# Analyze and compare topics
for ideology in ideologies:
    # Without this guard a training failure above would surface here as an
    # unrelated KeyError instead of the real error that was already printed.
    if ideology not in models_before or ideology not in models_after:
        print(f"Skipping topic comparison for ideology {ideology}: training did not complete.")
        continue

    model_before, topics_before, _ = models_before[ideology]
    model_after, topics_after, _ = models_after[ideology]

    # Get topic representations
    topics_info_before = model_before.get_topic_info()
    topics_info_after = model_after.get_topic_info()

    # The printed count is the row count of get_topic_info(); in the outputs
    # recorded for this cell that table includes the Topic -1 row.
    print(f"\nTopic Representations Before COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_before)}")
    display(topics_info_before)

    print(f"\nTopic Representations After COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_after)}")
    display(topics_info_after)


Before COVID
Training model for ideology: BLACK with 454 texts.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


After COVID
Training model for ideology: BLACK with 464 texts.

Topic Representations Before COVID for BLACK:

Number of Topic: 10


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,183,-1_the_and_to_you,"[the, and, to, you, of, that, in, is, it, this]",[hey everybody how you doing? i'm dr boyce wa...
1,0,60,0_to_and_the_that,"[to, and, the, that, you, she, her, this, of, ...",[I cannot believe. But the tenant already liv...
2,1,52,1_you_so_the_to,"[you, so, the, to, and, is, in, of, like, its]","[Hey guys, welcome back to my channel. My nam..."
3,2,41,2_the_and_of_to,"[the, and, of, to, that, is, you, in, this, they]","[unleash the nihilists and the atheists, and w..."
4,3,37,3_and_to_the_black,"[and, to, the, black, that, in, they, of, you,...",[Here we are back on the Connecting the Dots Y...
5,4,29,4_the_you_to_of,"[the, you, to, of, what, and, that, is, right,...",[Come over here sister. Come talk to me siste...
6,5,19,5_to_the_and_you,"[to, the, and, you, that, on, of, im, out, in]","[What's going on, guys? Get these white glass..."
7,6,13,6_you_and_to_know,"[you, and, to, know, that, the, it, chick, get...",[what's happening. fam lar movement still mov...
8,7,10,7_and_the_to_in,"[and, the, to, in, was, you, that, this, lord,...","[Hi guys, this is your sister Karen Gidden in ..."
9,8,10,8_to_the_that_you,"[to, the, that, you, of, and, is, it, they, have]","[B-1 Brigadiers, it should come as no surprise..."



Topic Representations After COVID for BLACK:

Number of Topic: 4


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,42,-1_the_and_to_you,"[the, and, to, you, that, of, is, in, this, it]","[In the upcoming presentation, we will be disc..."
1,0,228,0_to_the_and_you,"[to, the, and, you, that, of, in, this, is, it]",[If you are not familiar with the concepts of ...
2,1,136,1_the_and_to_you,"[the, and, to, you, of, that, is, in, it, this]","[Good rising brethren, this is Big Judah comin..."
3,2,58,2_the_to_and_of,"[the, to, and, of, you, in, that, we, is, are]",[hey guys welcome back to my channel. thank y...


## Consider basic "pre-processing tricks + c-TF-IDF"

In [7]:
# Function to train BERTopic with specific configurations
def train_bertopic_for_ideology(texts, ideology):
    """Fit BERTopic on one ideology's transcripts with a custom vectorizer and c-TF-IDF.

    Redefines the function of the same name from the earlier cell. The topic
    words come from a ``CountVectorizer`` with English stop words removed and
    1- to 3-grams, weighted by ``ClassTfidfTransformer(reduce_frequent_words=True)``.

    Args:
        texts: DataFrame with 'transcripts' and 'channel.ideology' columns.
        ideology: Value of 'channel.ideology' whose rows are used as documents.

    Returns:
        Tuple ``(topic_model, topics, probs)``; ``topics`` and ``probs`` are
        the two values returned by ``topic_model.fit_transform``.
    """
    selected = texts['channel.ideology'] == ideology
    ideology_texts = texts.loc[selected, 'transcripts'].tolist()
    print(f"Training model for ideology: {ideology} with {len(ideology_texts)} texts.")

    bertopic_kwargs = dict(
        vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(1, 3)),
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
    )
    topic_model = BERTopic(**bertopic_kwargs)
    topics, probs = topic_model.fit_transform(ideology_texts)
    return topic_model, topics, probs

# Dictionaries to store results per ideology: {ideology: (model, topics, probs)}
models_before = {}
models_after = {}

for ideology in ideologies:
    try:
        print("Before COVID")
        model_before, topics_before, probs_before = train_bertopic_for_ideology(
            texts_before, ideology)
        models_before[ideology] = (model_before, topics_before, probs_before)

        print("After COVID")
        model_after, topics_after, probs_after = train_bertopic_for_ideology(
            texts_after, ideology)
        models_after[ideology] = (model_after, topics_after, probs_after)
    except Exception as e:
        # Best effort: report the failure and continue with the next ideology.
        # An ideology whose training failed has no (or only one) stored entry;
        # the comparison loop below skips it.
        print(f"Error occurred for ideology {ideology}: {e}")

# Analyze and compare topics
for ideology in ideologies:
    # Without this guard a training failure above would surface here as an
    # unrelated KeyError instead of the real error that was already printed.
    if ideology not in models_before or ideology not in models_after:
        print(f"Skipping topic comparison for ideology {ideology}: training did not complete.")
        continue

    model_before, topics_before, _ = models_before[ideology]
    model_after, topics_after, _ = models_after[ideology]

    # Get topic representations
    topics_info_before = model_before.get_topic_info()
    topics_info_after = model_after.get_topic_info()

    # The printed count is the row count of get_topic_info(); in the outputs
    # recorded for this cell that table includes the Topic -1 row.
    print(f"\nTopic Representations Before COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_before)}")
    display(topics_info_before)

    print(f"\nTopic Representations After COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_after)}")
    display(topics_info_after)


Before COVID
Training model for ideology: BLACK with 454 texts.
After COVID
Training model for ideology: BLACK with 464 texts.

Topic Representations Before COVID for BLACK:

Number of Topic: 10


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,151,-1_know_dont_going_im,"[know, dont, going, im, like, just, people, th...",[It was really bad. That was the whole focus ...
1,0,62,0_police_know_dont_just,"[police, know, dont, just, like, im, going, sh...","[Hey, family. Happy Friday. This is starting..."
2,1,57,1_africa_oh_yeah_oh oh,"[africa, oh, yeah, oh oh, thank, oh oh oh, lik...",[Please subscribe. Comment down below. Smash...
3,2,54,2_high_people_going_right,"[high, people, going, right, bethel, know, jus...","[Hey, this is JT. Another episode about life ..."
4,3,43,3_black_white_folk_people,"[black, white, folk, people, black folk, black...",[Hello. Welcome again to Connecting the Dots....
5,4,28,4_right_bible_god_read,"[right, bible, god, read, verse, man, thats, p...","[Look, there's the Black Holocaust. I knew it..."
6,5,23,5_yeah_yall_whats_im,"[yeah, yall, whats, im, shit, going, got, man,...",[yo hello all right we're here. what's going ...
7,6,12,6_chick_lame_know_lame chick,"[chick, lame, know, lame chick, like, dont, ju...","[You know, the Me Too movement have turned a l..."
8,7,12,7_vision_lord_beyonce_brothers sisters,"[vision, lord, beyonce, brothers sisters, sist...","[Just with this quick update, Hi guys, this is..."
9,8,12,8_bank america_bank_youtube_respect,"[bank america, bank, youtube, respect, white, ...","[Forgive me, brothers. It has been over a wee..."



Topic Representations After COVID for BLACK:

Number of Topic: 5


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,54,-1_jehovahs_know_going_witnesses,"[jehovahs, know, going, witnesses, im, royal, ...",[How's the royal family? I pray that everyone...
1,0,205,0_black_like_im_dont,"[black, like, im, dont, know, just, going, yal...",[hey yo what up youtube y'all know who it is. ...
2,1,136,1_right_lord_know_going,"[right, lord, know, going, people, thats, like...","[Good rising, brethren. This is Big Judah, co..."
3,2,57,2_africa_african_chinese_people,"[africa, african, chinese, people, thank, know...",[Otherwise you are going to ask and then answe...
4,3,12,3_know_son_kids_like,"[know, son, kids, like, family, dad, want, jus...","[What's happening, fam? LA y'all movement sti..."


## Consider "fine-tuning"

In [8]:
# Function to train BERTopic with specific configurations
def train_bertopic_for_ideology(texts, ideology):
    """Fit BERTopic on one ideology's transcripts with extra topic representations.

    Redefines the function of the same name from the earlier cells. On top of
    the custom vectorizer / c-TF-IDF setup, two additional representations are
    requested under the keys "KeyBERT" and "MMR" (they show up as extra columns
    in the topic-info tables displayed by this cell).

    Args:
        texts: DataFrame with 'transcripts' and 'channel.ideology' columns.
        ideology: Value of 'channel.ideology' whose rows are used as documents.

    Returns:
        Tuple ``(topic_model, topics, probs)``; ``topics`` and ``probs`` are
        the two values returned by ``topic_model.fit_transform``.
    """
    selected = texts['channel.ideology'] == ideology
    ideology_texts = texts.loc[selected, 'transcripts'].tolist()
    print(f"Training model for ideology: {ideology} with {len(ideology_texts)} texts.")

    topic_model = BERTopic(
        # English stop words removed; unigrams up to trigrams as candidate terms.
        vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(1, 3)),
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
        representation_model={
            "KeyBERT": KeyBERTInspired(),
            "MMR": MaximalMarginalRelevance(diversity=0.3),
        },
        # Hyperparameters
        top_n_words=10,
    )
    topics, probs = topic_model.fit_transform(ideology_texts)
    return topic_model, topics, probs

# Dictionaries to store results per ideology: {ideology: (model, topics, probs)}
models_before = {}
models_after = {}

for ideology in ideologies:
    try:
        print("Before COVID")
        model_before, topics_before, probs_before = train_bertopic_for_ideology(
            texts_before, ideology)
        models_before[ideology] = (model_before, topics_before, probs_before)

        print("After COVID")
        model_after, topics_after, probs_after = train_bertopic_for_ideology(
            texts_after, ideology)
        models_after[ideology] = (model_after, topics_after, probs_after)
    except Exception as e:
        # Best effort: report the failure and continue with the next ideology.
        # An ideology whose training failed has no (or only one) stored entry;
        # the comparison loop below skips it.
        print(f"Error occurred for ideology {ideology}: {e}")

# Analyze and compare topics
for ideology in ideologies:
    # Without this guard a training failure above would surface here as an
    # unrelated KeyError instead of the real error that was already printed.
    if ideology not in models_before or ideology not in models_after:
        print(f"Skipping topic comparison for ideology {ideology}: training did not complete.")
        continue

    model_before, topics_before, _ = models_before[ideology]
    model_after, topics_after, _ = models_after[ideology]

    # Get topic representations
    topics_info_before = model_before.get_topic_info()
    topics_info_after = model_after.get_topic_info()

    # The printed count is the row count of get_topic_info(); in the outputs
    # recorded for this cell that table includes the Topic -1 row.
    print(f"\nTopic Representations Before COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_before)}")
    display(topics_info_before)

    print(f"\nTopic Representations After COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_after)}")
    display(topics_info_after)


Before COVID
Training model for ideology: BLACK with 454 texts.
After COVID
Training model for ideology: BLACK with 464 texts.

Topic Representations Before COVID for BLACK:

Number of Topic: 10


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,150,-1_just_know_like_dont,"[just, know, like, dont, going, im, people, th...","[bethel, basically, place, family, yall, bad, ...","[like, dont, im, thats, black, time, look, say...",[This is your brother Malcolm coming at you wi...
1,0,62,0_police_know_dont_im,"[police, know, dont, im, just, like, going, th...","[ashley, jay, situation, shes, investigation, ...","[police, like, black, shes, think, family, did...",[How did you know they were fleeing? Good det...
2,1,61,1_high_people_going_shall,"[high, people, going, shall, right, book, chap...","[scripture, scriptures, chapter verse, verse, ...","[high, verse, bible, read, messiah, god, truth...","[Thy kingdom come, thy will be done, on earth ..."
3,2,52,2_africa_oh_yeah_oh oh,"[africa, oh, yeah, oh oh, oh oh oh, thank, lik...","[blacksit family, blacksit, come africa, afric...","[africa, oh oh oh, ghana, african, say, nigeri...",[Please subscribe. Comment down below. Smash...
4,3,37,3_black_white_folk_black folk,"[black, white, folk, black folk, black people,...","[blacks, black folk, white supremacy, black pe...","[black folk, black people, vote, race, constit...",[Hello. Welcome again to Connecting the Dots....
5,4,28,4_god_right_bible_read,"[god, right, bible, read, verse, man, thats, p...","[jeremiah, tribe, according bible, bible says,...","[god, bible, verse, israel, chapter, white, sa...","[Look, there's the Black Holocaust. I knew it..."
6,5,25,5_shut shut_shut shut shut_shut_yeah,"[shut shut, shut shut shut, shut, yeah, yall, ...","[whats going, door, talking, yall, yeah im, ge...","[shut shut shut, yall, im, shit, like, monday ...",[yo hello all right we're here. what's going ...
7,6,16,6_chick_know_women_lame,"[chick, know, women, lame, like, men, lame chi...","[just lame chick, lame chick, women, everybody...","[chick, women, lame, like, lame chick, theyre,...",[That's why the pimps say hoes come going beca...
8,7,13,7_vision_lord_brothers sisters_sisters,"[vision, lord, brothers sisters, sisters, beyo...","[sister karen, vision lord, pastor, amen bless...","[lord, brothers sisters, benny, benny hinn, ch...","[Hello, this is Risa Sukhari again in Jesus Ch..."
9,8,10,8_bank america_bank_youtube_respect,"[bank america, bank, youtube, respect, white, ...","[white supremacists, white supremacist, white ...","[bank america, bank, youtube, respect, black, ...","[B-1 Brigadiers, it should come as no surprise..."



Topic Representations After COVID for BLACK:

Number of Topic: 8


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,96,-1_know_just_im_people,"[know, just, im, people, like, dont, going, je...","[jehovahs witness, jehovahs witnesses, jehovah...","[im, dont, jehovahs, family, witnesses, theyre...",[Some in the community may feel uneasy as we a...
1,0,105,0_right_lord_going_know,"[right, lord, going, know, people, thats, oh, ...","[verse, pray, israelites, judah, shalom, bible...","[israel, high, lord oh lord, god, shall, verse...","[McCurran boys! Father in the sky, I shed tea..."
2,1,78,1_im_like_know_yall,"[im, like, know, yall, yeah, got, police, shit...","[officer, police, cop, officers, cops, attorne...","[like, yall, police, shit, car, niggas, home, ...",[I told y'all that when I get hot and I get to...
3,2,74,2_black_white_women_men,"[black, white, women, men, people, dont, man, ...","[black community, white supremacy, black men, ...","[black, man, black men, mean, black women, wan...",[hey yo what up youtube y'all know who it is. ...
4,3,61,3_africa_african_chinese_thank,"[africa, african, chinese, thank, people, know...","[african countries, africans, africa, african,...","[africa, african, chinese, continent, countrie...",[Hello. Welcome to Connecting the Dots on the...
5,4,22,4_brothers sisters_sisters_vision_brothers,"[brothers sisters, sisters, vision, brothers, ...","[visions, darkness, sister karen, prayer, days...","[brothers sisters, spirit, clay, said, amen, k...","[Hi guys, this is your sister Karen Gidden in ..."
6,5,14,5_know_like_kids_son,"[know, like, kids, son, dont, family, dad, wan...","[new age family, age family, mothers, family, ...","[like, kids, family, say, father, mother, age,...","[What's happening, fam? LA y'all movement sti..."
7,6,14,6_women_want_youre_woman,"[women, want, youre, woman, shes, traditional,...","[inner beauty movement, want women, modern wom...","[women, shes, traditional, married, feminine, ...","[Hello, I'm Nicole Michelle, founder and femin..."


## all-mpnet-base-v2

In [13]:
# Function to train BERTopic with specific configurations
def train_bertopic_for_ideology(texts, ideology):
    """Fit BERTopic on one ideology's transcripts using the all-mpnet-base-v2 embeddings.

    Redefines the function of the same name from the earlier cells. The setup
    matches the previous "fine-tuning" configuration (custom vectorizer,
    c-TF-IDF, "KeyBERT" and "MMR" representations) and additionally passes the
    embedding model by name as ``embedding_model="all-mpnet-base-v2"``.

    Args:
        texts: DataFrame with 'transcripts' and 'channel.ideology' columns.
        ideology: Value of 'channel.ideology' whose rows are used as documents.

    Returns:
        Tuple ``(topic_model, topics, probs)``; ``topics`` and ``probs`` are
        the two values returned by ``topic_model.fit_transform``.
    """
    selected = texts['channel.ideology'] == ideology
    ideology_texts = texts.loc[selected, 'transcripts'].tolist()
    print(f"Training model for ideology: {ideology} with {len(ideology_texts)} texts.")

    topic_model = BERTopic(
        # English stop words removed; unigrams up to trigrams as candidate terms.
        vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(1, 3)),
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
        representation_model={
            "KeyBERT": KeyBERTInspired(),
            "MMR": MaximalMarginalRelevance(diversity=0.3),
        },
        embedding_model="all-mpnet-base-v2",
        # Hyperparameters
        top_n_words=10,
    )
    topics, probs = topic_model.fit_transform(ideology_texts)
    return topic_model, topics, probs

# Dictionaries to store results per ideology: {ideology: (model, topics, probs)}
models_before = {}
models_after = {}

for ideology in ideologies:
    try:
        print("Before COVID")
        model_before, topics_before, probs_before = train_bertopic_for_ideology(
            texts_before, ideology)
        models_before[ideology] = (model_before, topics_before, probs_before)

        print("After COVID")
        model_after, topics_after, probs_after = train_bertopic_for_ideology(
            texts_after, ideology)
        models_after[ideology] = (model_after, topics_after, probs_after)
    except Exception as e:
        # Best effort: report the failure and continue with the next ideology.
        # An ideology whose training failed has no (or only one) stored entry;
        # the comparison loop below skips it.
        print(f"Error occurred for ideology {ideology}: {e}")

# Analyze and compare topics
for ideology in ideologies:
    # Without this guard a training failure above would surface here as an
    # unrelated KeyError instead of the real error that was already printed.
    if ideology not in models_before or ideology not in models_after:
        print(f"Skipping topic comparison for ideology {ideology}: training did not complete.")
        continue

    model_before, topics_before, _ = models_before[ideology]
    model_after, topics_after, _ = models_after[ideology]

    # Get topic representations
    topics_info_before = model_before.get_topic_info()
    topics_info_after = model_after.get_topic_info()

    # The printed count is the row count of get_topic_info(); in the outputs
    # recorded for this cell that table includes the Topic -1 row.
    print(f"\nTopic Representations Before COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_before)}")
    display(topics_info_before)

    print(f"\nTopic Representations After COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_after)}")
    display(topics_info_after)


Before COVID
Training model for ideology: BLACK with 454 texts.
After COVID
Training model for ideology: BLACK with 464 texts.

Topic Representations Before COVID for BLACK:

Number of Topic: 9


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,99,-1_like_don_black_just,"[like, don, black, just, people, going, know, ...","[relationship, episode, love, 90, family, vide...","[like, black, just, people, hair, want, time, ...",[hey guys so i wanted to come on here and talk...
1,0,68,0_police_know_don_just,"[police, know, don, just, going, people, like,...","[police, investigation, cops, killed, officer,...","[police, know, like, black, video, look, didn,...",[How did you know they were fleeing? Good det...
2,1,65,1_high_going_people_know,"[high, going, people, know, just, said, shall,...","[prophecy, prophecies, messiah, scriptures, ho...","[high, know, shall, spirit, lord, verse, messi...",[All esteem to the Most High Elohim. This is ...
3,2,50,2_africa_thank_yeah_ghana,"[africa, thank, yeah, ghana, like, know, come,...","[come africa, blacksit family, africans, afric...","[africa, ghana, come, nigeria, say, kenya, lov...",[yeah yeah we make home i make homemade ice cr...
4,3,44,3_black_folk_black folk_white,"[black, folk, black folk, white, people, right...","[civil rights, blacks, slavery, white supremac...","[black, black folk, rights, black people, coun...","[Hey, what's up guys? I'm Dr. Boyce Watkins ..."
5,4,37,4_bible_right_god_read,"[bible, right, god, read, verse, man, people, ...","[according bible, deuteronomy, israelites, bib...","[bible, read, verse, come, okay, know, white, ...",[He speaks smooth things to you. This is what...
6,5,33,5_yeah_shit_man_got,"[yeah, shit, man, got, movie, like, blackout, ...","[movie, monday monday monday, ha ha, doing shi...","[man, movie, like, blackout, monday monday, ai...",[yo hello all right we're here. what's going ...
7,6,31,6_bethel_congregation_bethelites_circuit,"[bethel, congregation, bethelites, circuit, el...","[bethelites, congregation, jehovah witnesses, ...","[congregation, bethelites, elders, jehovah, ov...",[It was really bad. That was the whole focus ...
8,7,27,7_women_men_know_like,"[women, men, know, like, chick, pill, black, w...","[documentary, black women, white women, black ...","[women, know, like, pill, porn, black men, jac...",[now available on paperback and kindle unlimit...



Topic Representations After COVID for BLACK:

Number of Topic: 10


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,113,-1_know_like_just_people,"[know, like, just, people, don, got, jehovah, ...","[witness, jehovah witnesses, witnesses, jehova...","[like, people, jehovah, black, family, say, ro...","[Damn, Money Man, let's go. They already took..."
1,0,122,0_lord_right_know_going,"[lord, right, know, going, people, high, oh, g...","[judah, israelites, israelite, jews, shalom, s...","[high, god, israel, said, just, lord oh lord, ...",[shalom call hello you. how about you? and i...
2,1,59,1_police_officers_know_going,"[police, officers, know, going, got, just, don...","[officer, police, cops, crime, officers, viole...","[police, officers, black, cops, officer, media...",[Welcome to Sergeant Dorsey Speaks. I'm a ret...
3,2,42,2_women_black_men_woman,"[women, black, men, woman, black women, white,...","[black women, white women, black men, black wo...","[woman, black women, white, black men, man, wh...",[It's time to talk about housewives and workin...
4,3,27,3_africa_african_france_countries,"[africa, african, france, countries, continent...","[african leaders, africa, africans, african co...","[africa, france, countries, continent, african...",[Permit me to annoy you. In many African nati...
5,4,25,4_home home_home home home_purchase_tracks today,"[home home, home home home, purchase, tracks t...","[black farmers, farmers, farmer, plots, donkey...","[home, today purchase tracks, donkey, land, ho...","[You don't need any more of those. Yes, depend..."
6,5,23,5_covid_know_black folk_masks,"[covid, know, black folk, masks, thank, folk, ...","[coronavirus, covid 19, covid, pandemic, quara...","[covid, know, black folk, masks, vaccine, coro...",[What you just heard was an interview by phone...
7,6,20,6_black_white_biden_black people,"[black, white, biden, black people, people, jo...","[black leaders, care black lives, black commun...","[white, biden, black people, black leaders, sa...",[Lie can go around the world three times befor...
8,7,17,7_yeah_bro_like_shit,"[yeah, bro, like, shit, nigga, got, car, know,...","[nigga, truck, talking, niggas, dmx, motherfuc...","[like, nigga, car, man, atlanta, son, dmx, tow...","[I got to talk about DMX. Hold on, let me twe..."
9,8,16,8_mean_man_yo_people,"[mean, man, yo, people, don, shit, ain, day, e...","[racial hatred, racism, man don, nigga, youtub...","[mean, yo, ain, black, black people, videos, l...",[hey yo what up youtube y'all know who it is. ...


## all-mpnet-base-v2 - SUMMARIZED

In [15]:
# Function to train BERTopic with specific configurations
def train_bertopic_for_ideology(texts, ideology):
    """Fit BERTopic on one ideology's *summaries* using the all-mpnet-base-v2 embeddings.

    Redefines the function of the same name from the earlier cells. The model
    configuration is the same as in the preceding all-mpnet-base-v2 cell; the
    difference is that documents are read from the 'summary' column instead of
    'transcripts'.

    Args:
        texts: DataFrame with 'summary' and 'channel.ideology' columns.
        ideology: Value of 'channel.ideology' whose rows are used as documents.

    Returns:
        Tuple ``(topic_model, topics, probs)``; ``topics`` and ``probs`` are
        the two values returned by ``topic_model.fit_transform``.
    """
    selected = texts['channel.ideology'] == ideology
    ideology_texts = texts.loc[selected, 'summary'].tolist()
    print(f"Training model for ideology: {ideology} with {len(ideology_texts)} texts.")

    topic_model = BERTopic(
        # English stop words removed; unigrams up to trigrams as candidate terms.
        vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(1, 3)),
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
        representation_model={
            "KeyBERT": KeyBERTInspired(),
            "MMR": MaximalMarginalRelevance(diversity=0.3),
        },
        embedding_model="all-mpnet-base-v2",
        # Hyperparameters
        top_n_words=10,
    )
    topics, probs = topic_model.fit_transform(ideology_texts)
    return topic_model, topics, probs

# Dictionaries to store results per ideology: {ideology: (model, topics, probs)}
models_before = {}
models_after = {}

for ideology in ideologies:
    try:
        print("Before COVID")
        model_before, topics_before, probs_before = train_bertopic_for_ideology(
            texts_before_summarized, ideology)
        models_before[ideology] = (model_before, topics_before, probs_before)

        print("After COVID")
        model_after, topics_after, probs_after = train_bertopic_for_ideology(
            texts_after_summarized, ideology)
        models_after[ideology] = (model_after, topics_after, probs_after)
    except Exception as e:
        # Best effort: report the failure and continue with the next ideology.
        # An ideology whose training failed has no (or only one) stored entry;
        # the comparison loop below skips it.
        print(f"Error occurred for ideology {ideology}: {e}")

# Analyze and compare topics
for ideology in ideologies:
    # Without this guard a training failure above would surface here as an
    # unrelated KeyError instead of the real error that was already printed.
    if ideology not in models_before or ideology not in models_after:
        print(f"Skipping topic comparison for ideology {ideology}: training did not complete.")
        continue

    model_before, topics_before, _ = models_before[ideology]
    model_after, topics_after, _ = models_after[ideology]

    # Get topic representations
    topics_info_before = model_before.get_topic_info()
    topics_info_after = model_after.get_topic_info()

    # The printed count is the row count of get_topic_info(); in the outputs
    # recorded for this cell that table includes the Topic -1 row.
    print(f"\nTopic Representations Before COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_before)}")
    display(topics_info_before)

    print(f"\nTopic Representations After COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_after)}")
    display(topics_info_after)


Before COVID
Training model for ideology: BLACK with 454 texts.
After COVID
Training model for ideology: BLACK with 464 texts.

Topic Representations Before COVID for BLACK:

Number of Topic: 4


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,36,-1_youtuber_michael_youtuber discusses_ashley,"[youtuber, michael, youtuber discusses, ashley...","[90 day fiancé, fiancé, day fiancé, couples, r...","[michael, youtuber discusses, ashley, reviewer...",[The YouTuber discusses the first half of the ...
1,0,176,0_black_speaker_people_black people,"[black, speaker, people, black people, white, ...","[black community, white supremacy, black men, ...","[black, speaker, people, black people, police,...","[The speaker is discussing the SYSBM movement,..."
2,1,170,1_speaker_god_people_high,"[speaker, god, people, high, bible, book, isra...","[bible, biblical, prophets, prophecy, israelit...","[people, bible, references, israelites, argue,...",[The speaker emphasizes the importance of know...
3,2,72,2_africa_video_ghana_viewers,"[africa, video, ghana, viewers, african, gambi...","[documentary, video, growth, africa web tv, ha...","[africa, ghana, viewers, african, gambia, visi...","[The video is a vlog by Juliet, a member of th..."



Topic Representations After COVID for BLACK:

Number of Topic: 5


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,74,-1_organization_jehovah_witnesses_jehovah witn...,"[organization, jehovah, witnesses, jehovah wit...","[encourages listeners, jehovah witnesses, jeho...","[organization, jehovah witnesses, watchtower, ...","[Hello everyone, welcome back to my channel. T..."
1,0,212,0_black_speaker_women_people,"[black, speaker, women, people, men, white, co...","[police, violence, racism, officer, black comm...","[speaker, police, black men, video, violence, ...",[A disturbing incident occurred in a Walmart p...
2,1,124,1_speaker_god_people_israelites,"[speaker, god, people, israelites, judah, bibl...","[israelites, israelite, gentiles, jews, big ju...","[israelites, judah, bible, israel, biblical, h...",[Dante Fortson is a YouTube creator who challe...
3,2,41,2_africa_african_countries_africans,"[africa, african, countries, africans, speaker...","[african leaders, african nations, africa, afr...","[africa, african, africans, gambia, developmen...",[African leaders have made statements that lef...
4,3,13,3_state_covid_covid 19_cross river,"[state, covid, covid 19, cross river, river st...","[covid 19, pandemic, covid 19 dr, quarantine, ...","[covid, covid 19, cross river state, vaccine, ...","[The Blacksit family has expanded, and the fat..."


## longformer

In [10]:
from transformers import AutoTokenizer

# Load the tokenizer published with the Longformer checkpoint; it is handed to
# the truncation helpers defined below.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

def truncate_sequences(sequences, max_length, tokenizer):
    """Clip each text to the tokenizer's ``max_length`` limit and return the texts.

    Every sequence is encoded with truncation enabled and the surviving token
    ids are decoded back into a string with special tokens stripped.

    Args:
        sequences: Iterable of strings to clip.
        max_length: Token limit forwarded to the tokenizer.
        tokenizer: Callable tokenizer that also provides ``decode``.

    Returns:
        A list of decoded strings, in the same order as ``sequences``.
    """
    def _clip(text):
        encoding = tokenizer(text, truncation=True, max_length=max_length, return_tensors='pt')
        return tokenizer.decode(encoding.input_ids[0], skip_special_tokens=True)

    return [_clip(text) for text in sequences]

def verify_truncation_logic(sequences, max_length, tokenizer, preview_chars=200):
    """Print a before/after token-count report for every sequence.

    For each sequence the token count of the raw text is compared with the
    token count of the text returned by ``truncate_sequences``, and the
    result of ``truncated_length <= max_length`` is printed.

    Args:
        sequences: Iterable of strings to check.
        max_length: Token limit passed to ``truncate_sequences``.
        tokenizer: Tokenizer used both for counting and for truncation.
        preview_chars: Maximum number of characters of each sequence to echo.
            Longer texts are cut and followed by a note with the number of
            omitted characters, so that a very long input does not flood the
            cell output. Pass ``None`` to print every sequence in full.

    Returns:
        None. The report is written to stdout only.
    """
    def _preview(text):
        # Short texts are printed unchanged; only over-long ones are abbreviated.
        if preview_chars is None or len(text) <= preview_chars:
            return text
        hidden = len(text) - preview_chars
        return f"{text[:preview_chars]}... [{hidden} more characters]"

    for seq in sequences:
        # Token count of the untouched text (no truncation requested here).
        original_length = len(tokenizer(seq, return_tensors='pt').input_ids[0])

        # Re-tokenize the decoded, truncated text to measure what a model would see.
        truncated_seq = truncate_sequences([seq], max_length, tokenizer)[0]
        truncated_length = len(tokenizer(truncated_seq, return_tensors='pt').input_ids[0])

        print(f"Original Length: {original_length}, Truncated Length: {truncated_length}")
        print(f"Original Sequence: {_preview(seq)}")
        print(f"Truncated Sequence: {_preview(truncated_seq)}")
        print(f"Truncated correctly: {truncated_length <= max_length}\n")

# Example sequences: two short texts around one sentence repeated 250 times,
# which the recorded run shows to be longer than the token limit.
sequences = [
    "This is a short sentence.",
    250 * "This is a longer sentence that will be tokenized and truncated if it exceeds the maximum length set for the tokenizer. ",  # artificially long sequence
    "Another short one.",
]

max_length = 4096
verify_truncation_logic(sequences, max_length=max_length, tokenizer=tokenizer)

Original Length: 8, Truncated Length: 8
Original Sequence: This is a short sentence.
Truncated Sequence: This is a short sentence.
Truncated correctly: True

Original Length: 6253, Truncated Length: 4096
Original Sequence: This is a longer sentence that will be tokenized and truncated if it exceeds the maximum length set for the tokenizer. This is a longer sentence that will be tokenized and truncated if it exceeds the maximum length set for the tokenizer. This is a longer sentence that will be tokenized and truncated if it exceeds the maximum length set for the tokenizer. This is a longer sentence that will be tokenized and truncated if it exceeds the maximum length set for the tokenizer. This is a longer sentence that will be tokenized and truncated if it exceeds the maximum length set for the tokenizer. This is a longer sentence that will be tokenized and truncated if it exceeds the maximum length set for the tokenizer. This is a longer sentence that will be tokenized and truncated 

In [11]:
from transformers import LongformerTokenizer, LongformerModel, AutoTokenizer
import torch
import numpy as np
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is presumably a debugging aid that makes
# CUDA kernel launches synchronous; it may have no effect if an earlier cell
# already initialised CUDA in this kernel — confirm it is still wanted.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Tokenizer for the Longformer checkpoint; truncate_sequences below reads this
# module-level name.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

def truncate_sequences(sequences, max_length, tokenizer=None):
    """Clip each text to the tokenizer's ``max_length`` limit and return the texts.

    Every sequence is encoded with truncation enabled and the surviving token
    ids are decoded back into a string with special tokens stripped.

    The optional ``tokenizer`` argument keeps this definition call-compatible
    with code that passes the tokenizer explicitly as a third argument, while
    two-argument calls keep using the notebook-level ``tokenizer``.

    Args:
        sequences: Iterable of strings to clip.
        max_length: Token limit forwarded to the tokenizer.
        tokenizer: Tokenizer to use. When omitted, the module-level
            ``tokenizer`` is looked up at call time.

    Returns:
        A list of decoded strings, in the same order as ``sequences``.

    Raises:
        NameError: If no tokenizer is passed and none is defined at module level.
    """
    if tokenizer is None:
        # Looked up at call time so a re-assigned notebook tokenizer is picked up.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined")

    truncated_sequences = []
    for seq in sequences:
        tokenized = tokenizer(seq, truncation=True, max_length=max_length, return_tensors='pt')
        truncated_sequences.append(tokenizer.decode(tokenized.input_ids[0], skip_special_tokens=True))
    return truncated_sequences

# Function to train BERTopic with specific configurations
def train_bertopic_for_ideology(texts, ideology, text_column="transcripts",
                                max_length=4096, random_state=42):
    """Fit a BERTopic model on the documents belonging to one ideology.

    Args:
        texts: DataFrame with a 'channel.ideology' column and a text column.
        ideology: Value of 'channel.ideology' whose rows are modelled.
        text_column: Name of the column holding the documents. Defaults to
            'transcripts'.
        max_length: Token limit passed to ``truncate_sequences`` before the
            documents are handed to BERTopic.
        random_state: Seed given to UMAP so repeated runs produce the same
            topics. Pass ``None`` for an unseeded run.

    Returns:
        Tuple ``(topic_model, topics, probs)`` where ``topics`` and ``probs``
        are the values returned by ``BERTopic.fit_transform``.

    Raises:
        ValueError: If no non-null document exists for ``ideology``.
    """
    selected = texts.loc[texts['channel.ideology'] == ideology, text_column]
    # Null entries cannot be tokenized; drop them instead of failing mid-run.
    ideology_texts = selected.dropna().tolist()
    if not ideology_texts:
        raise ValueError(
            f"No documents found for ideology {ideology!r} in column {text_column!r}."
        )
    ideology_texts = truncate_sequences(ideology_texts, max_length)
    print(f"Training model for ideology: {ideology} with {len(ideology_texts)} texts.")

    vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    representation_model = {
        "KeyBERT": KeyBERTInspired(),
        "MMR": MaximalMarginalRelevance(diversity=0.3),
    }
    # Same UMAP settings BERTopic builds when no umap_model is given, plus an
    # explicit seed so that the topic assignment is reproducible.
    umap_model = umap.UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        low_memory=False,
        random_state=random_state,
    )
    topic_model = BERTopic(
        # Add verbose=True here to log BERTopic's progress.
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        umap_model=umap_model,
        # NOTE(review): passing the checkpoint name appears to reload the
        # embedding model on every call (the recorded run logs the
        # sentence-transformers message once per fit) — consider sharing one.
        embedding_model="allenai/longformer-base-4096",
        # Hyperparameters
        top_n_words=10,
    )
    topics, probs = topic_model.fit_transform(ideology_texts)
    return topic_model, topics, probs

# Per-ideology results, keyed by ideology: (model, topics, probs) for each period.
models_before, models_after = {}, {}

for ideology in ideologies:
    # A failure in either fit is reported and the rest of this ideology is skipped.
    try:
        print("Before COVID")
        models_before[ideology] = train_bertopic_for_ideology(texts_before, ideology)
        model_before, topics_before, probs_before = models_before[ideology]

        print("After COVID")
        models_after[ideology] = train_bertopic_for_ideology(texts_after, ideology)
        model_after, topics_after, probs_after = models_after[ideology]
    except Exception as exc:
        print(f"Error occurred for ideology {ideology}: {exc}")

# Analyze and compare topics
for ideology in ideologies:
    # Training errors are caught and only printed, so an ideology may be absent
    # from either dictionary; skip it instead of failing with a KeyError.
    if ideology not in models_before or ideology not in models_after:
        print(f"\nSkipping {ideology}: no trained model available for both periods.")
        continue

    model_before, topics_before, _ = models_before[ideology]
    model_after, topics_after, _ = models_after[ideology]

    # Get topic representations (one row per topic)
    topics_info_before = model_before.get_topic_info()
    topics_info_after = model_after.get_topic_info()

    print(f"\nTopic Representations Before COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_before)}")
    display(topics_info_before)

    print(f"\nTopic Representations After COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_after)}")
    display(topics_info_after)


Before COVID


No sentence-transformers model found with name allenai/longformer-base-4096. Creating a new one with mean pooling.


Training model for ideology: BLACK with 454 texts.


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


After COVID
Training model for ideology: BLACK with 464 texts.


No sentence-transformers model found with name allenai/longformer-base-4096. Creating a new one with mean pooling.



Topic Representations Before COVID for BLACK:

Number of Topic: 2


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,0,426,0_know_people_going_like,"[know, people, going, like, don, just, right, ...","[black people, everybody, believe, trying, say...","[know, people, just, got, come, think, okay, r...",[This is your brother Malcolm coming at you wi...
1,1,28,1_jesus jesus_thank_conference_jesus jesus jesus,"[jesus jesus, thank, conference, jesus jesus j...","[miami international airport, embassy suites h...","[jesus jesus, jesus jesus jesus, jesus, krispy...","[Hello, family. Thank you for your continued ..."



Topic Representations After COVID for BLACK:

Number of Topic: 2


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,0,417,0_know_people_going_like,"[know, people, going, like, don, just, black, ...","[black people, everybody, don want, believe, t...","[know, people, just, come, think, okay, oh, ye...","[Good rising, brethren. This is Big Judah, co..."
1,1,47,1_thank_chardian_pearson_light,"[thank, chardian, pearson, light, carlton, car...","[informative educational videos, suggestions n...","[carlton, carlton pearson, triggers, steal awa...","[Hello, I'm Carlton Pearson. Listen to this w..."
