In [1]:
%%capture
!pip install bertopic accelerate bitsandbytes xformers adjustText

import json
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import umap
import hdbscan
import nltk
nltk.download('punkt')

pd.set_option("display.max_columns", None)

%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=3

In [2]:
# Load JSON files
with open('summarized_BEFORE_COVID.json', 'r') as file:
    before_covid_data = json.load(file)

with open('summarized_AFTER_COVID.json', 'r') as file:
    after_covid_data = json.load(file)

# Convert to DataFrame
before_covid_df = pd.json_normalize(before_covid_data)
after_covid_df = pd.json_normalize(after_covid_data)

print(before_covid_df.shape)
print(after_covid_df.shape)

# Extract necessary columns
texts_before = before_covid_df[['summary', 'channel.ideology']]
texts_after = after_covid_df[['summary', 'channel.ideology']]

# Get unique ideologies
ideologies = texts_before['channel.ideology'].unique()

(5714, 22)
(5726, 22)


In [3]:
texts_before = texts_before[texts_before['channel.ideology'] == "BLACK"]
texts_after = texts_after[texts_after['channel.ideology'] == "BLACK"]

# List of ideologies
ideologies = ["BLACK"]

In [4]:
# import re
# from nltk.tokenize import sent_tokenize

# def clean_text(text):
#      # Remove non-ascii characters
#     text = text.encode("ascii", "ignore").decode()

#     # Remove newlines and extra spaces
#     text = text.replace('\n', ' ').replace('\r', ' ').strip()

#     # Tokenize the text into sentences
#     sentences = sent_tokenize(text)

#     # Define unnecessary punctuation to remove
#     unnecessary_punctuation = r'[“”\'`~]'
    
#     # Remove unnecessary punctuation and special characters
#     cleaned_sentences = [re.sub(unnecessary_punctuation, '', sentence) for sentence in sentences]

#     # Optionally convert text to lowercase (comment this line if you want to keep the original case)
#     # cleaned_sentences = [sentence.lower() for sentence in cleaned_sentences]

#     # Join the cleaned sentences back into a single string
#     cleaned_text = ' '.join(cleaned_sentences)

#     return cleaned_text

# # Assuming texts_before and texts_after are pandas DataFrames
# texts_before['transcripts'] = texts_before['transcripts'].apply(clean_text)
# texts_after['transcripts'] = texts_after['transcripts'].apply(clean_text)

In [5]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Function to calculate average token size
def get_average_token_size(texts, tokenizer):
    total_tokens = 0
    total_texts = len(texts)
    
    for text in texts:
        tokens = tokenizer.encode(text, add_special_tokens=True)
        total_tokens += len(tokens)
    
    average_size = total_tokens / total_texts if total_texts > 0 else 0
    return average_size

# Calculate average token size for texts_before and texts_after
average_token_size_before = get_average_token_size(texts_before['summary'], tokenizer)
average_token_size_after = get_average_token_size(texts_after['summary'], tokenizer)

print(f'Average token size for texts_before: {average_token_size_before}')
print(f'Average token size for texts_after: {average_token_size_after}')

# Function to calculate max token size
def get_max_token_size(texts, tokenizer):
    max_size = 0
    for text in texts:
        tokens = tokenizer.encode(text, add_special_tokens=True)
        max_size = max(max_size, len(tokens))
    return max_size

# Calculate max token size for texts_before and texts_after
max_token_size_before = get_max_token_size(texts_before['summary'], tokenizer)
max_token_size_after = get_max_token_size(texts_after['summary'], tokenizer)

print(f'Max token size for texts_before: {max_token_size_before}')
print(f'Max token size for texts_after: {max_token_size_after}')

Average token size for texts_before: 231.44273127753303
Average token size for texts_after: 228.0625
Max token size for texts_before: 452
Max token size for texts_after: 435


In [6]:
# texts_before['transcripts'][0]

In [7]:
from huggingface_hub import notebook_login
notebook_login()
# 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from torch import cuda

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

print(device)

cuda:0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
from torch import bfloat16
import transformers

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

In [10]:
# Initialize Llama2 Model
model_id_llama2 = 'meta-llama/Llama-2-7b-chat-hf'
tokenizer_llama2 = transformers.AutoTokenizer.from_pretrained(model_id_llama2)
model_llama2 = transformers.AutoModelForCausalLM.from_pretrained(
    model_id_llama2,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model_llama2.eval()

# Initialize Llama3.1 Model
model_id_llama3 = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
tokenizer_llama3 = transformers.AutoTokenizer.from_pretrained(model_id_llama3)
model_llama3 = transformers.AutoModelForCausalLM.from_pretrained(
    model_id_llama3,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model_llama3.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

In [11]:
# Text generator pipeline for Llama2
generator_llama2 = transformers.pipeline(
    model=model_llama2, tokenizer=tokenizer_llama2,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

# Text generator pipeline for Llama3.1
generator_llama3 = transformers.pipeline(
    model=model_llama3, tokenizer=tokenizer_llama3,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

In [12]:
# System prompt describes information given to all conversations
system_prompt2 = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""
# Example prompt demonstrating the output we are looking for
example_prompt2 = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""

# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt2 = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""

prompt2 = system_prompt2 + example_prompt2 + main_prompt2

In [13]:
system_prompt3 = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful, respectful, and honest assistant for labeling topics. Your task is to generate a concise label for each topic provided. Please return only the label, consisting of one word or a short phrase, and nothing more.
"""

# Example User Prompt
example_prompt3 = """
<|eot_id|><|start_header_id|>user<|end_header_id|>
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Environmental impacts of eating meat
"""

# Main Prompt for Labeling
main_prompt3 = """
<|eot_id|><|start_header_id|>user<|end_header_id|>
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt3 = system_prompt3 + example_prompt3 + main_prompt3

In [14]:
# KeyBERT
keybert = KeyBERTInspired()

# MMR
mmr = MaximalMarginalRelevance(diversity=0.3)

# Text generation with Llama2
llama2 = TextGeneration(generator_llama2, prompt=prompt2)

# Text generation with Llama3.1
llama3 = TextGeneration(generator_llama3, prompt=prompt3)

# All representation models
representation_model = {
    "KeyBERT": keybert,
    "Llama2": llama2,
    "Llama3": llama3,
    "MMR": mmr,
}

embedding_model = SentenceTransformer("all-mpnet-base-v2")
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [15]:
# Example text data for testing
test_text = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
"""

test_prompt2 = system_prompt2 + example_prompt2 + main_prompt2.replace("[DOCUMENTS]", test_text).replace("[KEYWORDS]", "meat, beef, eat, eating, emissions, steak, food, health, processed, chicken")
test_prompt3 = system_prompt3 + example_prompt3 + main_prompt3.replace("[DOCUMENTS]", test_text).replace("[KEYWORDS]", "meat, beef, eat, eating, emissions, steak, food, health, processed, chicken")

try:
    result_2 = generator_llama2(test_prompt2)
    print(result_2)
    result_3 = generator_llama3(test_prompt3)
    print(result_3)
except RuntimeError as e:
    print(f"Error during text generation: {e}")

[{'generated_text': "\n<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant for labeling topics.\n<</SYS>>\n\nI have a topic that contains the following documents:\n- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.\n- Meat, but especially beef, is the word food in terms of emissions.\n- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.\n\nThe topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.\n\nBased on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.\n\n[/INST] Environmental impacts of eating meat\n\n[INST]\nI have a topic that contains the following documents:\n\nI have a topic that contains the following documents:\n- Trad

In [16]:
def precalculate_embeddings(texts, embedding_model):
    embeddings_dict = {}
    ideologies = texts['channel.ideology'].unique()
    for ideology in ideologies:
        ideology_texts = texts[texts['channel.ideology'] == ideology]['summary'].tolist()
        embeddings = embedding_model.encode(ideology_texts, show_progress_bar=True)
        embeddings_dict[ideology] = embeddings
    return embeddings_dict

# Pre-calculate embeddings for before and after COVID
embeddings_before = precalculate_embeddings(texts_before, embedding_model)
embeddings_after = precalculate_embeddings(texts_after, embedding_model)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [17]:
import numpy as np

def check_embeddings(embeddings):
    for ideology, emb in embeddings.items():
        if np.any(np.isnan(emb)) or np.any(np.isinf(emb)):
            print(f"Invalid embeddings detected for ideology: {ideology}")

check_embeddings(embeddings_before)
check_embeddings(embeddings_after)

In [18]:
# Function to train BERTopic with specific configurations
def train_bertopic_for_ideology(texts, ideology, embeddings, vectorizer_model=None, ctfidf_model=None, representation_model=None):
    ideology_texts = texts[texts['channel.ideology'] == ideology]['summary'].tolist()
    print(f"Training model for ideology: {ideology} with {len(ideology_texts)} texts.")
    topic_model = BERTopic(
        embedding_model=embedding_model, 
        vectorizer_model=vectorizer_model, 
        ctfidf_model=ctfidf_model, 
        representation_model=representation_model,
        # Hyperparameters
        top_n_words=10,
        verbose=True
    )
    topics, probs = topic_model.fit_transform(ideology_texts, embeddings[ideology])
    return topic_model, topics, probs

In [19]:
# # List of ideologies
# ideologies = texts_before['channel.ideology'].unique()

# Dictionary to store models and topics
models_before = {}
models_after = {}

for ideology in ideologies:
    try:
        print("Before COVID")
        model_before, topics_before, probs_before = train_bertopic_for_ideology(
            texts_before, ideology, embeddings_before, vectorizer_model=vectorizer_model, 
            ctfidf_model=ctfidf_model, representation_model=representation_model
        )
        models_before[ideology] = (model_before, topics_before, probs_before)
        
        print("After COVID")
        model_after, topics_after, probs_after = train_bertopic_for_ideology(
            texts_after, ideology, embeddings_after, vectorizer_model=vectorizer_model, 
            ctfidf_model=ctfidf_model, representation_model=representation_model
        )
        models_after[ideology] = (model_after, topics_after, probs_after)
    except Exception as e:
        print(f"Error occurred for ideology {ideology}: {e}")


Before COVID
Training model for ideology: BLACK with 454 texts.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.07it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.37it/s]


After COVID
Training model for ideology: BLACK with 464 texts.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:06<00:00,  1.06s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.33it/s]


In [20]:
# Analyze and compare topics
for ideology in ideologies:
    model_before, topics_before, _ = models_before[ideology]
    model_after, topics_after, _ = models_after[ideology]

    model_before
    
    # Get topic representations
    topics_info_before = model_before.get_topic_info()
    topics_info_after = model_after.get_topic_info()

    print(f"\nTopic Representations Before COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_before)}")
    display(topics_info_before)
    # display(model_before.visualize_hierarchy())

    print(f"\nTopic Representations After COVID for {ideology}:")
    print(f"\nNumber of Topic: {len(topics_info_after)}")
    display(topics_info_after)
    # display(model_after.visualize_hierarchy())



Topic Representations Before COVID for BLACK:

Number of Topic: 5


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Llama2,Llama3,MMR,Representative_Docs
0,-1,21,-1_youtuber_youtuber discusses_ashley_reviewer,"[youtuber, youtuber discusses, ashley, reviewe...","[90 day fiancé, fiancé, day fiancé, couples, r...","[90 Day Fiancé Review, , , , , , , , , ]","[Reality TV Show Analysis, , , , , , , , , ]","[youtuber discusses, ashley, 90, michael, 90 d...",[The YouTuber discusses the first half of the ...
1,0,190,0_black_speaker_people_black people,"[black, speaker, people, black people, white, ...","[black community, white supremacy, black men, ...","[Racial tensions and conflicts in America, , ,...","[Racism and Black Community Issues, , , , , , ...","[speaker, black people, police, community, inc...","[Hello, I'm Dr. Claude Anderson, and welcome b..."
2,1,148,1_speaker_god_people_high,"[speaker, god, people, high, bible, book, isra...","[bible, biblical, speaker discusses concept, p...","[Black Identity and Heritage in the Bible, , ,...","[Black Identity and Biblical Interpretation, ,...","[bible, emphasizes, references, israelites, ju...",[The speaker emphasizes the importance of know...
3,2,76,2_africa_video_ghana_african,"[africa, video, ghana, african, viewers, gambi...","[documentary, video, growth, africa web tv, ha...","[African Hair Journeys and Travel Adventures, ...","[African cultural exchange, , , , , , , , , ]","[africa, ghana, african, viewers, gambia, visi...","[The video is a vlog by Juliet, a member of th..."
4,3,19,3_organization_bethelites_bethel_jt,"[organization, bethelites, bethel, jt, jehovah...","[jehovah witnesses organization, bethelites fr...","[Jehovah's Witnesses and Financial Practices, ...","[Jehovah's Witness Critique, , , , , , , , , ]","[organization, bethel, jehovah witnesses, cong...",[The conversation begins with two Jehovah's Wi...



Topic Representations After COVID for BLACK:

Number of Topic: 6


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Llama2,Llama3,MMR,Representative_Docs
0,-1,102,-1_black_organization_people_jehovah,"[black, organization, people, jehovah, witness...","[jehovah witnesses, jehovah witness, witness, ...",[Jehovah's Witnesses and Controversial Practic...,"[Jehovah's Witnesses Criticism, , , , , , , , , ]","[black, organization, speaker, jehovah witness...","[Hello, I'm Lady C, and welcome to The Critica..."
1,0,120,0_speaker_god_people_israelites,"[speaker, god, people, israelites, judah, bibl...","[israelites, israelite, gentiles, jews, big ju...","[Race, Identity, and Religion in America, , , ...","[Black Hebrew Identity, , , , , , , , , ]","[israelites, judah, bible, israel, argues, heb...",[Dante Fortson is a YouTube creator who challe...
2,1,84,1_africa_african_africans_countries,"[africa, african, africans, countries, contine...","[africa, nigeria, african leaders, covid 19, a...","[African Innovation during the Pandemic, , , ,...","[African Innovation During Covid-19, , , , , ,...","[africa, african, africans, countries, contine...","[Hello everyone, welcome back to my channel. T..."
3,2,68,2_black_speaker_people_black people,"[black, speaker, people, black people, communi...","[black community, white supremacy, racism, bla...","[Black Empowerment and Radical Action, , , , ,...","[Black nationalism and vigilantism, , , , , , ...","[speaker, black people, nfac, snoop, black com...","[Hello, welcome to PowerNumbers.com. Today, I'..."
4,3,52,3_police_incident_officers_woman,"[police, incident, officers, woman, dorsey, of...","[police, officer, crime, arrest, officers, vio...","[Racial Violence and Injustice, , , , , , , , , ]","[Police brutality and racial injustice, , , , ...","[police, incident, shot, narrator, case, viole...",[A disturbing incident occurred in a Walmart p...
5,4,38,4_women_men_black_black men,"[women, men, black, black men, black women, sp...","[masculinity, new age family, marriage, black ...","[Gender dynamics within the Black community, ,...","[Black male-female relationships, , , , , , , ...","[black men, relationships, white women, family...",[The speaker discusses the controversy surroun...
