In [1]:
import os
import re
import regex
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
import umap
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline, AutoTokenizer

%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=4

pd.set_option('display.max_colwidth', None)


# Load a pre-trained sentence transformer model
# model = SentenceTransformer("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)
model = SentenceTransformer("all-mpnet-base-v2")

model_llm = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_llm)
device = 0 if torch.cuda.is_available() else -1

generator = pipeline(
    "text-generation",
    model=model_llm,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device,
    do_sample=False,  # No sampling
    temperature=None,  # Not needed when do_sample=False
    top_p=None,        # Not needed when do_sample=False
    pad_token_id=tokenizer.eos_token_id  # Ensure proper padding
)

  from tqdm.autonotebook import tqdm, trange


env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=4




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Set the folder containing the files
folder_path = "../Llama3_1_Model_2/topicGPT/data/output/FINAL"

# List of unique ideologies
ideologies = ["ANTI_SJW", "ANTI_THEIST", "BLACK", "CONSPIRACY", "LGBT", "LIBERTARIAN", 
              "MRA", "PARTISAN_RIGHT", "PARTISAN_LEFT", "QANON", "RELIGIOUS_CONSERVATIVE", 
              "SOCIAL_JUSTICE", "SOCIALIST", "WHITE_IDENTITARIAN"]
# ideologies = ["ANTI_SJW"]

def remove_parenthesized_phrases(text):
    # Use regex to find and remove all substrings that start with '(' and end with ')'
    cleaned_text = re.sub(r'\s*\(.*?\)', '', text)
    return cleaned_text.strip()

def doc_label(df, topics_list):
    """
    Add labels to each document based on the topics generated for it.
    - df: dataframe of documents
    - topics_list: list of topics
    """
    pattern = regex.compile("^\[(\d+)\] ([\w\s]+):(.+)")
    all_topics = []
    for line in df["responses"].tolist():
        if type(line) == str:
            line = line.split("\n")
            line_topics = []
            for topic in line:
                if regex.match(pattern, topic):
                    groups = regex.match(pattern, topic).groups()
                    lvl, name, desc = int(groups[0]), groups[1], groups[2]
                    if f"[{lvl}] {name}" in topics_list:
                        line_topics.append(remove_parenthesized_phrases(desc))
                        # line_topics.append(topic)
            line_topics = list(set(line_topics))
            if len(line_topics) > 0:
                all_topics.append(line_topics)
            else:
                all_topics.append(["None"])
        else:
            all_topics.append(["None"])
    return all_topics

# Function to extract main topic and its subtopics dynamically based on a specified topic name
def extract_topic_data_per_ideology_period(file_path, main_topic="Health"):
    try:
        df = pd.read_json(str(file_path), lines=True)
        topics_list = [f"[1] {main_topic}"]
        df["topics"] = doc_label(df, topics_list)
        # Excluding rows with more than one unique topic//"None" ----
        df["num_topics"] = df["topics"].apply(lambda x: len(set(x)))
        df = df[df["topics"].apply(lambda x: x != ["None"])].reset_index(drop=True)

        all_embeddings = []
        concatinated_topics = ""
    
        for index, row in df.iterrows():
            # Step 1: Concatenate all sentences in the 'topics' column for this row
            concatenated_text = ' '.join(row['topics'])  # Join all sentences in the list
            # print("concatenated_text", concatenated_text)
            # Step 2: Generate embedding for the concatenated text
            embedding = model.encode(concatenated_text, convert_to_tensor=True)  # This will be on the GPU (cuda:0)
            all_embeddings.append(embedding.cpu())  # Move to CPU immediately and append to list
            concatinated_topics = concatinated_topics + " - " + concatenated_text
        
        # Step 3: Stack all embeddings and compute the average embedding
        stacked_embeddings = torch.stack(all_embeddings)  # Now all embeddings are on the CPU
        average_embedding = torch.mean(stacked_embeddings, dim=0)  # Compute the mean across all rows

        return {
            "average_embedding": average_embedding,
            "concatinated_topics": concatinated_topics,
            "freq": len(df.index)
        }
    
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return {}

def generate_text(prompt, max_tokens):
    """
    Generate text using a local model with transformers.
    """
    messages = [
        {"role": "system", "content": "You are equipped with advanced analytical techniques. Your goal is to distill complex information from various topics and summarize the overall subject being discussed in 1 paragraph. Return only the results, without any additional comments."},
        {"role": "user", "content": f"Analyze the following list of topics:\n\n{prompt}"},
    ]
    
    outputs = generator(
        messages,
        max_new_tokens=max_tokens,
    )
    return outputs[0]["generated_text"][-1]["content"]

def generate_graph(main_topic):
    # DataFrame to hold extracted data
    data = []
    
    # Main topic to analyze (can be changed dynamically)
    main_topic = main_topic
    
    # Iterate over unique ideologies and time phases
    for ideology in ideologies:
        for time_phase in ["before", "after"]:  # Check both after and before files
            
            # Construct markdown filename
            filename = f"assignment_{time_phase}_{ideology}_FULL.jsonl"
            file_path = os.path.join(folder_path, filename)    

            topic_data = extract_topic_data_per_ideology_period(file_path, main_topic)

            summarized = generate_text(topic_data["concatinated_topics"], 384)

            print("TOPIC: ", main_topic)
            print("IDEOLOGY: ", ideology)
            print("TIME PHASE: ", time_phase)
            print("SUMMARIZED: ", summarized)
            print("\n--------------------------------------------------------------------\n")
            # print(topic_data["concatinated_topics"])

            summarized_embedding = model.encode(summarized, convert_to_tensor=True)  # This will be on the GPU (cuda:0)

            
            # Append the data
            data.append({
                "ideology": ideology,
                "period": time_phase,
                # "embedding": topic_data["average_embedding"].cpu().numpy(),
                "embedding": summarized_embedding.cpu().numpy(),
            })
    
    # Convert data into a DataFrame
    df = pd.DataFrame(data)
    
    # Extract embeddings and labels
    X = np.array(df['embedding'].tolist())  # Embeddings
    y_ideology = df['ideology']  # Ideology
    y_period = df['period']  # Period (before/after)
    
    # Label encode the ideologies for visualization
    le_ideology = LabelEncoder()
    y_ideology_encoded = le_ideology.fit_transform(y_ideology)
    
    # Apply UMAP to project the embeddings into a lower-dimensional space (2D)
    umap_model = umap.UMAP(n_components=2, random_state=42)
    X_umap = umap_model.fit_transform(X)
    
    # Plot the UMAP projection to visualize ideologies with labels
    plt.figure(figsize=(12, 8))
    colors = {'before': 'blue', 'after': 'orange'}
    markers = {'before': 'o', 'after': 's'}
    
    # To track whether we have added a label for each period
    label_added = {'before': False, 'after': False}
    
    for i, (x, y, ideology, period) in enumerate(zip(X_umap[:, 0], X_umap[:, 1], df['ideology'], df['period'])):
        # Add label for the legend only once per period
        if not label_added[period]:
            plt.scatter(x, y, color=colors[period], marker=markers[period], label=f'Period: {period}', alpha=0.7)
            label_added[period] = True
        else:
            plt.scatter(x, y, color=colors[period], marker=markers[period], alpha=0.7)
        
        # Annotate the points with ideology
        plt.annotate(ideology, (x, y), fontsize=9, alpha=0.8)
    
    # Set up the plot details
    plt.title("UMAP Projection of Ideologies (Before and After COVID)")
    plt.xlabel("UMAP Component 1")
    plt.ylabel("UMAP Component 2")
    plt.legend(loc='best')  # Legend showing both periods
    plt.grid(True)
    plt.show()
    
    # # Analyze cosine similarity between ideologies before and after COVID
    # similarities = {}
    # for ideology in le_ideology.classes_:
    #     before_emb = X_umap[(df['ideology'] == ideology) & (df['period'] == 'before')]
    #     after_emb = X_umap[(df['ideology'] == ideology) & (df['period'] == 'after')]
        
    #     if before_emb.shape[0] > 0 and after_emb.shape[0] > 0:
    #         # Calculate cosine similarity between embeddings before and after COVID for each ideology
    #         sim = cosine_similarity(before_emb.mean(axis=0).reshape(1, -1), 
    #                                 after_emb.mean(axis=0).reshape(1, -1))[0][0]
    #         similarities[ideology] = sim
    
    # # Output the cosine similarity results
    # print(f"Cosine Similarity between Ideologies Before and After COVID ({main_topic} only):")
    # for ideology, similarity in similarities.items():
    #     print(f"{ideology}: {similarity:.4f}")
    
    # Analyze cosine similarity between ideologies before and after COVID
    similarities = {}
    for ideology in le_ideology.classes_:
        before_emb = X[(df['ideology'] == ideology) & (df['period'] == 'before')]
        after_emb = X[(df['ideology'] == ideology) & (df['period'] == 'after')]
    
        if before_emb.shape[0] > 0 and after_emb.shape[0] > 0:
            # Calculate cosine similarity between embeddings before and after COVID for each ideology
            sim = cosine_similarity(before_emb.mean(axis=0).reshape(1, -1), 
                                    after_emb.mean(axis=0).reshape(1, -1))[0][0]
            similarities[ideology] = sim
    
    # Output the cosine similarity results
    print(f"Cosine Similarity between Ideologies Before and After COVID ({main_topic} only):")
    for ideology, similarity in similarities.items():
        print(f"{ideology}: {similarity:.4f}")

In [3]:
topics = [
    "Politics", 
    "Government", 
    "Community", 
    "Human Rights", 
    "Identity", 
    "Social Justice", 
    "Culture", 
    "Human Behavior", 
    "Education", 
    "Relationships", 
    "Personal Growth", 
    "Society",  
    "Health",
    "Economy",
    "Law Enforcement",
    "Social Commentary",
    "Media",
    "Faith",
    "Leadership",
    "History"
]

for t in topics:
    generate_graph(t)

  df = pd.read_json(str(file_path), lines=True)


ValueError: Expected object or value