In [None]:
#!pip install bitsandbytes

## Libraries to Import

In [17]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
import matplotlib.pyplot as plt

## Clean The Dataset

## Step 1: Load Dataset

In [2]:
# Load the dataset
file_path = "/Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/drone_comments.csv"

# Try loading the dataset with encoding handling
try:
    df = pd.read_csv(file_path, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding="ISO-8859-1")  # Fallback encoding

# Ensure 'commentBody' column exists
if 'commentBody' not in df.columns:
    raise ValueError("The dataset must contain a 'commentBody' column.")

# Handle missing values (replace NaNs with empty string)
df['commentBody'] = df['commentBody'].fillna("")

# Convert all comments to string format
df['commentBody'] = df['commentBody'].astype(str)

# Function to clean comments and remove unwanted leading characters
def clean_comment(text):
    """
    Cleans a comment by removing @ mentions, URLs, extra spaces, and unwanted leading characters.
    """
    text = re.sub(r"@\w+", "", text)  # Remove @ mentions
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^A-Za-z0-9.,!?()'\s]", "", text)  # Remove special characters except punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces

    # Remove leading 'M' or '.' if present
    text = re.sub(r"^[M.]\s*", "", text)  # Removes leading M or . followed by a space

    return text

# Apply cleaning function to commentBody
df['commentBody'] = df['commentBody'].apply(clean_comment)

# Save cleaned dataset to Excel
updated_cleaned_file_path = "/Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/updated_cleaned_drone_comments.xlsx"
df.to_excel(updated_cleaned_file_path, index=False)

print(f"Cleaned dataset saved to: {updated_cleaned_file_path}")




Cleaned dataset saved to: /Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/updated_cleaned_drone_comments.xlsx


## Step 2: Load the Model and Tokenizer

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "AmanPriyanshu/Dynamic-Topic-Modeling-Llama-3.2-1B-bnb-4bit", 
    use_fast=False  # Prevents errors with some tokenizers
)

# Load model in 4-bit precision
model = AutoModelForCausalLM.from_pretrained(
    "AmanPriyanshu/Dynamic-Topic-Modeling-Llama-3.2-1B-bnb-4bit",
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"  # Auto-detect CPU or GPU
)

# Set model to evaluation mode
model.eval()

# Verify Model Loaded Successfully
print("Model successfully loaded!")



Model successfully loaded!


## Step 4: Define a Function for Topic Generation

In [7]:
def generate_topic(comment, max_new_tokens=50, temperature=0.7, top_p=0.9, do_sample=True):
    """
    Generates a topic for a given comment using the Llama model.
    """
    # Prepare input prompt
    prompt = f"Generate a topic for the following comment:\n\n{comment}\n\nTopic:"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs['input_ids'],
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample,  # Explicitly enabling sampling
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode and extract the topic
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    topic = generated_text.split("Topic:")[-1].strip()
    
    return topic


## Step 5: Apply Topic Modeling to the Dataset

In [9]:
# Apply the function to each comment
df['Generated_Topic'] = df['commentBody'].apply(lambda x: generate_topic(str(x)))

# Display the first few rows with topics
display(df.head())


Unnamed: 0,web_url,title,status,commentSequence,userID,userDisplayName,userLocation,userTitle,userURL,picURL,...,editorsSelection,parentID,parentUserDisplayName,depth,commentType,trusted,recommendedFlag,permID,isAnonymous,Generated_Topic
0,https://www.nytimes.com/2024/12/14/nyregion/dr...,Weeks of Drone Sightings Leave New Jersey on Edge,approved,138304127,104428496,Michael Shea,"Portland, Oregon",,,,...,False,,,1,comment,0,0,138304127,False,What is suspicious is the abject lack of infor...
1,https://www.nytimes.com/2024/12/14/nyregion/dr...,Weeks of Drone Sightings Leave New Jersey on Edge,approved,138304775,60459533,Brooklyncowgirl,Down In the Pines Of Jersey,,,,...,False,138304127.0,Michael Shea,2,userReply,0,0,138304775,False,Why the government is so corrupt?\nWhy the gov...
2,https://www.nytimes.com/2024/12/14/nyregion/dr...,Weeks of Drone Sightings Leave New Jersey on Edge,approved,138305184,76287895,Blackcat66,NJ,,,,...,False,138304127.0,Michael Shea,2,userReply,0,0,138305184,False,"Congress\nTags: Congress, FBI\nComments: 8, th..."
3,https://www.nytimes.com/2024/12/14/nyregion/dr...,Weeks of Drone Sightings Leave New Jersey on Edge,approved,138305174,64672229,David,costa mesa,,,,...,False,138304127.0,Michael Shea,2,userReply,0,0,138305174,False,What are the risks of the 2nd amendment?\nWhat...
4,https://www.nytimes.com/2024/12/14/nyregion/dr...,Weeks of Drone Sightings Leave New Jersey on Edge,approved,138307998,62805527,John Shelton,Ahwahnee CA,,,,...,False,138304127.0,Michael Shea,2,userReply,0,0,138307998,False,General Discussion\nRe: General Discussion\nPo...


## Step 6: Save the Results

In [11]:
df.to_csv("drone_comments_with_topics.csv", index=False)
print("File saved: drone_comments_with_topics.csv")


File saved: drone_comments_with_topics.csv


## Step 7: Clean And Simplify Topics

In [14]:
# Load dataset
topics_file_path = '/Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/drone_comments_with_topics.csv'
df_topics = pd.read_csv(topics_file_path)

# Function to clean and standardize topics
def clean_topic(topic):
    """
    Cleans and standardizes generated topics by removing repetitive text,
    extra spaces, and normalizing common themes.
    """
    topic = str(topic).strip()

    # Remove unnecessary prefixes like "Topic:"
    topic = re.sub(r"^Topic[:\s]*", "", topic, flags=re.IGNORECASE)

    # Remove excessive "Re: Re: Re:" chains
    topic = re.sub(r"(Re:\s*)+", "", topic, flags=re.IGNORECASE)

    # Normalize common topics
    topic_map = {
        "General": "General Discussion",
        "Government": "Government & Policy",
        "UFO": "Unidentified Aerial Phenomena",
        "Surveillance": "Drone Surveillance",
        "Security": "Drone Security Concerns",
        "Military": "Military Drones",
        "Privacy": "Privacy Issues",
    }

    # Standardize topics if they match common ones
    topic = topic_map.get(topic, topic)

    # Remove single-character or meaningless topics
    if len(topic) < 3:
        topic = "Miscellaneous"

    return topic

# Apply cleaning function
df_topics['Simplified_Topic'] = df_topics['Generated_Topic'].apply(clean_topic)

# Save cleaned topics dataset
cleaned_topics_file = '/Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/cleaned_drone_topics.csv'
df_topics.to_csv(cleaned_topics_file, index=False)

print(f"Cleaned topics dataset saved at: {cleaned_topics_file}")


Cleaned topics dataset saved at: /Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/cleaned_drone_topics.csv


## Step 8: Generalize Topics Even More

In [24]:
# Load the dataset
cleaned_topics_file_path = '/Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/cleaned_drone_topics.csv'
df_cleaned_topics = pd.read_csv(cleaned_topics_file_path)

# Function to clean and standardize topics
def better_clean_topic(topic):
    """
    Cleans and categorizes topics into meaningful groups.
    """
    topic = str(topic).strip().lower()  # Convert to lowercase
    
    # Remove unnecessary words, special characters, and numbers
    topic = re.sub(r"[^a-zA-Z\s]", "", topic)  # Keep only letters and spaces
    topic = re.sub(r"\b(comment|topic|the|what is|how to|how can)\b", "", topic)  # Remove filler words
    topic = re.sub(r"\s+", " ", topic).strip()  # Remove extra spaces
    
    # Mapping of general topic categories
    topic_map = {
        "government": "Government & Policy",
        "politics": "Government & Policy",
        "military": "Military Drones",
        "military drones": "Military Drones",
        "ufo": "Unidentified Aerial Phenomena",
        "aliens": "Unidentified Aerial Phenomena",
        "surveillance": "Drone Surveillance",
        "security": "Drone Security Concerns",
        "privacy": "Privacy Issues",
        "general discussion": "General Discussion",
        "drones": "Drones & Technology",
        "drone": "Drones & Technology",
        "aviation": "General Aviation",
        "air traffic": "General Aviation",
        "safety": "Drone Safety",
        "law": "Drone Laws & Regulations",
        "regulations": "Drone Laws & Regulations",
        "camera": "Drone Photography",
        "photography": "Drone Photography",
    }

    # Assign standardized topic if it matches
    topic = topic_map.get(topic, topic)

    # If topic is still generic, categorize as Miscellaneous
    if len(topic) < 3:
        topic = "Miscellaneous"

    return topic

# Apply improved cleaning function
df_cleaned_topics['Better_Simplified_Topic'] = df_cleaned_topics['Simplified_Topic'].apply(better_clean_topic)

# Save new cleaned dataset
final_cleaned_topics_file = "/Users/student/Documents/Spring Babson 2025/Advanced Programming/Week 4/final_cleaned_drone_topics.csv"
df_cleaned_topics.to_csv(final_cleaned_topics_file, index=False)

# Display cleaned topic distribution
df_cleaned_topics['Better_Simplified_Topic'].value_counts().head(20)


Better_Simplified_Topic
nan                                                           149
Miscellaneous                                                  75
miscellaneous                                                  55
Drones & Technology                                            36
General Discussion                                             25
Government & Policy                                            15
government policy                                              10
unidentified aerial phenomena                                   8
general discussion pages                                        5
in out in out in out in out in out in out in out in out in      4
discussion                                                      4
news                                                            4
ufos                                                            3
Military Drones                                                 3
fear                                                