## Topic modeling using LDA
Instead of manually categorizing these episodes, we can also use Latent Dirichlet Allocation (LDA) to discover topics automatically.

## Load Data and Libraries

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used later in this notebook: the English
# stop-word list (read via nltk.corpus.stopwords) and WordNet (used by
# WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

In [11]:
df = pd.read_csv('podcast_episodes.csv')
# Combine the episode title and episode description into a single column.
# Missing values are replaced with '' first: concatenating NaN with a string
# yields NaN for the whole row, which would later break the string-based
# preprocessing (NaN has no .lower()).
df['combined_text'] = df['Title'].fillna('') + ' ' + df['Summary'].fillna('')
df.head()

Unnamed: 0,Title,Date,Category,Summary,Authors,combined_text
0,Infinite Tiling Presents a Modern Mathematical...,"OCTOBER 4, 2024",MATH,Today’s mathematicians grapple with higher-ord...,"KYNE SANTOS, RACHEL FELTMAN, FONDA MWANGI, MAD...",Infinite Tiling Presents a Modern Mathematical...
1,"Understanding Suzetrigine, a New Drug That Tre...","OCTOBER 2, 2024",MEDICINE,A new class of drugs treats pain at the periph...,"RACHEL FELTMAN, MARLA BROADFOOT, FONDA MWANGI","Understanding Suzetrigine, a New Drug That Tre..."
2,An Effort to Fight against the Spread of Misin...,"SEPTEMBER 30, 2024",TECHNOLOGY,Here’s how misinformation and distrust in scie...,"RACHEL FELTMAN, FONDA MWANGI, ANAISSA RUIZ TEJADA",An Effort to Fight against the Spread of Misin...
3,Is Math Part of Nature or an Invention of the ...,"SEPTEMBER 27, 2024",MATH,Mathematics communicator and drag queen Kyne S...,"KYNE SANTOS, RACHEL FELTMAN, FONDA MWANGI, MAD...",Is Math Part of Nature or an Invention of the ...
4,People with PCOS Face Increased Eating Disorde...,"SEPTEMBER 25, 2024",HEALTH CARE,A study reports higher prevalence of eating di...,"RACHEL FELTMAN, FONDA MWANGI, JEFFERY DELVISCIO",People with PCOS Face Increased Eating Disorde...


## Analysis

In [16]:

# Custom preprocessing function
# Everything below that does not depend on the input text is built once here
# instead of on every call (the stop-word list in particular is loaded from
# the NLTK corpus each time stopwords.words() is called).
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_CUSTOM_STOP_WORDS = {'new', 'research', 'scientists', 'researchers', 'study', 'find', 'found', 'show', 'shows', 'science', 'scientific', 'american', 'podcast', 'episode', 'quickly'}
_STOP_WORDS = set(stopwords.words('english')) | _CUSTOM_STOP_WORDS
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip every character that is neither a word
    character nor whitespace, strip digit runs, split on whitespace, drop
    NLTK English stop words plus the custom corpus-specific stop words, and
    lemmatize the remaining tokens with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (e.g. NaN from a missing
        CSV field, or None) is treated as an empty document.

    Returns
    -------
    str
        The processed tokens joined by single spaces; '' when nothing is left
        or when ``text`` is not a string.
    """
    # Guard against missing values: pandas represents them as float NaN,
    # which has no .lower() and would otherwise raise AttributeError.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation (this also removes apostrophes, so "today's"
    # becomes "todays")
    text = _PUNCTUATION_RE.sub('', text)
    # Remove numbers
    text = _DIGITS_RE.sub('', text)
    # Tokenize, then remove stopwords and custom words
    tokens = [token for token in text.split() if token not in _STOP_WORDS]
    # Lemmatize
    # NOTE(review): stop words are filtered before lemmatization, so an
    # inflected form that lemmatizes to a stop word appears to slip through
    # ('american' still shows up in the recorded topic output). The order is
    # kept as-is so the recorded topics and hand-written topic labels stay
    # valid; confirm before changing.
    return ' '.join(_LEMMATIZER.lemmatize(token) for token in tokens)

# Clean the combined title + summary text of every episode
preprocessed_data = list(map(preprocess_text, df['combined_text']))

# Build the TF-IDF document-term matrix. Terms that occur in more than 95%
# of the documents, or in fewer than 2 documents, are left out of the vocabulary.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
tfidf_matrix = vectorizer.fit_transform(preprocessed_data)

# Fit LDA on the TF-IDF matrix; lda_output holds one row of topic weights
# per document.
# NOTE(review): LDA is usually fit on raw term counts (CountVectorizer);
# TF-IDF input is kept here because the recorded results below rely on it.
n_topics = 10  # You can adjust this number
lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=n_topics,
)
lda_output = lda_model.fit_transform(tfidf_matrix)

# Function to print top words for each topic
def print_topics(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model
        Object exposing ``components_``, iterated row by row; each row must
        support ``.argsort()`` (one weight per vocabulary entry).
    feature_names
        Sequence mapping a column index of ``components_`` to its word.
    n_top_words : int
        How many words to list per topic, in descending weight order.

    Prints one line per topic, numbered from 1; returns nothing.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it to rank from heaviest to lightest
        ranked = weights.argsort()[::-1]
        top_words = [feature_names[idx] for idx in ranked[:n_top_words]]
        print(f"Topic {topic_number}: {', '.join(top_words)}")

# Report the ten highest-weighted words of each topic
print("Topics found by LDA using TF-IDF:")
print_topics(
    lda_model,
    vectorizer.get_feature_names_out(),
    10,
)

# Label every document with the (0-based) index of its highest-weighted topic
df['Dominant_Topic'] = np.argmax(lda_output, axis=1)

# Tally the documents per topic, ordered by topic index
topic_counts = df['Dominant_Topic'].value_counts().sort_index()
print("\nNumber of documents in each topic:")
print(topic_counts)

Topics found by LDA using TF-IDF:
Topic 1: brief, around, one, technology, report, prize, kavli, including, news, world
Topic 2: dog, human, year, intelligence, good, artificial, getting, get, could, health
Topic 3: award, bird, change, climate, sponsored, cancer, problem, day, host, work
Topic 4: human, year, ai, one, past, ice, help, may, space, moon
Topic 5: nuclear, report, christopher, music, intagliata, space, lung, reservation, would, weapon
Topic 6: face, immunity, dna, shot, math, talk, hope, large, likely, cleo
Topic 7: election, talk, offer, turtle, language, get, dinosaur, key, sea, arctic
Topic 8: covid, vaccine, pandemic, health, variant, today, editor, senior, american, josh
Topic 9: bird, brain, sound, people, bat, help, love, get, secret, may
Topic 10: could, life, might, mean, make, something, food, mind, year, wildlife

Number of documents in each topic:
Dominant_Topic
0    71
1    54
2    54
3    65
4    52
5    43
6    56
7    89
8    57
9    59
Name: count, dtype: int64

In [17]:
# Give each topic a human-readable name. Keys are the 0-based topic indices
# produced by lda_output.argmax(axis=1) (i.e. "Topic 1" in the printed list
# is key 0). The labels were written by hand from the printed top words, so
# they must be revisited whenever the model or preprocessing changes.
topic_names = {
    0: "Science Awards and Technology News",
    1: "Artificial Intelligence and Human Health",
    2: "Climate Change and Environmental Challenges",
    3: "Space Exploration and AI",
    4: "Nuclear Weapons and Science Reporting",
    5: "Immunology, Genetics, and Mathematics",
    6: "Environmental Conservation and Politics",
    7: "COVID-19 Pandemic and Health Updates",
    8: "Animal Communication and Neuroscience",
    9: "Wildlife and Human Experience"
}

# Add the topic names to the dataframe.
# The labels are derived from lda_output instead of from the existing
# 'Dominant_Topic' column so that this cell can be re-run safely: mapping the
# column onto itself a second time would look up the names (not the indices)
# in topic_names and turn every value into NaN.
df['Dominant_Topic'] = pd.Series(
    lda_output.argmax(axis=1), index=df.index
).map(topic_names)



            Category                            Dominant_Topic
0               MATH     Nuclear Weapons and Science Reporting
1           MEDICINE   Environmental Conservation and Politics
2         TECHNOLOGY             Wildlife and Human Experience
3               MATH     Immunology, Genetics, and Mathematics
4        HEALTH CARE      COVID-19 Pandemic and Health Updates
..               ...                                       ...
595  SPACE & PHYSICS             Wildlife and Human Experience
596          BIOLOGY  Artificial Intelligence and Human Health
597      ENVIRONMENT     Animal Communication and Neuroscience
598     THE SCIENCES     Immunology, Genetics, and Mathematics
599          BIOLOGY                  Space Exploration and AI

[600 rows x 2 columns]
