# spaCy
## Example 1

In [None]:
import spacy
import os

nlp = spacy.load('en_core_web_trf') # loads the most accurate model

folder_path = "/Users/admin/Documents/Academic/Fall 2024/CS 491-101/transcripts"

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        print(f"Entities in '{filename.upper()}':\n")
        
        # Combine folder path with the filename to get the full path
        file_path = os.path.join(folder_path, filename)
        
        # Open the file using the full path
        with open(file_path, "r") as file:
            transcript = file.read()
        
        doc = nlp(transcript)
        
        # Create a dictionary to hold entities categorized by their labels
        entities_by_label = {}

        for ent in doc.ents:
            if ent.label_ not in entities_by_label:
                entities_by_label[ent.label_] = []  # Initialize list if label not seen
            entities_by_label[ent.label_].append(ent.text)  # Append entity text

        for label, entities in entities_by_label.items():
            formatted_entities = [f"'{entity}'" for entity in set(entities)]  # Enclose each entity in quotes
            print(f"{label}: {', '.join(formatted_entities)}")  # Join with commas

    
    print('─' * 60)  # prints a line the with of the terminal
    

        


### Output:
```
Entities in 'A ONE MINUTE TEDX TALK FOR THE DIGITAL AGE  WOODY ROSELAND  TEDXMILEHIGH.TXT':

DATE: 'years', 'two thousand and nine'
TIME: 'eighteen minute', 'forty four seconds', 'under a minute'
PERSON: 'ted'
CARDINAL: 'one'
────────────────────────────────────────────────────────────
Entities in 'THE 1 MINUTE THAT MIGHT CHANGE YOUR DAY - MOTIVATIONAL VIDEO ✔️.TXT':

TIME: 'those lonely nights'
DATE: 'today'
────────────────────────────────────────────────────────────
Entities in 'REVIEWER-PS-COM.TXT':

ORG: 'Facebook'
DATE: 'Fridays', 'years', '2009'
TIME: '44 seconds', '18-minute', 'under a minute'
CARDINAL: 'one'
FAC: 'Claus Plaza'
PRODUCT: 'Wheels'
────────────────────────────────────────────────────────────
Entities in 'TRUMP GETS SURPRISE TWIST ON 150 MILLION DEFAMATION AWARD.TXT':

PERSON: 'beryl howell', 'this ken caruso', 'annie', 'judi giuliani', 'rudy', 'shakespeare', 'biden', 'giuliani', 'christopher walken', 'j moss', 'lane', 'shea blas', 'de rudy giuliani', 'humphrey bogart', 'broody', 'rodolfo giuliani', 'e jean carroll', 'shamos', 'david lapp koski', 'italy', 'lauren bacall mercedes benz', 'cliff anger', 'lauren bacall', 'amos', 'michael', 'juliana', 'trump', 'joe biden', 'lowly', 'al sharpton', 'ruby freeman', 'donald trump', 'christopher walkins', 'ken caruso', 'rodolfo giuliani's', 'moss', 'david lop koski', 'bruce willis', 'michael kopek', 'docks', 'freeman', 'ken caruso's', 'rudy giuliani', 'ruby free'
MONEY: 'the one hundred', 'two million dollars', 'over three hundred thousand dollars', 'two million dollar', 'about ten million dollars', 'one hundred and fifty million', 'one hundred million dollar', 'the two million dollars'
...
PERCENT: 'twenty percent', 'one hundred percent'
PRODUCT: 'the nineteen eighty', 'mercedes'
QUANTITY: 'two hundred thousand', 'five and a half pages'
────────────────────────────────────────────────────────────
```
*Output is truncated.*

# spaCy
## Example 2 (via claude.ai)

In [None]:
import spacy
import os
from collections import Counter

# Path to the folder with the transcripts
folder_path = "/Users/admin/Documents/Academic/Fall 2024/CS 491-101/transcripts"

def process_transcript(file_path, nlp):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    doc = nlp(text)
    
    # Extract relevant information
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    key_words = [token.lemma_ for token in doc if token.pos_ in ['NOUN', 'VERB', 'ADJ'] and not token.is_stop]
    
    return {
        'entities': entities,
        'noun_phrases': noun_phrases,
        'key_words': key_words
    }

def analyze_multiple_transcripts(folder_path):
    nlp = spacy.load("en_core_web_trf")
    
    all_entities = []
    all_noun_phrases = []
    all_key_words = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            result = process_transcript(file_path, nlp)
            
            all_entities.extend(result['entities'])
            all_noun_phrases.extend(result['noun_phrases'])
            all_key_words.extend(result['key_words'])
    
    # Analyze the combined data
    entity_freq = Counter(all_entities)
    noun_phrase_freq = Counter(all_noun_phrases)
    keyword_freq = Counter(all_key_words)
    
    return {
        'top_entities': entity_freq.most_common(20),
        'top_noun_phrases': noun_phrase_freq.most_common(20),
        'top_keywords': keyword_freq.most_common(50)
    }

def generate_summary(analysis):
    summary = "Podcast Transcripts Analysis Summary:\n\n"
    
    summary += "Top Entities (Name, Type, Frequency):\n"
    for (entity, entity_type), freq in analysis['top_entities']:
        summary += f"- {entity} ({entity_type}): {freq}\n"
    
    summary += "\nTop Noun Phrases:\n"
    for phrase, freq in analysis['top_noun_phrases']:
        summary += f"- {phrase}: {freq}\n"
    
    summary += "\nTop Keywords:\n"
    for word, freq in analysis['top_keywords']:
        summary += f"- {word}: {freq}\n"
    
    return summary

# Usage
analysis = analyze_multiple_transcripts(folder_path)
summary = generate_summary(analysis)

print(summary)

# The 'summary' variable now contains a comprehensive analysis of all transcripts,
# which can be used as input for a generative AI model to create a new podcast.


### Output:
```
Podcast Transcripts Analysis Summary:

Top Entities (Name, Type, Frequency):
- rudy giuliani (PERSON): 11
- florida (GPE): 8
- new york (GPE): 7
- donald trump (PERSON): 7
- rudy (PERSON): 5
- shakespeare (PERSON): 4
- two (CARDINAL): 4
- lane (PERSON): 4
- years (DATE): 3
- two million dollars (MONEY): 3
- first (ORDINAL): 3
- four (CARDINAL): 3
- trump (PERSON): 3
- christopher walken (PERSON): 3
- under a minute (TIME): 2
- one (CARDINAL): 2
- five (CARDINAL): 2
- surfside florida (GPE): 2
- manhattan (GPE): 2
- road (ORG): 2

Top Noun Phrases:
...
- give: 4
- work: 4
- apartment: 4
```

# Testing out NLTK for NLP Processing

## Overview
NLTK (Natural Language Toolkit) can be used for processing and analyzing transcripts. This library offers robust functionality for tokenization, part-of-speech tagging, and named entity recognition. While it provides detailed linguistic analysis, there are areas where it could be further improved or customized.

Here's a site containing the tags and their meanings: 
https://cs.nyu.edu/~grishman/jet/guide/PennPOS.html

### The library includes features such as:
- **Tokenization of text**
- **Part-of-speech tagging** using the Penn Treebank tagset
- **Named entity recognition**
- **Custom filtering of entities and parts of speech**

One of the strengths of this library is its ability to provide detailed part-of-speech information, which can be valuable for linguistic analysis. However, like any NLP tool, it may have limitations in accurately identifying certain complex named entities or handling ambiguous language constructs.

## Potential Improvements
- **Advanced Models**: We could explore more advanced NLTK models or even integrate other NLP libraries like spaCy or Stanford NLP for potentially improved accuracy in named entity recognition.
- **Custom Entity Recognition**: Implement domain-specific entity recognition for improved accuracy in particular contexts (e.g., legal, medical, or technical transcripts).
- **Error Handling**: Enhance the library's ability to handle and report potential misclassifications or ambiguities.
- **Performance Optimization**: For large volumes of text, we might need to consider optimizing the processing speed.
- **Integration with Other Tools**: Consider integrating with other analysis or visualization tools to provide more comprehensive insights from the processed text.

We should continue testing with a variety of transcripts to identify any consistent patterns of errors or misclassifications. This will help us fine-tune the library for better accuracy across different types of content.







In [3]:
import nltk
import os

# Set the NLTK_DATA path directly in the script
os.environ['NLTK_DATA'] = "/Users/admin/Documents/Academic/Fall 2024/CS 491-101/UFO-news/venv/nltk_data"
# print("NLTK_DATA:", os.getenv("NLTK_DATA"))

# To successfully manage the nltk data, in your terminal you must type:
# export NLTK_DATA=".../venv/nltk_data"

# Path to the folder with the transcripts
folder_path = "/Users/admin/Documents/Academic/Fall 2024/CS 491-101/transcripts"

def format_entity(entity):
    if isinstance(entity, nltk.Tree):
        return f"{entity.label()}: {' '.join(word for word, tag in entity.leaves())}"
    else:
        word, tag = entity
        return f"{word} ({get_tag_description(tag)})"

def get_tag_description(tag):
    tag_descriptions = {
        'CC': 'Coordinating conjunction',
        'CD': 'Cardinal number',
        'DT': 'Determiner',
        'EX': 'Existential there',
        'FW': 'Foreign word',
        'IN': 'Preposition or subordinating conjunction',
        'JJ': 'Adjective',
        'JJR': 'Adjective, comparative',
        'JJS': 'Adjective, superlative',
        'LS': 'List item marker',
        'MD': 'Modal',
        'NN': 'Noun, singular or mass',
        'NNS': 'Noun, plural',
        'NNP': 'Proper noun, singular',
        'NNPS': 'Proper noun, plural',
        'PDT': 'Predeterminer',
        'POS': 'Possessive ending',
        'PRP': 'Personal pronoun',
        'PRP$': 'Possessive pronoun',
        'RB': 'Adverb',
        'RBR': 'Adverb, comparative',
        'RBS': 'Adverb, superlative',
        'RP': 'Particle',
        'SYM': 'Symbol',
        'TO': 'to',
        'UH': 'Interjection',
        'VB': 'Verb, base form',
        'VBD': 'Verb, past tense',
        'VBG': 'Verb, gerund or present participle',
        'VBN': 'Verb, past participle',
        'VBP': 'Verb, non-3rd person singular present',
        'VBZ': 'Verb, 3rd person singular present',
        'WDT': 'Wh-determiner',
        'WP': 'Wh-pronoun',
        'WP$': 'Possessive wh-pronoun',
        'WRB': 'Wh-adverb'
    }
    return tag_descriptions.get(tag, tag)

def filter_entities(entities, filter_type):
    filtered = []
    filter_type = filter_type.lower()
    
    for entity in entities:
        if isinstance(entity, nltk.Tree):
            if filter_type in entity.label().lower():
                filtered.append(entity)
        else:
            word, tag = entity
            tag_desc = get_tag_description(tag).lower()
            if filter_type in tag_desc:
                filtered.append(entity)
    
    return filtered

def process_transcript(file_path, filter_type=None):
    with open(file_path, "r") as file:
        transcript = file.read()
    
    tokens = nltk.word_tokenize(transcript)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    
    if filter_type:
        entities = filter_entities(entities, filter_type)
    
    return entities

# Main execution
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        
        print(f"Processing {filename}:")
        
        # Example: Filter for adjectives
        adjectives = process_transcript(file_path, "adjective")
        print("Adjectives:")
        for adj in adjectives:
            print(format_entity(adj))
        
        # Example: Filter for person names
        persons = process_transcript(file_path, "person")
        print("\nPerson names:")
        for person in persons:
            print(format_entity(person))
        
        # Example: Filter for all entities (no filter)
        all_entities = process_transcript(file_path)
        print("\nAll entities:")
        for entity in all_entities:
            print(format_entity(entity))
        
        print("\n" + "-"*40 + "\n")  # Separator between files


NLTK_DATA: /Users/admin/Documents/Academic/Fall 2024/CS 491-101/UFO-news/venv/nltk_data
Processing A one minute TEDx Talk for the digital age  Woody Roseland  TEDxMileHigh.txt:
Adjectives:
i (Adjective)
honest (Adjective, superlative)
internet (Adjective)
i (Adjective)
weird (Adjective)
little (Adjective)
dead (Adjective)
i (Adjective)
last (Adjective)
eighteen (Adjective)
ted (Adjective)
quick (Adjective)
final (Adjective)
expensive (Adjective)

Person names:
'm (Verb, non-3rd person singular present)
do (Verb, non-3rd person singular present)
you (Personal pronoun)
think (Verb, non-3rd person singular present)
do (Verb, non-3rd person singular present)
they (Personal pronoun)
they (Personal pronoun)
're (Verb, non-3rd person singular present)
get (Verb, non-3rd person singular present)
it (Personal pronoun)
it (Personal pronoun)
think (Verb, non-3rd person singular present)
's (Verb, 3rd person singular present)
get (Verb, non-3rd person singular present)
it (Personal pronoun)
they (

## Output:
```
Processing A one minute TEDx Talk for the digital age  Woody Roseland  TEDxMileHigh.txt:
Adjectives:
i (Adjective)
honest (Adjective, superlative)
internet (Adjective)
i (Adjective)
weird (Adjective)
little (Adjective)
dead (Adjective)
i (Adjective)
last (Adjective)
eighteen (Adjective)
ted (Adjective)
quick (Adjective)
final (Adjective)
expensive (Adjective)

Person names:
'm (Verb, non-3rd person singular present)
do (Verb, non-3rd person singular present)
you (Personal pronoun)
think (Verb, non-3rd person singular present)
do (Verb, non-3rd person singular present)
they (Personal pronoun)
...
n (Noun, singular or mass)

----------------------------------------
```
*Output is truncated*

# Genism
## *I am unable to get this code to run because of dependancy issues. Will likely have to create a seperate venv for this cell.*

In [None]:
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from numpy import triu
import os

def process_transcripts(folder_path):
    documents = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                documents.append(file.read())
    
    # Tokenize and preprocess the documents
    texts = [simple_preprocess(doc) for doc in documents]
    
    # Create a dictionary and corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    # Train the LDA model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100)
    
    return lda_model

folder_path = "/Users/admin/Documents/Academic/Fall 2024/CS 491-101/transcripts"
model = process_transcripts(folder_path)

# Print the top 10 words for each topic
for idx, topic in model.print_topics(-1, 10):
    print(f"Topic: {idx}")
    print(topic)
    print()
