In [4]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Download WordNet data if not already downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

def generate_similar_words(keyword):
    # Initialize WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Find synonyms using WordNet
    synonyms = set()
    for syn in wordnet.synsets(keyword):
        for lemma in syn.lemmas():
            # Exclude compound words or phrases
            if "_" not in lemma.name() and len(lemma.name().split()) == 1:
                synonyms.add(lemma.name())
    
    # Generate morphological variations
    variations = set([
        keyword,  # Original word
        lemmatizer.lemmatize(keyword, pos='n'),  # Lemma (noun)
        lemmatizer.lemmatize(keyword, pos='v'),  # Lemma (verb)
        lemmatizer.lemmatize(keyword, pos='a'),  # Lemma (adjective)
        f"{keyword}s",  # Plural
        f"{keyword}ed",  # Past tense
        f"{keyword}ing"  # Present participle
    ])
    
    # Combine and deduplicate
    similar_words = sorted(synonyms.union(variations))
    return similar_words

# Example usage
keyword = "admission"
similar_words = generate_similar_words(keyword)
print(similar_words)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lindaliang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/lindaliang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['access', 'accession', 'admission', 'admissioned', 'admissioning', 'admissions', 'admittance', 'entree']


In [5]:
similar_words

['access',
 'accession',
 'admission',
 'admissioned',
 'admissioning',
 'admissions',
 'admittance',
 'entree']

In [6]:
!pip install openai



In [14]:
pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [16]:
import openai
import os
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key = openai.api_key)

def generate_summary_for_topics(topics: dict) -> dict:
    """
    Generates one-word summaries for the given topics using OpenAI API (GPT-4).
    
    Args:
        topics (dict): Dictionary with topic names and their top words.
    
    Returns:
        dict: Dictionary with topic names and their one-word summaries.
    """
    # Create a prompt with all the topics and their words
    prompt = "For each of the following topics, summarize the key idea in one word:\n\n"
    for topic, words in topics.items():
        prompt += f"{topic}: {', '.join(words)}\n"
    
    try:
        # Make the API call to OpenAI (using GPT-4)
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=50,  # Limit response length for better formatting
            temperature=0.5  # Adjust creativity
        )
        
        # Extract the response
        summary = response.choices[0].message.content
        
        # Parse the summary into a dictionary
        summarized_topics = {}
        topic_summaries = summary.split("\n")
        for line in topic_summaries:
            if ':' in line:
                topic_name, topic_summary = line.split(":", 1)
                summarized_topics[topic_name.strip()] = topic_summary.strip()
        
        return summarized_topics

    except Exception as e:
        print(f"Error generating summary: {e}")
        return {}

# Example usage
topics = {
    "Topic 1": ["data", "analysis", "machine", "learning", "algorithm", "model", "predict", "accuracy", "statistics", "performance"],
    "Topic 2": ["hospital", "patient", "care", "health", "medical", "treatment", "doctor", "nurse", "clinic", "hospitalization"],
    "Topic 3": ["sports", "team", "soccer", "goal", "match", "players", "tactics", "league", "competition", "performance"],
    "Topic 4": ["education", "university", "learning", "student", "campus", "teacher", "classroom", "course", "degree", "professor"],
    "Topic 5": ["finance", "economy", "stock", "investment", "market", "business", "capital", "growth", "interest", "risk"]
}

summarized_topics = generate_summary_for_topics(topics)
print(summarized_topics)

{'Topic 1': 'Analytics', 'Topic 2': 'Healthcare', 'Topic 3': 'Sports', 'Topic 4': 'Education', 'Topic 5': 'Finance'}


# From here is analysis on a specific post

In [43]:
import praw
import pandas as pd
import os
from dotenv import load_dotenv

def get_api_data(subreddit_name, search_keyword, limit=1000):
    """
    Function to fetch Reddit posts from a given subreddit based on a search keyword.

    Parameters:
    - subreddit_name (str): The name of the subreddit to search in.
    - search_keyword (str): The keyword to search for in the subreddit.
    - limit (int, optional): The maximum number of posts to fetch. Default is 1000.

    Returns:
    - pd.DataFrame: A DataFrame containing post data (Title, Score, URL, Created, Subreddit, Text).
    """
    # Load environment variables
    load_dotenv()

    # Set up Reddit API client
    reddit = praw.Reddit(
        client_id=os.environ.get("REDDIT_CLIENT_ID"),
        client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
        user_agent='your_user_agent'
    )

    # Search for posts in the specified subreddit with the given keyword
    posts = reddit.subreddit(subreddit_name).search(search_keyword, sort='relevance', limit=limit)

    # Create an empty list to store post data
    post_data = []

    # Extract relevant data from each post
    for post in posts:
        post_data.append({
            'Title': post.title,
            'Score': post.score,
            'URL': post.url,
            'Created': post.created_utc,
            'Subreddit': post.subreddit.display_name,
            'Text': post.selftext
        })

    # Convert the list of post data to a DataFrame
    return pd.DataFrame(post_data)

In [44]:
df = get_api_data('Northwestern', 'mlds', limit=1000)
df.head()

Unnamed: 0,Title,Score,URL,Created,Subreddit,Text
0,Confused between MSDS & MLDS,2,https://www.reddit.com/r/Northwestern/comments...,1730719000.0,Northwestern,Why do they have two different programs for a ...
1,Honest Review of Northwestern MS in Machine Le...,32,https://www.reddit.com/r/Northwestern/comments...,1713151000.0,Northwestern,Tldr: its pretty overrated and students are n...


In [45]:
from transformers import pipeline

def summarize_first_row(df):
    # Initialize the summarizer
    summarizer = pipeline("summarization", model="t5-large",tokenizer="t5-large")

    # Filter rows where length of Text > 1000
    filtered_df = df[df['Text'].str.len() > 1000]

    # Sort the DataFrame by Score in descending order
    sorted_df = filtered_df.sort_values(by='Score', ascending=False)

    # Get the Text from the first row
    text_to_summarize = sorted_df.iloc[0]['Text']

    # Generate the summary of the text
    summary = summarizer(text_to_summarize, max_length=150, min_length=50, do_sample=False)

    return summary[0]['summary_text']

# Example usage:
# Assuming you already have your DataFrame `df` loaded with the required columns
summary = summarize_first_row(df)
print("Summary of the first row:")
print(summary)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Summary of the first row:
the program director, Diego klabjan, is very condescending and makes decisions on a whim . the program does not really do anything to help you with getting a job . be prepared to apply to hundreds of jobs and internships online .


In [46]:
from transformers import pipeline
import nltk

def sentiment_analysis_by_paragraph(df):
    # Filter rows where length of Text > 1000
    filtered_df = df[df['Text'].str.len() > 1000]

    # Sort the DataFrame by Score in descending order
    sorted_df = filtered_df.sort_values(by='Score', ascending=False)

    # Get the Text from the first row
    text = sorted_df.iloc[0]['Text']
    
    # Load the pre-trained emotion classification model
    emotion_model = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

    # Define color mapping for emotions
    emotion_colors = {
        'anger': 'red',
        'disgust': 'green',
        'fear': 'orange',
        'joy': 'yellow',
        'love': 'pink',
        'sadness': 'blue',
        'surprise': 'purple',
        'neutral': 'gray'  # Add neutral as gray for neutral emotions
    }


    # Chunk the text into paragraphs
    nltk.download('punkt')
    paragraphs = text.split('\n')

    # Remove empty paragraphs (if any)
    paragraphs = [p for p in paragraphs if p.strip()]

    # Perform emotion detection on each paragraph
    chunk_emotions = []
    for chunk in paragraphs:
        emotions = emotion_model(chunk)
        chunk_emotions.append((chunk, emotions[0]['label']))  # Save the paragraph and its emotion label

    # Generate HTML with highlighted paragraphs
    html_output = "<html><body>"

    for idx, (para, emotion) in enumerate(chunk_emotions):
        color = emotion_colors.get(emotion.lower(), 'gray')  # Default to gray if emotion not found
        html_output += f'<p style="color:{color};"><b>Paragraph {idx+1} ({emotion}):</b><br>{para}</p>'

    html_output += "</body></html>"

    # Save to an HTML file or display in Jupyter
    with open("highlighted_paragraphs.html", "w") as f:
        f.write(html_output)

    # If you're in a Jupyter notebook, you can render it directly
    from IPython.core.display import display, HTML
    display(HTML(html_output))
    
sentiment_analysis_by_paragraph(df)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lindaliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from IPython.core.display import display, HTML


# test runs

In [35]:
from transformers import pipeline

# Load the pre-trained emotion classification model
emotion_model = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

# Chunk the text (e.g., into sentences)
import nltk
nltk.download('punkt')
sentences = nltk.sent_tokenize(text)

# Step 1: Split text by newline characters into paragraphs
paragraphs = text.split('\n')

# Step 2: Remove empty paragraphs (if there are any)
paragraphs = [p for p in paragraphs if p.strip()]

# Step 3: Define chunk size (number of paragraphs per chunk)
chunk_size = 1 

# Step 4: Split paragraphs into chunks
chunks = [paragraphs[i:i + chunk_size] for i in range(0, len(paragraphs), chunk_size)]

# Perform emotion detection on each chunk
chunk_emotions = []
for chunk in chunks:
    chunk_text = ' '.join(chunk)
    emotions = emotion_model(chunk_text)
    chunk_emotions.append(emotions)

# Output emotions for each chunk
for idx, emotions in enumerate(chunk_emotions):
    print(f"Paragraph {idx+1}: {emotions}")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lindaliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Paragraph 1: [{'label': 'neutral', 'score': 0.49462947249412537}]
Paragraph 2: [{'label': 'neutral', 'score': 0.5831292271614075}]
Paragraph 3: [{'label': 'sadness', 'score': 0.5691455006599426}]
Paragraph 4: [{'label': 'fear', 'score': 0.8631186485290527}]
Paragraph 5: [{'label': 'neutral', 'score': 0.78631991147995}]
Paragraph 6: [{'label': 'fear', 'score': 0.3016975224018097}]
Paragraph 7: [{'label': 'neutral', 'score': 0.7004575729370117}]
Paragraph 8: [{'label': 'disgust', 'score': 0.33836647868156433}]
Paragraph 9: [{'label': 'neutral', 'score': 0.8757570385932922}]
Paragraph 10: [{'label': 'neutral', 'score': 0.9419630765914917}]
Paragraph 11: [{'label': 'neutral', 'score': 0.9168189167976379}]
Paragraph 12: [{'label': 'disgust', 'score': 0.6286032199859619}]
Paragraph 13: [{'label': 'neutral', 'score': 0.9650899171829224}]
Paragraph 14: [{'label': 'neutral', 'score': 0.8378678560256958}]
Paragraph 15: [{'label': 'disgust', 'score': 0.4352704882621765}]
Paragraph 16: [{'label': 

# Different paragraph chunking

In [39]:
from transformers import pipeline
import nltk

# Load the pre-trained emotion classification model
emotion_model = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

# Define color mapping for emotions
emotion_colors = {
    'anger': 'red',
    'disgust': 'green',
    'fear': 'orange',
    'joy': 'yellow',
    'love': 'pink',
    'sadness': 'blue',
    'surprise': 'purple',
    'neutral': 'gray'
}

# Chunk the text into paragraphs
nltk.download('punkt')
# Split the text into paragraphs
paragraphs = text.split('\n')

# Remove empty paragraphs (if any)
paragraphs = [p for p in paragraphs if p.strip()]

# Merge paragraphs with fewer than 20 words with the next paragraph
merged_paragraphs = []
i = 0
while i < len(paragraphs):
    # Check if the paragraph has fewer than 25 words and is not the last one
    if len(paragraphs[i].split()) < 25 and i < len(paragraphs) - 1:
        # Merge with the next paragraph
        merged_paragraphs.append(paragraphs[i] + ' ' + paragraphs[i + 1])
        i += 2  # Skip the next paragraph as it's merged
    else:
        # Keep the paragraph as is
        merged_paragraphs.append(paragraphs[i])
        i += 1

# Perform emotion detection on each merged paragraph
chunk_emotions = []
for chunk in merged_paragraphs:
    emotions = emotion_model(chunk)
    chunk_emotions.append((chunk, emotions[0]['label']))  # Save the paragraph and its emotion label

# Generate HTML with highlighted paragraphs
html_output = "<html><body>"

for idx, (para, emotion) in enumerate(chunk_emotions):
    color = emotion_colors.get(emotion.lower(), 'gray')  # Default to gray if emotion not found
    html_output += f'<p style="color:{color};"><b>Paragraph {idx+1} ({emotion}):</b><br>{para}</p>'

html_output += "</body></html>"

# Save to an HTML file or display in Jupyter
with open("highlighted_paragraphs.html", "w") as f:
    f.write(html_output)

# If you're in a Jupyter notebook, you can render it directly
from IPython.core.display import display, HTML
display(HTML(html_output))

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lindaliang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from IPython.core.display import display, HTML


In [34]:
pip install deepmoji

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Could not find a version that satisfies the requirement deepmoji (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for deepmoji[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [33]:
from deepmoji import DeepMoji
from deepmoji.tokenizer import tokenizer

# Initialize the DeepMoji model
model = DeepMoji.load_pretrained()

# Tokenize and classify a chunk of text
tokens = tokenizer.encode(text)
emotion_scores = model.predict([tokens])

# Output the detailed emotion scores
print(emotion_scores)

ModuleNotFoundError: No module named 'deepmoji'