<a href="https://colab.research.google.com/github/ktynski/Marketing_Automations_Notebooks_With_GPT/blob/main/Automatic_Brand_or_Entity_News_Media_Monitoring_and_Analysis_by_Kristin_frac_tl_(public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries

In [None]:
!pip install sentence-transformers
!pip install scikit-learn
!pip install google-search-results-serpapi
!pip install newspaper3k
!pip install google-search-results
!pip install openai
!pip install yellowbrick

## Run the script below to generate a DataFrame and CSV of the article analysis and clustering for the news articles found. Replace the keyword, SerpAPI key, and OpenAI API key with your own.

In [None]:
import concurrent.futures
import json
import os
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

import nltk
import openai
import pandas as pd
import requests
from newspaper import Article
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from transformers import GPT2Tokenizer
from yellowbrick.cluster import KElbowVisualizer

# Download the NLTK "punkt" data when the cell runs.
# NOTE(review): nothing in this cell calls nltk directly; presumably a
# dependency (e.g. newspaper) needs it - confirm before removing.
nltk.download("punkt")


# API credentials. Replace both placeholders with real keys before running.
# `serpapi_key` is read by get_google_news_data(); `openai.api_key` configures
# the openai module used for all completion requests below.
serpapi_key = 'Your SerpAPI Key'
openai.api_key = 'Your OpenAI Api Key'




# Load the GPT-2 tokenizer (module-level so truncate_text() can reuse it).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def truncate_text(text, max_tokens):
    """
    Cut `text` down to at most `max_tokens` GPT-2 tokens.

    The text is encoded with the module-level GPT-2 `tokenizer`, the token
    sequence is clipped when it is longer than `max_tokens`, and the
    (possibly clipped) sequence is decoded back into a string.

    Args:
        text (str): Text to tokenize and shorten.
        max_tokens (int): Upper bound on the number of tokens to keep.

    Returns:
        str: The decoded text after truncation.
    """
    ids = tokenizer.encode(text)
    kept_ids = ids[:max_tokens] if len(ids) > max_tokens else ids
    return tokenizer.decode(kept_ids)





def get_google_news_data(query, num_results=100):
    """
    Query SerpAPI's Google News search and return the articles it lists.

    Args:
        query (str): Search keyword(s).
        num_results (int): Number of results to request from SerpAPI.

    Returns:
        list[dict]: One dict per news result with the keys 'title', 'link',
        'date' and 'source'. A field missing from a result is stored as None
        instead of aborting the whole fetch. Returns an empty list (after
        printing a notice) when the response has no 'news_results' entry.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    params = {
        "api_key": serpapi_key,
        "engine": "google",
        "q": query,
        "tbm": "nws",
        "num": num_results,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get('https://serpapi.com/search.json', params=params, timeout=30)
    data = json.loads(response.text)

    if 'news_results' not in data:
        print("No news results found.")
        return []

    fields = ('title', 'link', 'date', 'source')
    # .get() keeps one incomplete result from raising KeyError for the batch.
    return [
        {field: result.get(field) for field in fields}
        for result in data['news_results']
    ]

def fetch_articles(brand,num_results, num_clusters=30):
    """
    Fetch news articles for `brand`, scrape their text and cluster them.

    Args:
        brand (str): Search keyword; also stored in the 'brand' column.
        num_results (int): Number of news results to request.
        num_clusters (int): Desired number of clusters. Capped at the number
            of articles actually returned, because a clustering cannot have
            more clusters than samples.

    Returns:
        pandas.DataFrame: One row per article with the columns 'brand',
        'title', 'link', 'date', 'source', 'text' and 'cluster'. The frame is
        empty (same columns) when the search returns no articles.
    """
    columns = ['brand', 'title', 'link', 'date', 'source', 'text', 'cluster']

    articles = get_google_news_data(brand, num_results)
    if not articles:
        # Nothing to embed or cluster; return a frame callers can still use.
        return pd.DataFrame(columns=columns)

    embeddings, article_texts = generate_embeddings(articles)
    clusters = cluster_articles(embeddings, min(num_clusters, len(articles)))

    # Reuse the texts scraped for the embeddings instead of downloading every
    # article a second time; this also keeps 'text' identical to what was
    # actually clustered.
    article_list = [
        {
            'brand': brand,
            'title': article['title'],
            'link': article['link'],
            'date': article['date'],
            'source': article['source'],
            'text': text,
            'cluster': cluster,
        }
        for article, text, cluster in zip(articles, article_texts, clusters)
    ]
    return pd.DataFrame(article_list, columns=columns)


def generate_embeddings(articles):
    """
    Scrape each article and encode the scraped texts as sentence embeddings.

    Args:
        articles (list[dict]): Article records; the 'link' entry of each is
            scraped (an empty string is used when 'link' is absent).

    Returns:
        tuple: (embeddings, article_texts) - the encoder output for the
        texts, and the list of scraped texts in the same order as `articles`.
    """
    encoder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
    article_texts = []
    for entry in articles:
        article_texts.append(scrape_article(entry.get('link', '')))
    return encoder.encode(article_texts), article_texts

def cluster_articles(embeddings, num_clusters=30):
    """
    Assign every embedding to one of `num_clusters` K-Means clusters.

    Args:
        embeddings: Embedding vectors to cluster, one per article.
        num_clusters (int): Number of clusters to fit.

    Returns:
        The cluster label predicted for each embedding, in input order.
    """
    return KMeans(n_clusters=num_clusters).fit_predict(embeddings)

def find_optimal_clusters(embeddings, max_clusters=30):
    """
    Estimate a suitable cluster count with yellowbrick's elbow visualizer.

    Args:
        embeddings: Embedding vectors to evaluate.
        max_clusters (int): Upper end of the cluster counts to try; the
            visualizer is given k=(2, max_clusters + 1).

    Returns:
        The `elbow_value_` reported by the fitted visualizer (silhouette
        metric).
    """
    k_range = (2, max_clusters + 1)
    elbow = KElbowVisualizer(KMeans(), k=k_range, metric='silhouette')
    elbow.fit(embeddings)
    return elbow.elbow_value_

def scrape_article(url):
    """
    Download and parse the article at `url`, returning its title and text.

    Download/parse errors are reported on stdout and otherwise ignored
    (best effort), so whatever title and text the Article object holds at
    that point is still returned.

    Args:
        url (str): Address of the article to scrape.

    Returns:
        str: The article title and body text joined by a single space.
    """
    page = Article(url)
    try:
        page.download()
        page.parse()
    except Exception as err:
        print(f"Failed to Download Article: {err}")
    return " ".join((page.title, page.text))

# DataFrame column -> section heading the model is asked to produce.
_ANALYSIS_COLUMNS = {
    "main_themes": "Main Themes",
    "narratives": "Narratives",
    "opinions": "Opinions",
    "spokespersons": "Spokespersons",
    "biases": "Biases",
    "emotion": "Article Emotion",
}


def _build_analysis_prompt(article_text):
    """Return the completion prompt requesting a structured analysis of one article."""
    return (
        "Please analyze this news article and provide a comprehensive summary based on the following categories. Please answer every part of the following:\n\n"
        "Main Themes: Identify the central topics discussed in the article.\n"
        "Narratives: Describe any storylines or overarching messages present in the article.\n"
        "Opinions: Mention the key viewpoints or perspectives expressed in the article, along with their sources (if mentioned).\n"
        "Spokespersons: List any individuals or organizations mentioned as sources, along with their roles or affiliations.\n"
        "Biases: Point out any potential biases in the article, whether it's through language, perspective, or focus.\n"
        "Article Emotion: Determine the dominant emotion(s) conveyed by the article (e.g., positive, negative, neutral, etc.).\n\n"
        "Please provide your analysis in a well-structured and concise format. Use bullet points or numbered lists to make your response easier to read and understand.\n\n"
        f"This is the News Article to evaluate. Only provide the requested data and nothing else before Main Themes: \n\n {article_text}"
    )


def _submit_analysis(executor, prompt, temperature):
    """Queue one completion request on `executor` and return its future."""
    # NOTE(review): openai.Completion and "text-davinci-003" belong to the
    # legacy (pre-1.0) openai SDK - confirm the installed version supports them.
    return executor.submit(
        openai.Completion.create,
        model="text-davinci-003",
        prompt=prompt,
        max_tokens=1800,
        n=1,
        stop=None,
        temperature=temperature,
    )


def _parse_analysis(output_text):
    """Split a completion into a {section heading: section body} dict."""
    parsed = {}
    for item in output_text.split("\n\n"):
        key, sep, value = item.partition(":")
        if not sep:
            # A paragraph without "heading:" is not a section; skip it
            # rather than letting it abort parsing of the whole response.
            continue
        parsed[key.strip()] = value.strip()
    return parsed


def analyze_articles(df):
    """
    Add a model-generated analysis to every article row of `df`.

    For each row, the 'text' column is truncated to 2000 tokens and sent to
    the completion model; the response is split into sections and written to
    the columns 'main_themes', 'narratives', 'opinions', 'spokespersons',
    'biases' and 'emotion'. A response lacking the "Main Themes" section is
    retried up to three times with that same row's prompt.

    Args:
        df (pandas.DataFrame): Articles with a 'text' column. Modified in
            place: the six analysis columns are created up front (as None)
            so they exist even when no row is analysed successfully.

    Returns:
        pandas.DataFrame: The same `df` object with the analysis columns.
    """
    for column in _ANALYSIS_COLUMNS:
        if column not in df.columns:
            df[column] = None

    max_retries = 3
    with ThreadPoolExecutor() as executor:
        pending = []
        for i, row in df.iterrows():
            # Space out submissions to stay under the API rate limit.
            time.sleep(2)
            prompt = _build_analysis_prompt(truncate_text(row["text"], 2000))
            # Keep each row's prompt with its future so a retry resends the
            # right article, not whichever prompt was built last.
            pending.append((i, prompt, _submit_analysis(executor, prompt, 0.7)))

        for i, prompt, future in pending:
            retries = 0
            while True:
                try:
                    response = future.result()
                    parsed_data = _parse_analysis(response.choices[0].text.strip())
                except Exception as exc:
                    # Best effort: report the failing row and move on.
                    print(f"failed for row {i}: {exc!r}")
                    break

                if "Main Themes" in parsed_data:
                    for column, heading in _ANALYSIS_COLUMNS.items():
                        df.loc[i, column] = parsed_data.get(heading, "")
                    break

                if retries >= max_retries:
                    print(f"Incomplete data for row {i}, giving up after {max_retries} retries.")
                    break
                retries += 1
                print(f"Incomplete data for row {i}, retrying {retries}/{max_retries}...")
                future = _submit_analysis(executor, prompt, 0.5)
    return df

# Pipeline settings: how many news results to request and the keyword to search.
num_results = 10
topic = "OpenAI"
# Fetch, scrape and cluster the articles, then add the per-article analysis columns.
df = fetch_articles(topic, num_results, num_clusters=10)
df = analyze_articles(df)
# Save the results; the report cell further down reads this file back in.
df.to_csv("Media_Monitoring_Report.csv")


## Print Dataframe

In [None]:
df

## Run Media Monitoring Report on Data from the Last Step

In [None]:
import pandas as pd
import openai


def describe_clusters(df):
    """
    Ask the completion model for a one-sentence theme of each cluster.

    Args:
        df (pandas.DataFrame): Articles with 'cluster' and 'title' columns.

    Returns:
        dict: Maps each distinct value of the 'cluster' column to the
        stripped text the model returned for that cluster's titles.
    """
    summaries = {}

    for label in df['cluster'].unique():
        in_cluster = df['cluster'] == label
        titles_str = "\n".join(df.loc[in_cluster, 'title'].tolist())

        prompt = "".join([
            f"Please provide a brief description of the main topic or theme for the following list of article titles:\n\n",
            f"{titles_str}\n\n",
            f"Write a one-sentence description summarizing the main topic or theme.",
        ])

        completion = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            max_tokens=50,
            n=1,
            stop=None,
            temperature=0.5,
        )
        summaries[label] = completion.choices[0].text.strip()

    return summaries





def generate_report(most_common, cluster_counts, emotion_by_cluster, spokesperson_counts, bias_counts):
    """
    Produce a written media monitoring report from summary statistics.

    Each argument is interpolated as-is into the user prompt under its own
    heading, and the prompt is sent to the chat model together with a system
    message describing the expert persona.

    Args:
        most_common: Summary shown under "Most Common Information".
        cluster_counts: Summary shown under "Article Distribution Among Clusters".
        emotion_by_cluster: Summary shown under "Article Emotions Distribution Across Clusters".
        spokesperson_counts: Summary shown under "Most Frequently Mentioned Spokespersons".
        bias_counts: Summary shown under "Most Biased Articles or Sources".

    Returns:
        str: The stripped content of the model's first reply.
    """
    sections = [
        "Please generate a media monitoring report based on the following summary data:",
        f"Most Common Information:\n{most_common}",
        f"Article Distribution Among Clusters:\n{cluster_counts}",
        f"Article Emotions Distribution Across Clusters:\n{emotion_by_cluster}",
        f"Most Frequently Mentioned Spokespersons:\n{spokesperson_counts}",
        f"Most Biased Articles or Sources:\n{bias_counts}",
        "Write a well-structured and concise report summarizing the key findings from the provided data.",
    ]
    # Every section is separated from the next by one blank line.
    prompt = "\n\n".join(sections)

    system_message = {
        "role": "system",
        "content": "Please simulate an expert at media analysis with a strong background in psychology and human behavior who is a world expert at PR",
    }
    user_message = {"role": "user", "content": prompt}

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[system_message, user_message],
        max_tokens=1024,
        n=1,
        stop=None,
        temperature=0.5,
    )

    first_choice = response['choices'][0]
    return first_choice['message']['content'].strip()

def analyze_dataframe(df):
    """
    Print a generated media monitoring report and per-cluster descriptions.

    Summary statistics are computed from the 'main_themes', 'cluster',
    'emotion', 'spokespersons' and 'biases' columns, passed to
    generate_report(), and the resulting report is printed. The clusters are
    then described with describe_clusters() and printed one per line.

    Args:
        df (pandas.DataFrame): Analysed articles containing the columns
            listed above plus 'title'.

    Returns:
        None. All output goes to stdout.
    """
    def top_five(column):
        # The five most frequent values of a column, as {value: count}.
        return df[column].value_counts().head(5).to_dict()

    # Gather the summary figures the report prompt is built from.
    most_common = top_five('main_themes')
    cluster_counts = df['cluster'].value_counts().to_dict()
    emotion_table = df.groupby('cluster')['emotion'].value_counts().unstack(fill_value=0)
    emotion_by_cluster = emotion_table.to_dict()
    spokesperson_counts = top_five('spokespersons')
    bias_counts = top_five('biases')

    # Ask the model for the written report and show it.
    report = generate_report(
        most_common,
        cluster_counts,
        emotion_by_cluster,
        spokesperson_counts,
        bias_counts,
    )
    print("\nGPT-Generated Media Monitoring Report:")
    print(report)

    # Summarise every cluster from its article titles.
    print_lines = describe_clusters(df)
    print("\nCluster Descriptions:")
    for cluster_id in print_lines:
        print(f"Cluster {cluster_id}: {print_lines[cluster_id]}")

# Load the DataFrame from the CSV file generated by the original script.
df = pd.read_csv("Media_Monitoring_Report.csv")
# Drop repeated articles before reporting. The previous code deduplicated on a
# 'Hotels' column, which this dataset does not have (KeyError), and did so
# before the CSV was loaded, so the result was thrown away. 'link' identifies
# an article in this data.
df = deduplicated_df = df.drop_duplicates(subset='link')
analyze_dataframe(df)
