<a href="https://colab.research.google.com/github/ktynski/Marketing_Automations_Notebooks_With_GPT/blob/main/Automatic_Content_Gap_Report_and_Analysis_With_Clustering_and_Cluster_Descriptions_(Public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai
!pip install transformers
!pip install kneed

In [None]:
import requests
import pandas as pd
import openai
import csv
from io import StringIO
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
import torch
import matplotlib.pyplot as plt
from kneed import KneeLocator
from transformers import GPT2Tokenizer


# Define API keys and endpoints.
# Both key strings below are placeholders and must be replaced before running.
# NOTE(review): consider reading the keys from environment variables so real
# credentials are never saved inside the notebook.
SEMRUSH_API_KEY = "Your SEMRush API Key"
openai.api_key = "Your Openai API Key"
# Base URL of the SEMrush API.
SEMRUSH_API_ENDPOINT = "https://api.semrush.com/"



def analyze_user_website_content(domain, min_difficulty, max_difficulty, num_keywords, min_position, max_position):
    """Fetch a domain's organic keywords from SEMrush within difficulty/position bounds.

    Sends a ``domain_organic`` request (US database) and parses the
    semicolon-delimited response into a DataFrame. The difficulty and position
    bounds are sent to the API as a display filter and then re-applied locally.

    Args:
        domain: Domain to query.
        min_difficulty: Lower bound (inclusive) for keyword difficulty.
        max_difficulty: Upper bound (inclusive) for keyword difficulty.
        num_keywords: Maximum number of rows requested (``display_limit``).
        min_position: Lower bound (inclusive) for ranking position.
        max_position: Upper bound (inclusive) for ranking position.

    Returns:
        DataFrame with columns "Keyword", "Keyword Difficulty" and "Position",
        containing only rows inside both ranges. Empty when the response body
        is empty or no row has numeric values inside the ranges.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    columns = ["Keyword", "Keyword Difficulty", "Position"]
    params = {
        "type": "domain_organic",
        "key": SEMRUSH_API_KEY,
        "domain": domain,
        # NOTE(review): four column codes are requested but only three names
        # are given to read_csv below -- confirm the API returns three columns.
        "export_columns": "Ph,Kd,Po,Ps",  # Include "Position" column
        "database": "us",
        "display_limit": num_keywords,
        "display_filter": f"+|Kd|Bt|{min_difficulty}|{max_difficulty};+|Ps|Bt|{min_position}|{max_position}"
    }
    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and HTTP failures are surfaced instead of being parsed as keyword data.
    response = requests.get(SEMRUSH_API_ENDPOINT, params=params, timeout=30)
    response.raise_for_status()
    csv_data = response.text
    print(csv_data)
    if csv_data.strip():
        df = pd.read_csv(StringIO(csv_data), delimiter=';', names=columns)
    else:
        # read_csv raises EmptyDataError on an empty body; treat it as "no keywords".
        df = pd.DataFrame(columns=columns)
    # Convert the "Keyword Difficulty" and "Position" columns to numbers.
    # Non-numeric rows (such as any header or error line in the response)
    # become NaN and are dropped by the range filter below.
    df["Keyword Difficulty"] = pd.to_numeric(df["Keyword Difficulty"], errors='coerce')
    df["Position"] = pd.to_numeric(df["Position"], errors='coerce')
    # Filter based on keyword difficulty and position range
    df_filtered = df[(df["Keyword Difficulty"] >= float(min_difficulty)) &
                     (df["Keyword Difficulty"] <= float(max_difficulty)) &
                     (df["Position"] >= float(min_position)) &
                     (df["Position"] <= float(max_position))]

    return df_filtered






def retrieve_top_ranking_websites(topic_area, num_websites):
    """Fetch the domains ranking organically for a phrase from SEMrush.

    Sends a ``phrase_organic`` request (US database) asking only for the
    domain column.

    Args:
        topic_area: Search phrase whose ranking domains are requested.
        num_websites: Maximum number of domains requested (``display_limit``).

    Returns:
        DataFrame with a single "Domain" column; empty when the response holds
        nothing after its first line.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    params = {
        "type": "phrase_organic",
        "key": SEMRUSH_API_KEY,
        "phrase": topic_area,
        "export_columns": "Dn",
        "database": "us",
        "display_limit": num_websites
    }
    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and HTTP failures are surfaced instead of being parsed as domain data.
    response = requests.get(SEMRUSH_API_ENDPOINT, params=params, timeout=30)
    response.raise_for_status()
    try:
        # skiprows=1 discards the first line of the response (the column header).
        return pd.read_csv(StringIO(response.text), skiprows=1, names=["Domain"])
    except pd.errors.EmptyDataError:
        # Nothing left after the first line (e.g. an empty or one-line body):
        # report "no competitors" instead of crashing inside the parser.
        return pd.DataFrame(columns=["Domain"])

def identify_content_gaps(user_domain, topic_area, min_difficulty, max_difficulty, num_websites, num_keywords, user_position_range, competitor_position_range):
    """Return the keywords competitors rank for that are absent from the user's keyword set.

    The user's keywords and the competitor domain list are also written to
    'Keywords_from_user_domain.csv' and 'Keywords_from_competitor_domains.csv'.

    Args:
        user_domain: Domain whose keywords form the baseline.
        topic_area: Phrase used to look up the competitor domains.
        min_difficulty: Lower keyword-difficulty bound for every lookup.
        max_difficulty: Upper keyword-difficulty bound for every lookup.
        num_websites: Number of competitor domains to request.
        num_keywords: Number of keywords to request per domain.
        user_position_range: (min, max) ranking positions for the user's domain.
        competitor_position_range: (min, max) ranking positions for competitors.

    Returns:
        Set of keywords found for at least one competitor but not for the user.
    """
    # Baseline: the user's own keywords inside the user's position window.
    own_low, own_high = user_position_range
    own_df = analyze_user_website_content(
        user_domain, min_difficulty, max_difficulty, num_keywords, own_low, own_high
    )
    own_df.to_csv('Keywords_from_user_domain.csv')

    # Competitors are the domains ranking for the topic phrase.
    rivals_df = retrieve_top_ranking_websites(topic_area, num_websites)
    rivals_df.to_csv('Keywords_from_competitor_domains.csv')

    # Pool the keywords of every competitor into one set.
    rival_low, rival_high = competitor_position_range
    rival_keywords = set()
    for rival_domain in rivals_df["Domain"]:
        rival_df = analyze_user_website_content(
            rival_domain, min_difficulty, max_difficulty, num_keywords, rival_low, rival_high
        )
        rival_keywords |= set(rival_df["Keyword"].tolist())

    # The gap is whatever the competitors have and the user does not.
    return rival_keywords.difference(own_df["Keyword"].tolist())







def generate_embeddings(keywords, model_name="sentence-transformers/bert-base-nli-mean-tokens"):
    """Embed each keyword with a pretrained transformer model.

    Args:
        keywords: Iterable of keyword strings.
        model_name: Identifier passed to ``AutoTokenizer`` / ``AutoModel``.

    Returns:
        List with one NumPy array per keyword, in input order. Each array is
        the model's last hidden state averaged over dim 1 (presumably the
        token axis) and squeezed.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)

    def embed(text):
        # Keywords are encoded one at a time.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            hidden = encoder(**encoded).last_hidden_state
            return hidden.mean(dim=1).squeeze().numpy()

    return [embed(keyword) for keyword in keywords]

def perform_clustering(embeddings, min_clusters=10, max_clusters=39):
    """Cluster embeddings with KMeans, choosing the cluster count by the elbow method.

    KMeans is fitted for every candidate cluster count, the inertia curve is
    handed to ``KneeLocator``, and a final KMeans is fitted with the detected
    elbow.

    Args:
        embeddings: Sequence of embedding vectors, one per keyword.
        min_clusters: Smallest cluster count to try (default 10).
        max_clusters: Largest cluster count to try, inclusive (default 39).

    Returns:
        Array of cluster labels, one per embedding.

    Raises:
        ValueError: If ``embeddings`` is empty.
    """
    n_samples = len(embeddings)
    if n_samples == 0:
        raise ValueError("perform_clustering requires at least one embedding")

    # KMeans cannot fit more clusters than there are samples, so cap the
    # candidate range to the amount of data actually available.
    lowest = max(1, min(min_clusters, n_samples))
    highest = max(lowest, min(max_clusters, n_samples))
    cluster_range = list(range(lowest, highest + 1))

    inertia_values = []
    for n_clusters in cluster_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans.fit(embeddings)
        inertia_values.append(kmeans.inertia_)

    # Use the KneeLocator to automatically detect the elbow point. A curve
    # needs a few points to have an elbow, so skip detection on tiny ranges.
    n_clusters_optimal = None
    if len(cluster_range) >= 3:
        knee_locator = KneeLocator(cluster_range, inertia_values, curve="convex", direction="decreasing", S=0.3)
        n_clusters_optimal = knee_locator.elbow
    if n_clusters_optimal is None:
        # No elbow detected: fall back to the smallest candidate rather than
        # passing None to KMeans.
        n_clusters_optimal = lowest

    # Perform clustering with the optimal number of clusters
    kmeans_optimal = KMeans(n_clusters=int(n_clusters_optimal), random_state=0).fit(embeddings)
    return kmeans_optimal.labels_




def describe_clusters(clustered_keywords_df):
    """Ask the chat model for a short description of every keyword cluster.

    Args:
        clustered_keywords_df: DataFrame with 'Keyword' and 'Cluster' columns.

    Returns:
        Dict mapping each cluster id to the description text returned by the
        model (whitespace-stripped).
    """
    # NOTE(review): these task instructions are sent with the "assistant"
    # role; confirm whether they were meant to be part of the system message.
    guidance = "Provide a description for the given cluster. It is possible that some clusters have keywords that dont really seem to fit. Your job is to find the primary themes and disregard keywords that dont seem to fit and generate the description based on the keywords that most fit."

    def summarize(keyword_list):
        # One request per cluster, listing its keywords comma-separated.
        prompt = "Describe the topics related to the following keywords: " + ', '.join(keyword_list)
        reply = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an AI language model."},
                {"role": "assistant", "content": guidance},
                {"role": "user", "content": prompt},
            ],
            temperature=0.5,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        # The description is the text of the first returned choice.
        return reply["choices"][0]["message"]["content"].strip()

    return {
        cluster_id: summarize(members['Keyword'].tolist())
        for cluster_id, members in clustered_keywords_df.groupby('Cluster')
    }

def create_clustered_keywords_df(keywords, cluster_assignments):
    """Pair each keyword with its cluster label in a two-column DataFrame.

    Args:
        keywords: Sequence of keyword strings.
        cluster_assignments: Sequence of cluster labels aligned with ``keywords``.

    Returns:
        DataFrame with columns 'Keyword' and 'Cluster'.
    """
    columns = {
        'Keyword': keywords,
        'Cluster': cluster_assignments,
    }
    return pd.DataFrame(columns)


def generate_cluster_analysis_report(keywords,user_domain):
    """Generate an SEO content-gap report with GPT-4, save it, print it and return it.

    The report text is written to "cluster_analysis_report.txt" (UTF-8) in the
    current working directory.

    Args:
        keywords: Iterable of keyword strings from the content gap analysis;
            they are joined with ", " and embedded in the prompt.
        user_domain: Domain the recommendations should be tailored to.

    Returns:
        The generated report text, stripped of surrounding whitespace.
    """
    # Prepare the instructions (sent as the system message), ending with the keyword list
    input_message = f"You are an expert SEO analyst with extensive experience in analyzing keyword groups and making recommendations based on keyword gap analysis data which will be provided. and creating highly informative and actionable SEO keyword gap and content strategy reports for clients. I have conducted a content gap analysis for a brand and identified a list of keywords that represent potential content opportunities. I would like you to provide an overall analysis and breakdown of the keyword clusters based on their topical and semantic relevance. Please create a comprehensive SEO report that includes the following sections:\n 1. Introduction: Provide an overview of the content gap analysis and its importance for the brand.\n 2. Keyword Clusters: Identify and describe the main topical clusters present in the list of keywords. Give a few representative keywords but dont list too many. Explain the relevance and potential impact of each cluster for the brand's SEO strategy.\n 3. Recommendations: Provide specific content creation and optimization recommendations for each keyword cluster. Explain how these recommendations can help the brand improve its search visibility and attract more organic traffic.\n 4. Conclusion: Summarize the key findings and recommendations of the report. And give any/all additional thoughts that might be useful or helpful based on the learnings from the gap analysis.\n Please present the information in a clear and professional manner, as though it were a client SEO report. Please also at the end provide a list of 25 article idea titles that would help most in improving where there are gaps and also would make useful and interesting articles that spark curiosity. Here is the list of keywords from the content gap analysis:\n {', '.join(keywords)}\n"

    # Make the GPT-4 API call
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": input_message},
            {"role": "user", "content": f"Please think step by step and provide the most useful, detailed, thorough, and informative possible report. Please write each section and then put them all together to get somewhere around 1000 words if possible. Do not include the list of keywords found in the gap analysis individually, this takes up too much room. Keep in mind the domain you are giving recommendations for which is {user_domain}. the recommendations should make sense given this domain and not include recommendations that seem out of scope for the domain."}
        ],
        temperature=0.7,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    # Extract the generated report from the response
    report = response["choices"][0]["message"]["content"].strip()

    # Save the report to a text file. The encoding is explicit because model
    # output can contain non-ASCII characters that a platform's default
    # encoding may be unable to write.
    with open("cluster_analysis_report.txt", "w", encoding="utf-8") as file:
        file.write(report)

    # Print the report
    print(report)

    return report


def truncate_keywords_to_tokens(keywords_list, max_tokens, separator=", "):
    """Keep the longest prefix of keywords whose joined text fits a token budget.

    Tokens are counted with the GPT-2 tokenizer. Because the keywords are later
    joined with a separator before being sent to the model, the separator's
    tokens are counted too; otherwise the joined text would exceed
    ``max_tokens``.

    Args:
        keywords_list: Keywords in priority order.
        max_tokens: Maximum number of tokens allowed for the joined keywords.
        separator: String that will be placed between keywords when they are
            joined (default ", "). Pass "" to count keyword tokens only.

    Returns:
        List holding the leading keywords that fit; iteration stops at the
        first keyword that would exceed the budget.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    truncated_keywords = []
    current_tokens = 0
    for keyword in keywords_list:
        # Every keyword after the first is preceded by the separator in the
        # joined text, so tokenize them together to get the real cost.
        text = separator + keyword if truncated_keywords else keyword
        keyword_tokens = len(tokenizer.tokenize(text))
        if current_tokens + keyword_tokens > max_tokens:
            break
        truncated_keywords.append(keyword)
        current_tokens += keyword_tokens
    return truncated_keywords







# ---- Run configuration -------------------------------------------------
# The trailing #input(...) comments show the interactive prompt each
# hard-coded value stands in for.

# NOTE(review): this is a full URL (scheme and trailing slash); confirm the
# SEMrush `domain` parameter accepts it rather than a bare domain name.
user_domain = "https://warbirdfishinggear.com/"               #input("Please enter your website domain: ")

topic_area = "fishing gear"             #input("Please enter the topic area you want to analyze ")

min_difficulty = 0                      #input("Please enter the minimum keyword difficulty (0-100) ")

max_difficulty = 100                    #input("Please enter the maximum keyword difficulty (0-100) ")

num_websites = 4                        #input("Enter the number of competitor websites to consider ")

num_keywords = 500                 #input("Enter the number of keywords to consider from each competitor")

user_position_range = (20, 100)         # Position range for the user's site

competitor_position_range = (1, 20)     # Position range for the competitors' sites



# Find the content gaps between your domain and your competitors' domains
content_gaps = identify_content_gaps(user_domain, topic_area, min_difficulty, max_difficulty, num_websites, num_keywords, user_position_range, competitor_position_range)
print(content_gaps)

# Convert the content_gaps set to a list
content_gaps_list = list(content_gaps)

# Truncate to make sure it can fit in the GPT context window
max_tokens = 4400
content_gaps_list = truncate_keywords_to_tokens(content_gaps_list, max_tokens)


# Generate a content gap report authored by GPT (also saved to a text file and printed)
generate_cluster_analysis_report(content_gaps_list, user_domain)

# Generate embeddings for the (truncated) keyword list
embeddings = generate_embeddings(content_gaps_list)

# Perform clustering on the embeddings; yields one cluster label per keyword
cluster_assignments = perform_clustering(embeddings)

# Create the DataFrame with keywords and cluster assignments
clustered_keywords_df = create_clustered_keywords_df(content_gaps_list, cluster_assignments)

# Get unique cluster IDs (not referenced again in this cell)
unique_cluster_ids = clustered_keywords_df['Cluster'].unique()

# Generate descriptions for each cluster
cluster_descriptions = describe_clusters(clustered_keywords_df)

# Add cluster descriptions to the DataFrame for each keyword
clustered_keywords_df['Cluster Description'] = clustered_keywords_df['Cluster'].map(cluster_descriptions)

# Print the DataFrame
print(clustered_keywords_df)

# Save the clustered and labeled dataset
clustered_keywords_df.to_csv("Content_Gap_Keywords_Clustered.csv")









In [None]:
# Show the clustered keyword table as this cell's output.
clustered_keywords_df