In [None]:
import pandas as pd 


# Load the vectorised crawl data from CSV.
# NOTE(review): presumably the Screaming Frog export that carries the
# 'openAiEmbeddings' column used further down -- confirm against the file.
vectored_data = pd.read_csv('frog_vectored_rent.csv')


def scann_search(dataset:np.ndarray, queries: np.ndarray, n_neighbors = 10, distance_measure = "dot_product", num_leaves = 2000, num_leaves_to_search = 100):
  """Build a ScaNN searcher over the L2-normalised rows of ``dataset``.

  Args:
    dataset: 2-D array with one embedding per row.
    queries: Not used when building the index; kept only so existing
      callers that pass it keep working.
    n_neighbors: Number of neighbours passed to the ScaNN builder.
    distance_measure: Distance measure name passed to the ScaNN builder.
    num_leaves: ``num_leaves`` forwarded to the builder's ``tree`` step.
    num_leaves_to_search: ``num_leaves_to_search`` forwarded to ``tree``.

  Returns:
    The searcher object produced by the builder's ``build()`` call.
  """
  row_norms = np.linalg.norm(dataset, axis=1, keepdims=True)
  # An all-zero row has norm 0; dividing by it would put NaN/inf into the
  # index. Dividing by 1 instead leaves such a row as a zero vector.
  safe_norms = np.where(row_norms == 0, 1.0, row_norms)
  normalized_dataset = dataset / safe_norms

  searcher = (
      scann.scann_ops_pybind.builder(normalized_dataset, n_neighbors, distance_measure)
      .tree(
          num_leaves=num_leaves,
          num_leaves_to_search=num_leaves_to_search,
          training_sample_size=250000,
      )
      .score_ah(2, anisotropic_quantization_threshold=0.2)
      .reorder(100)
      .build()
  )

  return searcher

def convert_scann_arrays_to_urls(arrays: np.array, df: pd.DataFrame,column):
    """Map arrays of positional row indices to the matching values of ``df[column]``.

    Args:
        arrays: Iterable of NumPy index arrays; each one is flattened and
            used as positional (``iloc``) row indices into ``df``.
        df: DataFrame to look the rows up in.
        column: Name of the column whose values are returned.

    Returns:
        A list with one entry per index array, each entry being the list of
        ``column`` values for that array's rows, in index order.
    """
    return [
        df.iloc[index_block.flatten()][column].tolist()
        for index_block in arrays
    ]

# Keep only the rows that actually have an embedding. `.copy()` makes the
# filtered frame independent so the column assignments below do not raise
# SettingWithCopyWarning.
# NOTE(review): siteDf is not defined in the cells visible here; vectored_data
# (loaded above) looks like the intended source -- confirm.
siteDf = siteDf[siteDf['openAiEmbeddings'].notna()].copy()

# Each embedding is stored as a comma-separated string; parse it into a float64 vector.
siteDf['openAiEmbeddingsAsFloats'] = siteDf['openAiEmbeddings'].str.split(',').apply(
    lambda parts: np.array(parts, dtype=np.float64)
)
siteDf['EmbeddingLength'] = siteDf['openAiEmbeddingsAsFloats'].apply(lambda vec: vec.size)

embedding_lengths = siteDf['EmbeddingLength'].unique()
if embedding_lengths.size == 1:
  d = int(embedding_lengths[0]) #Number of dimensions for each value
else:
  # np.vstack below cannot stack vectors of different lengths, so stop here
  # with the explanatory message instead of failing one line later.
  raise ValueError('Dimensionality reduction required to make all arrays the same size.')

# Build the matrix from siteDf (the frame the parsed column was added to) so
# row i of `dataset` is row i of siteDf; the later siteDf.iloc[...] lookups on
# ScaNN result indices rely on that alignment.
dataset = np.vstack(siteDf['openAiEmbeddingsAsFloats'].values)
queries = dataset

siteSearcher = scann_search(dataset, queries)
# NOTE(review): index_directory is not defined in the cells visible here -- confirm it is set earlier.
siteSearcher.serialize(index_directory+'/site_scann_index')

In [None]:
# Create embeddings for the SEMrush keywords to compare 1-to-1 with our data


# Function to get embeddings and flatten them for SCANN
def get_openai_embeddings(keyword):
    """Return the embedding of ``keyword`` as a flat NumPy array.

    Calls ``openai.embeddings.create`` with the "text-embedding-3-small"
    model and returns the embedding of the first item in the response.
    """
    request_args = {
        "input": keyword,
        # Make sure to use the same embeddings as Screaming Frog
        "model": "text-embedding-3-small",
    }
    api_result = openai.embeddings.create(**request_args)

    # Only the first result is read; flatten it for SCANN.
    first_item = api_result.data[0]
    return np.array(first_item.embedding).flatten()

semrushFile = 'semrush_keywords.csv'
keywordDf = read_file(semrushFile, 'CSV')
display(keywordDf)

# One embedding request per keyword, in row order, stored alongside the keyword.
embeddings = [get_openai_embeddings(term) for term in keywordDf['Keyword']]
keywordDf['embeddings'] = embeddings

# Excel export works on a copy whose embeddings are rendered as strings,
# leaving the numeric arrays in keywordDf untouched.
tempDf = keywordDf.copy()
tempDf['embeddings'] = tempDf['embeddings'].map(str)
tempDf.to_excel('semrush-embeddings.xlsx', index=False)

# Show the first rows of the updated DataFrame
print(keywordDf.head())

In [None]:
# keyword-to-keyword and keyword-to-page relationships 
# Wherever the highest ranking URL does not match the current landing page, that’s a linking opportunity for optimization.

# Stack the individual keyword embeddings vertically into one query matrix.
queries = np.vstack(keywordDf['embeddings'].values)

# The site index built earlier already covers `dataset` with the same
# parameters, so reuse it instead of training an identical index again.
kwSearcher = siteSearcher

# search_batched returns an (indices, distances) pair; only the indices can be
# used for row lookups, so unpack the pair rather than passing it on whole.
nearest_neighbors, nearest_distances = kwSearcher.search_batched(queries, final_num_neighbors=1)
matched_urls = convert_scann_arrays_to_urls(nearest_neighbors, siteDf, 'Address')

# Exactly one neighbour was requested per keyword; take it, or None if the
# search returned nothing for that keyword.
keywordDf['BestMatchURL'] = [urls[0] if urls else None for urls in matched_urls]

display(keywordDf)

In [None]:
# Function to normalize embeddings
def normalize_embeddings(embeddings):
    """Scale every row of a 2-D embedding matrix to unit L2 norm.

    Args:
        embeddings: 2-D array-like with one embedding per row.

    Returns:
        A list of lists holding the normalised rows. Rows whose norm is 0
        are returned unchanged (all zeros) rather than as NaN.
    """
    matrix = np.asarray(embeddings, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Dividing an all-zero row by its zero norm would yield NaN; divide by 1 instead.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return (matrix / safe_norms).tolist()  # Normalize and convert to list

# Normalize the embeddings and convert them to lists for DataFrame storage.
# The site vectors are read from 'openAiEmbeddingsAsFloats', the parsed-float
# column created on siteDf when the crawl data was prepared.
keywordDf['NormalizedEmbeddings'] = normalize_embeddings(np.vstack(keywordDf['embeddings'].values))
siteDf['NormalizedEmbeddings'] = normalize_embeddings(np.vstack(siteDf['openAiEmbeddingsAsFloats'].values))

# Function to calculate cosine similarity
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two 1-D vectors.

    The dot product is divided by the product of the two norms, so the
    result is a true cosine even when the inputs are not unit length. For
    unit-length inputs it equals the plain dot product up to rounding.

    Args:
        embedding1: First 1-D vector (array-like).
        embedding2: Second 1-D vector (array-like), same length as the first.

    Returns:
        The cosine similarity, or 0.0 when either vector has zero norm.
    """
    vec1 = np.asarray(embedding1, dtype=float)
    vec2 = np.asarray(embedding2, dtype=float)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # The angle to a zero vector is undefined; report "no similarity".
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Build the Address -> normalized-embedding lookup once, instead of scanning
# all of siteDf twice for every keyword. setdefault keeps the first row for a
# duplicated Address; missing addresses are skipped so they can never match.
url_to_embedding = {}
for address, embedding in zip(siteDf['Address'], siteDf['NormalizedEmbeddings']):
    if pd.notna(address):
        url_to_embedding.setdefault(address, embedding)

# Lists collecting one similarity / relevance value per keyword row
cosine_similarities = []
relevance_values = []

# Compare each keyword with the page it currently maps to (its 'URL' value)
for keyword_url, keyword_embedding in zip(keywordDf['URL'], keywordDf['NormalizedEmbeddings']):
    url_embedding = url_to_embedding.get(keyword_url)

    if url_embedding is not None:
        # Both embeddings are stored as lists; convert to arrays for the calculation
        similarity = cosine_similarity(np.array(keyword_embedding), np.array(url_embedding))
        relevance = similarity * 100
    else:
        # No page in siteDf has this URL
        similarity = None
        relevance = None

    cosine_similarities.append(similarity)
    relevance_values.append(relevance)

# Store the results in keywordDf
keywordDf['CosineSimilarity'] = cosine_similarities
keywordDf['Relevance'] = relevance_values


# Display or use the updated DataFrame
print(keywordDf[['Keyword', 'URL', 'CosineSimilarity','Relevance']])

keywordDf.to_excel('keyword-relevance.xlsx')

In [None]:
# Internal Linking 


# Search siteDf for keywords, return 10 neighbors per keyword

# Stack the individual keyword embeddings vertically into one query matrix.
queries = np.vstack(keywordDf['embeddings'].values)

# The site index built earlier already covers `dataset` with the same
# parameters, so reuse it instead of training an identical index again.
kwSearcher = siteSearcher
neighbors, distances = siteSearcher.search_batched(queries, leaves_to_search = 150)

# search_batched returns an (indices, distances) pair; unpack it so that only
# the index array is handed to the URL lookup.
nearest_neighbors, nearest_distances = kwSearcher.search_batched(queries, final_num_neighbors=5)

matched_urls = convert_scann_arrays_to_urls(nearest_neighbors, siteDf, 'Address')

# NOTE(review): the first neighbour of each keyword is dropped here. For keyword
# queries that is the closest page, not the query itself -- confirm this is intended.
keywordDf['InternalLinkSuggestions'] = [
    urls[1:] for urls in convert_scann_arrays_to_urls(neighbors, siteDf, 'Address')
]
display(keywordDf)

# Excel export works on a copy whose embeddings are rendered as strings
tempDf = keywordDf.copy()
tempDf['embeddings'] = tempDf['embeddings'].apply(lambda x: str(x))
tempDf.to_excel('keyword-internal-link-mapping.xlsx', index=False) # Save with embeddings as strings

In [None]:
# Extra inclusion 

# Run another Screaming Frog crawl using this custom JavaScript extraction snippet:
#//return seoSpider.data(document.body.innerText); //#
# we can merge with original dataset to understand how often same topics are 


In [None]:
def cluster_and_visualize_content(df, embeddings_col):
    """Cluster page content with BERTopic, plot the result, and export the labelled rows.

    Args:
        df: DataFrame with a 'Page Content' column and an embeddings column.
            It is modified in place: 'Page Content' is cast to str and the
            'topic' and 'topic_name' columns are added.
        embeddings_col: Name of the column holding one embedding per row.
            The rows are stacked with ``np.vstack``, so they must all have
            the same length.

    Side effects:
        Shows a t-SNE scatter plot and the BERTopic figures, displays ``df``
        and writes 'content-clusters-bertopic.xlsx'.
    """
    # Prepare data
    df['Page Content'] = df['Page Content'].astype(str)
    documents = df['Page Content'].tolist()
    embeddings = np.vstack(df[embeddings_col].tolist())  # Ensure embeddings are properly shaped
    # NOTE(review): `normalize` is not defined in the cells visible here
    # (presumably sklearn.preprocessing.normalize) -- confirm it is imported.
    embeddings = normalize(embeddings)  # Normalize embeddings for cosine similarity

    prompt = """
      I have topic that is described by the following keywords: [KEYWORDS]
      I am attempting to categorize this topic as part of 2-4 word taxonomy label that encapsulates all the keywords.
      Based on the above information, can you give a short taxonomy label of the topic? Just return the taxonomy label itself.
      """
    client = openai.OpenAI(api_key=openai.api_key)
    representation_model = OpenAI(client, model="gpt-3.5-turbo", prompt=prompt,chat=True)
    # Initialize BERTopic
    topic_model = BERTopic(representation_model=representation_model,calculate_probabilities=True)

    # Fit BERTopic on the documents using the precomputed embeddings
    topics, probabilities = topic_model.fit_transform(documents, embeddings)
    df['topic'] = topics  # Adding topic numbers to the DataFrame

    # Visualize the topics with t-SNE
    print("Reducing dimensions for visualization...")
    # t-SNE requires perplexity < number of samples; keep the library default
    # of 30 for normal inputs and shrink it only when there are too few rows.
    n_samples = embeddings.shape[0]
    perplexity = min(30.0, max(1.0, float(n_samples - 1)))
    tsne = TSNE(n_components=2, random_state=42, metric='euclidean', perplexity=perplexity)
    reduced_embeddings = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topics, cmap='viridis', s=50, alpha=0.6)
    plt.colorbar()
    plt.title('Content Topics Visualization with t-SNE')
    plt.xlabel('t-SNE Feature 1')
    plt.ylabel('t-SNE Feature 2')
    plt.show()

    # Probability distribution visualization (first document only)
    min_probability = 0.01
    if any(probabilities[0] > min_probability):
        print("Visualizing topic probabilities...")
        fig = topic_model.visualize_distribution(probabilities[0], min_probability=min_probability)
        fig.show()
    else:
        print("No topic probabilities above the threshold to visualize.")

    # Intertopic distance map
    print("Visualizing intertopic distance map...")
    fig = topic_model.visualize_topics()
    fig.show()

    # Hierarchical clustering
    print("Visualizing hierarchical clustering...")
    fig = topic_model.visualize_hierarchy()
    fig.show()

    # Extract and name topics: look each distinct topic id up once, then map
    # the names onto the rows, instead of calling get_topic twice per row.
    topic_names = {}
    for topic_id in set(topics):
        representation = topic_model.get_topic(topic_id)
        topic_names[topic_id] = representation[0][0] if representation else 'No dominant topic'
    df['topic_name'] = df['topic'].map(topic_names)

    # Display DataFrame with topic names
    display(df)

    # Export the DataFrame with topic labels
    df.to_excel('content-clusters-bertopic.xlsx', index=False)

pageContentDf = read_file('ipr-content.xlsx', 'Excel')

# Inner join keeps only the pages present in both the crawl data and the content export.
contentEmbeddingsDf = siteDf.merge(pageContentDf, on='Address', how='inner')

# Cluster on 'openAiEmbeddingsAsFloats', the parsed-float embeddings column
# created on siteDf when the crawl data was prepared.
cluster_and_visualize_content(contentEmbeddingsDf, 'openAiEmbeddingsAsFloats')