In [None]:
!pip install --upgrade --quiet openai
!pip install --upgrade --quiet plotly

## Module Imports

In [None]:
import ast
import os

import numpy as np
import openai
import pandas as pd
import plotly.express as px
from openai.embeddings_utils import cosine_similarity
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, Birch, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score as sc
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

---------------------------

In [None]:
# Never store a real key in the notebook: it is read from the OPENAI_API_KEY environment variable.
openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    # Fail here with a clear message instead of at the first embedding request.
    raise RuntimeError('OPENAI_API_KEY is not set; export it before starting Jupyter.')

## Utility Functions

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by single spaces before the request is made.

    Args:
        text: The string to embed.
        model: Name of the embedding model to query.

    Returns:
        The ``'embedding'`` entry of the first item in the response's ``'data'`` list.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

def cv_silhouette_scorer(estimator, X):
    """Score a clustering estimator by the silhouette coefficient of its labels.

    The ``(estimator, X)`` call shape lets this be passed as ``scoring`` to a
    scikit-learn search object when no target ``y`` is supplied.

    Args:
        estimator: Clustering estimator exposing ``fit(X)`` and, once fitted, ``labels_``.
        X: Samples to cluster: a DataFrame, a NumPy array, or a plain sequence of rows.

    Returns:
        ``-1`` when the silhouette coefficient is undefined (a single label, or one
        label per sample); otherwise the silhouette score of ``X`` under the labels.
    """
    estimator.fit(X)
    cluster_labels = estimator.labels_
    num_labels = len(set(cluster_labels))
    # Count rows without assuming pandas: the former len(X.index) broke on arrays and lists.
    num_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    if num_labels == 1 or num_labels == num_samples:
        return -1
    return sc(X, cluster_labels)
        
def search_docs(df, embedding_column, product_description, n=10):
    """Return the ``n`` rows of ``df`` whose embeddings are most similar to a query.

    The query text is embedded with ``text-embedding-ada-002`` and compared by
    cosine similarity against every value in ``df[embedding_column]``.

    Args:
        df: DataFrame holding one embedding vector per row.
        embedding_column: Name of the column that contains the embedding vectors.
        product_description: Query text to embed and compare against.
        n: Maximum number of rows to return.

    Returns:
        A new DataFrame with an added ``similarities`` column, sorted by that
        column in descending order and truncated to ``n`` rows. ``df`` itself is
        not modified.
    """
    embedding = get_embedding(product_description, model='text-embedding-ada-002')
    similarities = df[embedding_column].apply(lambda x: cosine_similarity(x, embedding))
    # assign() returns a copy, so the caller's frame does not silently gain a column.
    scored = df.assign(similarities=similarities)
    return scored.sort_values('similarities', ascending=False).head(n)

---------------

In [None]:
# List csvs in the cwd.
# 'keywords_with_embeddings.csv' is written into this same directory further down the notebook;
# it is derived data, so it is skipped to keep a re-run from feeding the output back in as input.
csvs = [f for f in os.listdir() if f.endswith('.csv') and f != 'keywords_with_embeddings.csv']
df = pd.concat([ pd.read_csv(csv) for csv in csvs])

In [None]:
# De-duplicate the keywords
df = df.drop_duplicates(subset=['Keyword'])

# Note - Get the embeddings for each keyword - This code takes a while to run depending on the number of keywords:
# Every keyword is unique after the de-duplication above, so the embeddings are collected in row
# order and assigned in one step. This avoids re-scanning the whole frame for each keyword and
# avoids writing strings into a column that was first created as float NaN.
embeddings = []
for i, item in enumerate(df['Keyword'], start=1):
    # Stored as the string form of the list so that it survives the round trip through CSV.
    embeddings.append(str(get_embedding(item)))
    print("Produced embedding for: ", i, item)

df['embedding'] = embeddings

df.to_csv('keywords_with_embeddings.csv', index=False)

Produced embedding for:  0 search engine optimisation
Produced embedding for:  1 search engine optimisation consultants
Produced embedding for:  2 search engine optimisation preston
Produced embedding for:  3 search engine optimisation consulting
Produced embedding for:  4 search engine optimisation consultancy
Produced embedding for:  5 search engine optimisation consultant
Produced embedding for:  6 search engine optimisation lytham
Produced embedding for:  7 search engine optimisation company
Produced embedding for:  8 search engine optimisation london
Produced embedding for:  9 targeted seo liverpool - search engine optimisation
Produced embedding for:  10 targeted seo services limited - search engine optimisation
Produced embedding for:  11 search engine optimisation companies
Produced embedding for:  12 search engine optimisation agencies
Produced embedding for:  13 search engine optimisation consultants london
Produced embedding for:  14 what is search engine optimisation
Produc

In [None]:
# UPDATE THIS AND INCLUDE YOUR KEYWORDS HERE:
df = pd.read_csv('keywords_with_embeddings.csv')
# Convert the embeddings into the correct format:
# Each cell holds the string form of a list of floats. ast.literal_eval parses Python literals
# only, so unlike eval() it cannot execute code that found its way into the CSV.
df['embedding'] = df.embedding.apply(ast.literal_eval).apply(np.array)

--------------------------------------------------

In [None]:
# Keep only the rows that have an embedding, and of those rows keep only the embedding column:
has_embedding = df['embedding'].notna()
df = df.loc[has_embedding, ['embedding']]

In [None]:
# Split the embedding into one numeric column per dimension (text-embedding-ada-002 vectors
# have 1536 dimensions, not 768). Building the frame once from the list of vectors gives the
# same integer-labelled columns as apply(pd.Series) without creating a Series per row.
df = pd.DataFrame(df['embedding'].tolist(), index=df.index)

In [None]:
# Create a list of the clustering algorithms you want to include in the grid search
clustering_algorithms = [KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN, Birch, SpectralClustering]

# Create a range of clusters from 2 - 20 (inclusive).
# A single cluster is left out: the silhouette coefficient is undefined for it, so
# cv_silhouette_scorer would always return -1 and the sampled candidate would be wasted.
n_clusters = list(range(2, 21))

# Define the hyperparameter grids for each algorithm
param_grids = {
    KMeans: {
        'n_clusters': n_clusters,
        'max_iter': [100, 300, 500, 1000],
        'tol': [1e-4, 1e-3, 1e-2]
    },
    MiniBatchKMeans: {
        'n_clusters': n_clusters,
        'max_iter': [100, 300, 500, 1000],
        'tol': [1e-4, 1e-3, 1e-2]
    },
    AgglomerativeClustering: {
        'n_clusters': n_clusters,
        'linkage': ['ward', 'complete', 'average']
    },
    DBSCAN: {
        'eps': [0.1, 0.5, 1.0, 2.0],
        'min_samples': [2, 5, 10]
    },
    Birch: {
        'threshold': [0.1, 0.5, 1.0, 2.0],
        'branching_factor': [50, 100, 200]
    },
    SpectralClustering: {
        'n_clusters': n_clusters,
        'gamma': [0.1, 0.5, 1.0, 2.0]
    }
}

# Create a list to store the best models and scores
# (entries are appended in the same order as clustering_algorithms)
best_models = []
best_scores = []
all_models = []

# Iterate over the clustering algorithms
for algorithm in clustering_algorithms:
    print("Running grid search for: ", algorithm.__name__)

    # Get the hyperparameter grid for the current algorithm
    grid = param_grids[algorithm]

    # https://stackoverflow.com/questions/44636370/scikit-learn-gridsearchcv-without-cross-validation-unsupervised-learning
    # One "fold" whose train and test slices both cover every row: there is no held-out data
    # in this unsupervised setting, so each candidate is fitted and scored on the full frame.
    cv = [(slice(None), slice(None))]
    # random_state fixes which parameter combinations are sampled, so a re-run tries the same candidates.
    search = RandomizedSearchCV(estimator=algorithm(), param_distributions=grid,
    n_iter=30, scoring=cv_silhouette_scorer, cv=cv, n_jobs=-1, verbose=1,
    random_state=42)
    
    # Fit the search object to the data
    search.fit(df)

    # Add all of the models
    all_models.append(search.cv_results_)
    
    # Get the best model and score
    best_model = search.best_estimator_
    best_score = search.best_score_
    
    # Append the best model and score to the lists
    best_models.append(best_model)
    best_scores.append(best_score)

Fitting 1 folds for each of 10 candidates, totalling 10 fits
Fitting 1 folds for each of 10 candidates, totalling 10 fits
Fitting 1 folds for each of 10 candidates, totalling 10 fits
Fitting 1 folds for each of 10 candidates, totalling 10 fits
Fitting 1 folds for each of 10 candidates, totalling 10 fits




Fitting 1 folds for each of 10 candidates, totalling 10 fits




In [None]:
# One row per searched algorithm: its best fitted estimator next to that estimator's score.
results = pd.DataFrame(dict(model=best_models, score=best_scores))

--------------------------------------------------------

In [None]:
# Re-create the original dataframe.
# 'keywords_with_embeddings.csv' sits in the same directory but is derived data written by this
# notebook, so it is skipped rather than concatenated with the keyword exports.
csvs = [f for f in os.listdir() if f.endswith('.csv') and f != 'keywords_with_embeddings.csv']
final_df = pd.concat([ pd.read_csv(csv) for csv in csvs])
final_df = final_df.drop_duplicates(subset=['Keyword'])

# Apply the clusters to the original dataframe:
# Take the highest-scoring model across all algorithms instead of always the first (KMeans) entry.
best_model = best_models[int(np.nanargmax(best_scores))]
# Each best estimator was refit on df by the search, so labels_ has one label per row of df.
# labels_ is used instead of predict() because several of the searched estimators have no predict().
cluster_labels = best_model.labels_
if len(cluster_labels) != len(final_df):
    raise ValueError(
        f"Got {len(cluster_labels)} cluster labels for {len(final_df)} keywords; "
        "the embeddings frame and the keyword CSVs are out of sync."
    )
final_df['cluster'] = cluster_labels

-----------------------------------------------------------------

## Visualising The Clusters in 2D using PCA

In [None]:
# Run PCA on the embeddings and project them onto the first two components.
# random_state keeps the projection repeatable if scikit-learn picks its randomized solver.
pca = PCA(n_components=2, random_state=42)
pca_embeddings = pca.fit_transform(df.values)

# Create a dataframe with the PCA embeddings:
pca_df = pd.DataFrame(pca_embeddings, columns=['x', 'y'])

# Add the cluster labels to the dataframe.
# Cluster ids are categories, not quantities: as strings plotly gives each cluster its own
# discrete colour instead of placing the ids on a continuous colour scale.
plot_labels = final_df['cluster'].tolist()
pca_df['cluster'] = [str(label) for label in plot_labels]
pca_df['keyword'] = final_df['Keyword'].tolist()

# Plot the clusters using plotly (legend entries in numeric cluster order):
fig = px.scatter(pca_df, x='x', y='y',
                 color='cluster',
                 category_orders={'cluster': [str(label) for label in sorted(set(plot_labels))]},
                 hover_data=['keyword'],
                 title='Keyword clusters (2D PCA projection of the embeddings)')
fig.show()

-----------------------------------------------------------------

## Finding Semantically Close Queries To Our Existing Embeddings

To find the most relevant keywords (the "documents" in this search), we calculate the cosine similarity between the embedding vector of the query and the embedding vector of each keyword. The keywords with the highest scores are then returned.

In [None]:
similarity_df = pd.read_csv('keywords_with_embeddings.csv')
# Convert the embeddings into the correct format:
# Each cell holds the string form of a list of floats. ast.literal_eval parses Python literals
# only, so unlike eval() it cannot execute code that found its way into the CSV.
similarity_df['embedding'] = similarity_df.embedding.apply(ast.literal_eval).apply(np.array)

In [None]:
# Ask for the ten stored keywords whose embeddings are closest to the query 'seo':
vector_recommendations = search_docs(
    df=similarity_df,
    embedding_column='embedding',
    product_description='seo',
    n=10,
)

In [None]:
# Show the rows returned by search_docs for the query above (ordered by 'similarities', highest first).
vector_recommendations

Unnamed: 0,#,Keyword,Country,Difficulty,Volume,CPC,CPS,Parent Keyword,Last Update,SERP Features,Global volume,Traffic potential,embedding,similarities
2046,47,seo in website development,gb,60.0,100.0,,,seo in website design,2022-12-24 01:59:10,"Featured snippet,People also ask,Sitelinks",600.0,1200.0,"[0.0070015136152505875, 0.0027233855798840523,...",0.881624
2109,110,seo and website development,gb,,40.0,,,,,,200.0,,"[0.006953902076929808, -0.0012911978410556912,...",0.880518
2035,36,seo website development,gb,61.0,100.0,,,seo in website design,2022-12-21 21:03:19,"Top ads,Paid sitelinks,Featured snippet,People...",600.0,1100.0,"[0.0025481118354946375, -0.001251719193533063,...",0.878974
21,22,seo search engine optimisation,gb,96.0,200.0,25.0,,seo,2022-12-09 14:26:25,"Sitelinks,People also ask,Knowledge panel,Videos",1200.0,9600.0,"[-0.002835733350366354, 0.008205669932067394, ...",0.87827
1069,70,seo for digital marketing,gb,55.0,600.0,,,digital seo marketing,2022-12-22 08:48:00,"Top ads,Featured snippet,Thumbnail,People also...",1400.0,1200.0,"[-0.017512407153844833, 0.007726061623543501, ...",0.876107
1034,35,seo in digital marketing,gb,85.0,600.0,,,seo,2022-12-24 21:23:23,"Featured snippet,People also ask,Video preview",5000.0,4600.0,"[-0.003288090694695711, 0.0043290103785693645,...",0.875715
1677,678,seo and digital marketing,gb,57.0,100.0,20.0,,digital seo marketing,2022-12-27 20:06:20,"Top ads,Paid sitelinks,Featured snippet,People...",1300.0,1800.0,"[-0.008942040614783764, 0.0029922204557806253,...",0.872923
1019,20,seo digital marketing,gb,55.0,700.0,13.0,1.06,seo,2022-12-23 22:37:19,"Top ads,Paid sitelinks,Featured snippet,People...",6400.0,4300.0,"[-0.019887562841176987, 0.009596429765224457, ...",0.87021
946,947,seo agency search engine optimisation,gb,,,,,,,,40.0,,"[0.0017183320596814156, -0.0032378067262470722...",0.869075
20,21,search engine optimisation seo,gb,97.0,200.0,10.0,,seo,2022-12-16 21:24:51,"Thumbnail,People also ask,Knowledge panel,Site...",1300.0,9700.0,"[-0.013293679803609848, 0.0074667735025286674,...",0.863854
