[![Open in SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/jayyanar/gen-ai-labs-demos/blob/main/lab0-embedding/embedding_demo.ipynb)

In [1]:
! pip install gensim --quiet

zsh:1: command not found: pip


In [2]:
! pip install altair cohere pandas numpy vega --quiet

zsh:1: command not found: pip


# Example: Simple Words - Tokenize and Vectorize Using Word2Vec

In [3]:
from gensim.models import Word2Vec
from gensim.utils import tokenize

# Prepare a list of sentences (corpus); here each "sentence" is a single word
corpus = [
    "cat",
    "dog",
    "fruit",
    "banana"
]

# Preprocess the corpus by tokenizing the sentences
tokenized_corpus = [list(tokenize(sentence)) for sentence in corpus]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Get the vector representation of a word
word = "dog"
word_vector = model.wv[word]
print(f"Vector representation of '{word}':\n{word_vector}\n")

# Find similar words based on cosine similarity
similar_words = model.wv.most_similar(word)
print(f"Similar words to '{word}':")
for sim_word, sim_score in similar_words:
    print(f"{sim_word}: {sim_score}")

# Check the vocabulary size
vocabulary_size = len(model.wv)
print("\nVocabulary size:", vocabulary_size)

# Get the entire word vocabulary
vocabulary = model.wv.key_to_index
print("\nWord Vocabulary:")
# A separate loop name keeps the query `word` above from being overwritten.
for vocab_word in vocabulary:
    print(vocab_word)

ModuleNotFoundError: No module named 'gensim'

## Example Sentence - Tokenize and Vectorize

In [None]:
from gensim.models import Word2Vec
from gensim.utils import tokenize

# Prepare a list of sentences (corpus)
corpus = [
    "I love playing football",
    "Football is my favorite sport",
    "I enjoy watching football matches",
    "Soccer is popular worldwide"
]

# Preprocess the corpus by tokenizing the sentences.
# Lowercasing makes "Football" and "football" share one vocabulary entry, so
# the lowercase query word below is matched against every occurrence.
tokenized_corpus = [list(tokenize(sentence, lowercase=True)) for sentence in corpus]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Get the vector representation of a word
word = "football"
word_vector = model.wv[word]
print(f"Vector representation of '{word}':\n{word_vector}\n")

# Find similar words based on cosine similarity
similar_words = model.wv.most_similar(word)
print(f"Similar words to '{word}':")
for sim_word, sim_score in similar_words:
    print(f"{sim_word}: {sim_score}")

# Check the vocabulary size
vocabulary_size = len(model.wv)
print("\nVocabulary size:", vocabulary_size)

# Get the entire word vocabulary
vocabulary = model.wv.key_to_index
print("\nWord Vocabulary:")
# A separate loop name keeps the query `word` above from being overwritten.
for vocab_word in vocabulary:
    print(vocab_word)

In [None]:
! pip install cohere altair  --quiet

In [None]:
import cohere
import pandas as pd
import numpy as np
import altair as alt

# Loading credentials from `cred.json` - see https://github.com/jayyanar/gen-ai-labs-demos/tree/main for prerequisites

In [None]:
import json
# Path of the local JSON file that stores the credentials
filepath = "cred.json"

# Parse the credentials file; the context manager closes the file even if
# json.load raises on malformed content.
with open(filepath, 'r') as file:
    credentials = json.load(file)

# Load API Key
api_key = credentials['cohere_api_key']

In [None]:
# Create the Cohere client used for the embedding calls below,
# authenticated with the API key read from the credentials file.
co = cohere.Client(api_key)

## Let us try with a corpus of data from a CSV file

In [None]:
# Read the ATIS intents training CSV, supplying the two column names explicitly
df_orig = pd.read_csv(
    'https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/atis_intents_train.csv',
    names=['intent', 'query'],
)

# Draw a reproducible 12% sample and keep only three intent classes
sample_classes = ['atis_airfare', 'atis_airline', 'atis_ground_service']
df = (
    df_orig
    .sample(frac=0.12, random_state=30)
    .loc[lambda frame: frame['intent'].isin(sample_classes)]
)

# Remove the selected rows from the full set, then renumber the sample from 0
df_orig = df_orig.drop(index=df.index)
df = df.reset_index(drop=True)

# Keep the intent labels aside for later use and drop them from the working frame
intents = df['intent']
df = df.drop(columns=['intent'])
df.head()

In [None]:
def get_embeddings(texts, model='embed-english-v2.0'):
    """Embed a batch of texts with the module-level Cohere client `co`.

    Args:
        texts: The texts to embed, passed to `co.embed` as `texts`.
        model: Name of the embedding model, passed to `co.embed` as `model`.

    Returns:
        The `embeddings` attribute of the response returned by `co.embed`.
    """
    response = co.embed(texts=texts, model=model)
    return response.embeddings

In [None]:

# Embed every query and store the resulting vectors in a new column
df['query_embeds'] = get_embeddings(list(df['query']))
df.head()

In [None]:
# Reduce dimensionality using PCA
from sklearn.decomposition import PCA

def get_pc(arr, n):
    """Project `arr` onto its first `n` principal components.

    Args:
        arr: The data handed to `PCA.fit_transform`.
        n: Number of components, passed to `PCA` as `n_components`.

    Returns:
        The transformed data returned by `PCA.fit_transform`.
    """
    return PCA(n_components=n).fit_transform(arr)

In [None]:
# Collect the per-row embeddings into one NumPy array, then reduce it to
# 10 principal components
embeds = np.array(list(df['query_embeds']))
embeds_pc = get_pc(embeds, 10)

In [None]:
# Calculate cosine similarity between the search query and existing queries

from sklearn.metrics.pairwise import cosine_similarity

def get_similarity(target,candidates):
    """Rank candidate vectors by cosine similarity to a target vector.

    Args:
        target: A single embedding vector (1-D sequence of numbers).
        candidates: A sequence of embedding vectors, one per row. A single
            candidate row is supported.

    Returns:
        A `zip` iterator of `(index, score)` pairs ordered from most to least
        similar, where `index` is the row position within `candidates`.
        Because it is an iterator, it can be consumed only once.
    """
    # Turn list into array; the target becomes a one-row matrix
    candidates = np.array(candidates)
    target = np.expand_dims(np.array(target),axis=0)

    # Calculate cosine similarity. The result has one row (for the single
    # target); indexing that row always yields a list, whereas squeezing would
    # collapse a lone candidate's score to a bare float.
    sim = cosine_similarity(target,candidates)[0].tolist()

    # Sort indices by descending score
    sort_index = np.argsort(sim)[::-1]
    sort_score = [sim[i] for i in sort_index]
    similarity_scores = zip(sort_index,sort_score)

    # Return similarity scores
    return similarity_scores

In [None]:
# The query to search for
new_query = "show business fares"

# get_embeddings works on a list of texts, so wrap the query in a list and
# take the first (only) embedding from the result
new_query_embeds = get_embeddings(texts=[new_query])[0]

In [None]:
# Number of existing queries to compare against (and later plot). No earlier
# cell defines `sample`, so it is set here before its first use.
sample = 9

# Get the similarity between the search query and the first `sample` existing queries
similarity = get_similarity(new_query_embeds,embeds[:sample])

# View the compared queries, most similar first
print('Query:')
print(new_query,'\n')

print('Similar queries:')
for idx,sim in similarity:
    print(f'Similarity: {sim:.2f};',df.iloc[idx]['query'])

In [None]:
# Work on a copy and add the new query as a row at the next integer label
df_sem = df.copy()
df_sem.loc[len(df_sem)] = [new_query, new_query_embeds]

# Project every embedding, including the new query's, onto 2 principal components
embeds_sem = np.array(list(df_sem['query_embeds']))
embeds_sem_pc2 = get_pc(embeds_sem, 2)

# Attach the two components as extra columns next to the original data
df_sem_pc2 = pd.concat([df_sem, pd.DataFrame(embeds_sem_pc2)], axis='columns')

In [None]:
def generate_chart(df, xcol, ycol, lbl='on', color='basic', title=''):
    """Return an Altair scatter chart of `ycol` against `xcol`, one circle per row.

    Args:
        df: DataFrame containing `xcol`, `ycol`, a 'query' column and, when
            `color` names a column, that column too.
        xcol: Name of the column plotted on the x axis.
        ycol: Name of the column plotted on the y axis.
        lbl: 'on' draws each row's query text next to its point; any other
            value draws the points only.
        color: Name of the column used to colour the points, or 'basic' for
            a single fixed colour.
        title: Chart title.

    Returns:
        The Altair chart object (displayed when it is the cell's last expression).
    """
    columns = [xcol, ycol, 'query']
    if color != 'basic':
        columns.append(color)

    # Pass only the plotted columns so the embedding vectors are not
    # serialised into the chart specification.
    points = alt.Chart(df[columns]).mark_circle(size=200).encode(
        x=alt.X(xcol, scale=alt.Scale(zero=False)),
        y=alt.Y(ycol, scale=alt.Scale(zero=False)),
        color=alt.value('#333293') if color == 'basic' else color,
        tooltip=['query'],
    )
    if lbl != 'on':
        return points.properties(width=800, height=500, title=title)

    labels = points.mark_text(align='left', baseline='middle', dx=15).encode(
        text='query',
        color=alt.value('black'),
    )
    return (points + labels).properties(width=800, height=500, title=title)


# Create column for representing chart legend: every row is 'Existing'
# except the last one, which holds the new query
df_sem_pc2['Source'] = 'Existing'
df_sem_pc2.at[len(df_sem_pc2)-1, 'Source'] = "New"

# Plot on a chart: the first `sample` existing queries plus the new query (last row).
# `sample` must already have been set by an earlier cell.
df_sem_pc2.columns = df_sem_pc2.columns.astype(str)
selection = list(range(sample)) + [-1]
generate_chart(df_sem_pc2.iloc[selection],'0','1',color='Source',title='Semantic Search')

In [None]:
from sklearn.cluster import KMeans

# Build the frame of queries plus their first two principal components.
# No earlier cell creates `df_pc2`, so derive it here in the same way the
# semantic-search cell builds `df_sem_pc2`.
df_pc2 = pd.concat([df, pd.DataFrame(get_pc(embeds, 2))], axis=1)

# Pick the number of clusters
df_clust = df_pc2.copy()
n_clusters=2

# Cluster the embeddings; labels are stored as strings for use as a chart colour field
kmeans_model = KMeans(n_clusters=n_clusters, random_state=0)
classes = kmeans_model.fit_predict(embeds).tolist()
df_clust['cluster'] = (list(map(str,classes)))

# Plot on a chart. `generate_chart` and `sample` are expected to have been
# defined by earlier cells. The title follows `n_clusters` so it stays
# correct if the number of clusters is changed.
df_clust.columns = df_clust.columns.astype(str)
generate_chart(df_clust.iloc[:sample],'0','1',lbl='on',color='cluster',title=f'Clustering with {n_clusters} Clusters')