In [None]:
!pip install pandas openai tiktoken langchain scikit-learn

Collecting openai
  Downloading openai-1.16.2-py3-none-any.whl (267 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.1.15-py3-none-any.whl (814 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m814.5/814.5 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0

In [None]:
from google.colab import userdata

# Fail fast if the LLMPROXY_JWT secret is empty: the API key configured in the
# next cell is built from it, so nothing downstream can authenticate without it.
assert userdata.get("LLMPROXY_JWT"), (
    "LLMPROXY_JWT is empty - add it to this notebook's Colab secrets and "
    "grant the notebook access before running the remaining cells."
)

In [None]:
import os
# Configure the OpenAI client through environment variables: the API key is
# the JWT secret followed by ':llmproxy-workshop', and the base URL is the
# LLM proxy endpoint.
os.environ.update({
    'OPENAI_API_KEY': f"{userdata.get('LLMPROXY_JWT')}:llmproxy-workshop",
    'OPENAI_API_BASE': 'https://gramener.com/llmproxy/v1/',
})

In [None]:
!pip install langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.1.2-py3-none-any.whl (33 kB)
Installing collected packages: langchain-openai
Successfully installed langchain-openai-0.1.2


In [55]:
import numpy as np
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings.cache import CacheBackedEmbeddings
from langchain.storage.file_system import LocalFileStore

# One embedding model backs both the cache wrapper and its namespace, so the
# cached vectors are always keyed by the model that actually produced them.
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-large")
base = embeddings_model

# `namespace` must be a string (it becomes part of every cache key), so pass
# the model name rather than the embeddings object itself.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    base,
    LocalFileStore('.embeddings/'),
    namespace=embeddings_model.model,
)

In [56]:
# A tiny toy corpus: two fruits, three abstract concepts and two countries.
docs = [
    "Apple",
    "Banana",
    "Friendship",
    "Love",
    "Hate",
    "Australia",
    "Japan",
]

In [57]:
# Quick inspection of the cache-backed wrapper: only its type is displayed.
# The actual embedding call is left disabled in this cell:
#cached_embeddings.embed_documents(docs)
type(cached_embeddings)

In [58]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
# Use the maintained `langchain_openai` integration (already imported earlier
# in this notebook) rather than the deprecated
# `langchain_community.embeddings.OpenAIEmbeddings`.
from langchain_openai import OpenAIEmbeddings

# File-backed byte store: cached embeddings are written under ./my_cache
store = LocalFileStore('./my_cache')

base = OpenAIEmbeddings(model="text-embedding-3-large")
# Use the model name as the cache namespace so vectors produced by different
# models are stored under different keys.
embedder = CacheBackedEmbeddings.from_bytes_store(base, store, namespace=base.model)

# Embedding is computed and cached
check = embedder.embed_documents(["hello", "goodbye"])


In [59]:
import numpy as np

# Dimensions of the embedding result: (number of texts, embedding size)
np.shape(check)

(2, 3072)

In [60]:
# Embed every document and stack the vectors into one 2-D array
doc_vectors = embedder.embed_documents(docs)
docs_embed = np.array(doc_vectors)
docs_embed.shape

(7, 3072)

In [61]:
# Print the raw embedding matrix; NumPy abbreviates the large array with "..."
print(docs_embed)


[[-3.33855659e-02  2.17133620e-02  8.66097432e-05 ... -3.17715057e-02
  -2.48905130e-02 -4.15408220e-03]
 [-1.20679058e-02 -1.67659728e-02 -9.91087977e-03 ... -1.43148072e-02
  -1.33915349e-02  3.79113624e-03]
 [-8.25633109e-03  1.34873129e-02  7.60034600e-04 ... -1.43239321e-02
  -3.37605365e-03  2.46971636e-03]
 ...
 [-3.79652649e-02 -2.82782508e-02  2.91099637e-03 ... -4.88019976e-03
   7.66073135e-03  7.30195329e-03]
 [ 1.22548710e-03 -8.18690697e-04  2.87255419e-04 ...  2.67731214e-03
  -2.77314895e-02  2.06681156e-02]
 [ 1.81074755e-02 -1.30573603e-02 -5.26813365e-03 ... -4.89948312e-03
   4.52290470e-03  1.10297835e-03]]


In [67]:
# Classify the documents into fruits, emotions, countries, etc. by scoring
# each document against each candidate topic with a dot product.
# Rows follow `docs`, columns follow `topics`. For reference:
# docs = ["Apple", "Banana", "Friendship", "Love", "Hate", "Australia", "Japan"]
topics = ["fruit", "emotion", "country", "company"]
topic_vectors = embedder.embed_documents(topics)
topics_embed = np.array(topic_vectors)
docs_embed @ topics_embed.T


array([[0.46873727, 0.24370414, 0.25654486, 0.40024876],
       [0.43734335, 0.17945705, 0.19750082, 0.1975892 ],
       [0.25555425, 0.29594017, 0.21358255, 0.21443824],
       [0.26847784, 0.3861965 , 0.23903684, 0.2783195 ],
       [0.2105847 , 0.27664136, 0.15870953, 0.14741642],
       [0.21283657, 0.16662015, 0.41211213, 0.2612594 ],
       [0.21657784, 0.2160761 , 0.37314938, 0.23280266]])

In [69]:
# Let's cluster these
import pandas as pd
from sklearn.cluster import BisectingKMeans

# The clusters are fairly intuitive, but membership is not stable:
# - Apple is sometimes grouped with the countries and sometimes with fruits
# - Banana is sometimes grouped with emotions
# - Love and Hate are sometimes joined, sometimes separate
cluster_model = BisectingKMeans(n_clusters=4, init='random').fit(docs_embed)
docs_by_cluster = pd.Series(docs).groupby(cluster_model.labels_)
# One list of documents per cluster label
docs_by_cluster.apply(list).tolist()

[['Banana'], ['Friendship', 'Love', 'Hate'], ['Japan'], ['Apple', 'Australia']]

In [70]:
# Let's run this on hotel reviews
import pandas as pd

# Load the hotel-reviews workbook and preview the first three rows transposed,
# so each column shows up as a row.
# NOTE(review): the path presumably requires Google Drive to be mounted at
# /content/drive - confirm before running on a fresh runtime.
REVIEWS_XLSX = "/content/drive/MyDrive/GenAI/Kaggle_Datafiniti_Hotel_Reviews.xlsx"
data = pd.read_excel(REVIEWS_XLSX)
data.head(3).transpose()

Unnamed: 0,0,1,2
id,AVwc252WIN2L1WUfpqLP,AVwc252WIN2L1WUfpqLP,AVwc252WIN2L1WUfpqLP
dateAdded,2016-10-30T21:42:42Z,2016-10-30T21:42:42Z,2016-10-30T21:42:42Z
dateUpdated,2018-09-10T21:06:27Z,2018-09-10T21:06:27Z,2018-09-10T21:06:27Z
address,5921 Valencia Cir,5921 Valencia Cir,5921 Valencia Cir
categories,"Hotels,Hotels and motels,Hotel and motel reser...","Hotels,Hotels and motels,Hotel and motel reser...","Hotels,Hotels and motels,Hotel and motel reser..."
primaryCategories,Accommodation & Food Services,Accommodation & Food Services,Accommodation & Food Services
city,Rancho Santa Fe,Rancho Santa Fe,Rancho Santa Fe
country,US,US,US
keys,us/ca/ranchosantafe/5921valenciacir/359754519,us/ca/ranchosantafe/5921valenciacir/359754519,us/ca/ranchosantafe/5921valenciacir/359754519
latitude,32.990959,32.990959,32.990959


In [71]:
 # Pick one state to analyze
# Keep only the rows whose province is 'LA', then report how many there are
province_mask = data['province'].eq('LA')
df = data[province_mask]
df.shape[0]

239

In [74]:
# Let's use the review title and text as the documents.
# A missing title or text would turn the whole concatenation into NaN
# (a float, not text), so treat missing values as empty strings and coerce
# any non-string cell to text before joining.
review_titles = df["reviews.title"].fillna("").astype(str)
review_texts = df["reviews.text"].fillna("").astype(str)
docs = (review_titles + ": " + review_texts).tolist()
docs_embed = np.array(embedder.embed_documents(docs))
docs_embed.shape

(239, 3072)

In [75]:
# Let's train a model to predict the rating based on the review text
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation. Both the split and the forest
# are seeded so the reported error is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    docs_embed, df['reviews.rating'], test_size=0.2, random_state=42
)
model = RandomForestRegressor(n_estimators=5, random_state=42)
model.fit(X_train, y_train)

In [76]:
# Let's see how good the model's predictions are
from sklearn.metrics import mean_absolute_error

# Predict ratings for the held-out reviews and measure the mean absolute error.
# The recorded run below shows an MAE of about 0.70, i.e. predictions are off
# by roughly 0.7 rating points on average.
preds = model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

0.7041666666666666

In [77]:
# Let's cluster them
from sklearn.metrics import silhouette_score, silhouette_samples

def cluster(docs, **kwargs):
    """Embed ``docs``, cluster them with BisectingKMeans and score each one.

    Args:
        docs: Texts to embed with the module-level ``embedder``.
        **kwargs: Passed straight through to the ``BisectingKMeans`` constructor.

    Returns:
        A DataFrame with one row per document and three columns: ``doc`` (the
        text), ``cluster`` (the assigned cluster label) and ``scores`` (the
        per-document silhouette value, i.e. how close a doc is to its own
        cluster versus its nearest other cluster).
    """
    model = BisectingKMeans(**kwargs)
    vectors = np.array(embedder.embed_documents(docs))
    labels = model.fit(vectors).labels_
    silhouettes = silhouette_samples(vectors, labels)
    return pd.DataFrame({"doc": docs, "cluster": labels, "scores": silhouettes})

In [78]:
# Cluster the reviews, then show the 3 highest-scoring (most representative)
# comments from each cluster
result = cluster(docs)
ranked = result.sort_values('scores', ascending=False)
ranked.groupby('cluster').apply(lambda group: group.head(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,doc,cluster,scores
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,79,"Older hotel, but a quality experience: The loc...",0,0.04256
0,54,"2-day stay: This is a beautiful, clean, well-m...",0,0.0393
0,63,I would return- to the 1st floor!: Stay downst...,0,0.033109
1,152,A respite from the bustle of Bourbon Street: W...,1,0.132588
1,137,Great Place in the Quarter: The Omni Royal Orl...,1,0.127539
1,163,Walk ANYWHERE in the French Quarter from this ...,1,0.124744
2,128,Newer hotel offers comfy beds and spacious roo...,2,0.055709
2,127,Mixed reviews: The service was pretty awful. C...,2,0.051442
2,87,"OH. HELL. NO.: Unacceptable, unprofessional be...",2,0.050114
3,8,Aloft Nola: We had a wonderful time. Aloft was...,3,0.08146


In [80]:
# Assign every review to whichever candidate topic it resembles most.
# An alternative topic list, left disabled:
# topics = ["Location", "Staff and service", "Cleanliness", "Amenities", "Food", "Room comfort", "Price/value", "Noise"]
topics = [
    "Location",
    "Check-in Experience",
    "Cleanliness",
    "In-Room Technology",
    "Food",
    "Room Views",
    "Price/value",
    "Child-Friendly Amenities",
]
topics_embed = np.array(embedder.embed_documents(topics))
# similarity[i, j] is the dot product of review i with topic j
similarity = docs_embed @ topics_embed.T
# Position of the best-matching topic for each review, mapped back to its name
best_topic_idx = similarity.argmax(axis=1)
result['topic'] = np.take(topics, best_topic_idx)
# Preview up to five reviews per topic
result.groupby('topic').apply(lambda group: group.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,doc,cluster,scores,topic
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Check-in Experience,1,What a great experience!: Great hotel. So much...,7,-0.028041,Check-in Experience
Check-in Experience,5,Southern Hospitality at its core: I am current...,3,0.038674,Check-in Experience
Check-in Experience,6,Fabulous First: It was our first visit to NOLA...,3,0.041527,Check-in Experience
Check-in Experience,7,Good hotel front desk could improve: We stayed...,2,0.017348,Check-in Experience
Check-in Experience,10,Good time in NOLA. Check your bill before you ...,2,-0.006231,Check-in Experience
Child-Friendly Amenities,0,Hotel with Personality!: Loved this place! If ...,0,0.003946,Child-Friendly Amenities
Child-Friendly Amenities,2,Great hotel!: Nice hotel with a great restaura...,7,0.022729,Child-Friendly Amenities
Child-Friendly Amenities,3,"Friendly, helpful staff and comfortable rooms!...",7,-0.008964,Child-Friendly Amenities
Child-Friendly Amenities,4,"As Local As You Can Get: The young, hip hotel ...",2,-0.026231,Child-Friendly Amenities
Child-Friendly Amenities,8,Aloft Nola: We had a wonderful time. Aloft was...,3,0.08146,Child-Friendly Amenities


In [81]:
# Reviews whose best topic match is still weak fit these categories least,
# so list the ten lowest-scoring ones -- the "outliers", so to speak
result['max_similarity'] = similarity.max(axis=1)
weakest = result.sort_values('max_similarity').head(10)
for score, topic, doc in zip(weakest['max_similarity'], weakest['topic'], weakest['doc']):
    print(score, topic, '--', doc)

0.1769847175615158 Room Views -- Best Western DeRidder, Louisiana is a great place to stay: We stayed at Best Western in DeRidder, LA as a group with a baseball team for a tournament. It was a nice place to stay and convenient location to where we needed to go. The rooms were clean and comfortable. The staff was really helpful and nice.
0.18980737698746558 Room Views -- Wonderful Old New Orleans Hotel: We had an amazing time at the Cornstalk Hotel. Ms.Sharon was wonderful. The hotel is a lovely old New Orleans style. It's cozy and romantic. I look forward to going back for another visit. We went for our 25th Anniversary and the staff went above and beyond to make it extra special. Thank you!
0.21224483608920375 Room Views -- Closed for remodel: From all outside appearance this facility is closed for remodeling as well as restaurants. Great location. Hope it is open on my next visit to NOLA. Lots of eating places in the area. Have to do minimum of 200 characters but not sure what more I