### Load Data

In [4]:
cols = ['user_id', 'movie_id', 'rating', 'timestamp']
train_data = pd.read_csv("ml-100k/u1.base",sep='\t', names=cols).drop(columns=['timestamp']).astype(int)
test_data = pd.read_csv("ml-100k/u1.test",sep='\t', names=cols).drop(columns=['timestamp']).astype(int)

In [5]:
n_users, n_items = 943,1682

In [29]:
train_data.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [19]:
class Matrixfactorization(nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = nn.Embedding( n_users,n_factors)
        self.item_factors = nn.Embedding( n_items,n_factors)
#         self.l1 = nn.Linear(n_factors*2,16)
#         self.l2 = nn.Linear(16,1)
#         self.drop = nn.Dropout()
        
    def forward(self, user, item):
        user = torch.LongTensor(user) - 1
        item = torch.LongTensor(item) - 1
        u,it = self.user_factors(user),self.item_factors(item)
#         x = torch.cat([u,it],dim=1)
#         x = F.relu(self.l1(x))
#         x = self.drop(x)
#         x = self.l2(x)
#         return x
        x = (u*it).sum(1)
        assert x.shape==user.shape
        return x * 5

In [20]:
model = Matrixfactorization(n_users,n_items)
opt = Adam(model.parameters(),lr=1e-3)
criterion = nn.L1Loss()
batch_size = 32

In [23]:
avg = []
mx = []
states = {}
model.train(True)
for e in range(20):
    for it in range(len(train_data)//batch_size):
        #---------------SETUP BATCH DATA-------------
        df = train_data.sample(frac=batch_size/len(train_data))
        users = df.user_id.values
        items = df.movie_id.values
        targets = torch.FloatTensor(df.rating.values)
        assert users.shape==(batch_size,)==items.shape
        
        #----------------TRAIN MODEL------------------------
        opt.zero_grad()
        preds = model(users,items)
        mx.append((preds.max().item(),preds.min().item()))
        loss = criterion(preds,targets)
        assert preds.shape==targets.shape
        loss.backward()
        opt.step()
        avg.append(loss.item())

#         if it%500==0:
#             print(f"Iter {it}: {sum(avg)/len(avg)}")

    print(f"Epoch {e+1}:",sum(avg)/len(avg))
    avg = []
    states[e+1] = model.state_dict()

Epoch 1: 16.061074742507934
Epoch 2: 12.612734846878052
Epoch 3: 9.837263370323182
Epoch 4: 7.819109057235718
Epoch 5: 6.2363761887550355
Epoch 6: 5.1391187036514285
Epoch 7: 4.314232008361817
Epoch 8: 3.5944116067886354
Epoch 9: 2.9559540061473846
Epoch 10: 2.3333496863365175
Epoch 11: 1.8750979673147201
Epoch 12: 1.5423662888288499
Epoch 13: 1.3099110254526138
Epoch 14: 1.143747090792656
Epoch 15: 1.0276083385705947
Epoch 16: 0.9569112715601921
Epoch 17: 0.8840741533756256
Epoch 18: 0.8514112945437431
Epoch 19: 0.8070766439795494
Epoch 20: 0.7789720355033874


In [24]:
preds.view(-1).size()

torch.Size([32])

In [25]:
with torch.no_grad():
    #model.load_state_dict(states[20])
    model.train(False)
    users = test_data.user_id.values
    items = test_data.movie_id.values
    test_data['pred'] = model(users,items).numpy()

In [26]:
mean_absolute_error(test_data.pred,test_data.rating)

1.768317251415737

In [27]:
test_data['pp'] = test_data.pred.clip(0,5).round()
print(test_data.rating.std(),test_data.pp.std(),test_data.pred.std())

1.153680085156602 1.4462609 3.137181


### LLM Recommendation

In [2]:
from datasets import load_dataset

dataset = load_dataset("xiyuez/red-dot-design-award-product-description")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['product', 'category', 'description', 'text'],
        num_rows: 21183
    })
})


In [4]:
df = dataset['train'].to_pandas()

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
model = SentenceTransformer('all-MiniLM-L6-v2')



In [7]:
# some sample sentences to convert to embeddings
sentences = [
  "This is an example sentence", 
  "Each sentence is converted into an embedding",
  "An embedding is nothing more than a large, numerical representation of the contents of a unit of text"
  ]

In [8]:
embeddings = model.encode(sentences)

In [9]:
display(embeddings)

array([[ 0.06765681,  0.06349584,  0.04871313, ...,  0.03807075,
         0.05996534, -0.04222878],
       [ 0.06446958,  0.01299917,  0.06469141, ...,  0.07068044,
         0.08054673, -0.05417692],
       [ 0.01413998, -0.03184138, -0.03253142, ...,  0.01166508,
         0.02897943, -0.02880162]], dtype=float32)

In [27]:
from llama_index.core import Document

df['combined'] = 'product: ' + df['product'] + ', category: ' + df['category'] + ', description: ' + df['description'] + ', text: ' + df['text']

documents = [
    Document(
        text=row['combined']
    )
    for _, row in df.iterrows()
]

### Weaviate Client

In [16]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

In [17]:
import weaviate

client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"), # Replace with your Weaviate Cloud URL
    auth_client_secret=weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_API_KEY")),  # Replace w/ your Weaviate instance API key
    additional_headers={"X-Cohere-Api-Key": os.environ['COHERE_API_KEY'],}
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [18]:
client.is_ready()

True

In [20]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

In [25]:
# construct vector store
vector_store = WeaviateVectorStore(
    weaviate_client=client, 
    index_name="LlamaIndex"
)

In [28]:
# if client.schema.exists("LlamaIndex"):
#     client.schema.delete_class("LlamaIndex")
    
# index = VectorStoreIndex(
#     documents,
#     storage_context = storage_context,
# )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
query_engine = index.as_query_engine()
response = query_engine.query("Can you recommend Mobile Computer?")

In [41]:
response.response

'I recommend the MC33 mobile computer for hard-working environments such as retail, warehouse management, and manufacturing. It offers a familiar keypad, fully functional touchscreen, and various configurations to suit different needs. The MC33 provides a high level of operating comfort with features like finger ledges for multiple grip points and an optional scan angle of 45 degrees for improved screen readability.'

In [37]:
from llama_index.core.memory import ChatMemoryBuffer

memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

In [38]:
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational retail shopping assistant. Use the following context including product names, descriptions, and keywords to show the shopper whats available, help find what they want, and answer any questions"
        "It's ok if you don't know the answer."
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous chat history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [39]:
response = chat_engine.stream_chat("gold-plated earrings")
for token in response.response_gen:
    print(token, end="")

Hello! Are you looking for gold-plated earrings in particular? We have a beautiful option called Miyu in the Needleworks Earrings category. The Miyu earrings are available in either silver or yellow gold, and they feature a modern design with geometric embellishments. Would you like more information about the Miyu earrings or help with anything else?

In [None]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
system_prompt = 'You are an AI assistant functioning as a recommendation system for an ecommerce website. Be specific and limit your answers to the requested format. Keep your answers short and concise.'

In [None]:
def get_user_prompt(ordered_list_of_items):

   # assemble user prompt
  prompt = None
  if len(ordered_list_of_items) > 0:
    items = ', '.join(ordered_list_of_items)
    prompt =  f"A user bought the following items: {items}. What next ten items would he/she be likely to purchase next?"
    prompt += " Express your response as a JSON object with a key of 'next_items' and a value representing your array of recommended items."
 
  return prompt

In [None]:
# get prompt and results
user_prompt = get_user_prompt(
    ['scarf', 'beanie', 'ear muffs', 'thermal underwear']
    )

print(user_prompt)

In [None]:
response = ChatCompletion.create(
  model='llama-2-70b-chat',
  messages=[
    {'role': 'system', 'content': system_prompt},
    {'role': 'user','content': user_prompt}
    ],
  max_tokens=128
  )
print(f'response.message:{response.message}')

In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding 
from llama_index.core.prompts import PromptTemplate
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import VectorStoreIndex, ServiceContext, download_loader
import accelerate
from llama_index.core.memory import ChatMemoryBuffer

In [7]:
MAX_TEXT_LENGTH=1024 

In [8]:
def auto_truncate(val):
    """Truncate the given text."""
    return val[:MAX_TEXT_LENGTH]

In [None]:
all_prods_df = pd.read_csv("product_data.csv", converters={
    'bullet_point': auto_truncate,
    'item_keywords': auto_truncate,
    'item_name': auto_truncate,
    'material': auto_truncate
})

In [2]:
product_df = pd.read_csv("dataset/product.csv", sep='\t')
product_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/product.csv'