In [73]:
# Last amended: 25th May, 2024
# Run on jupyter notebook not colab. 
# laptop has ollama installed in WSL

Reference this [article](https://medium.com/data-science-in-your-pocket/recommendation-systems-using-langchain-and-llms-with-codes-d3c4c4e66732)

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Generate user_id and item_id data

In [41]:
# 1.0 Call libraries
import numpy as np
import pandas as pd

In [None]:
# 1.1 Define constants
#     Define the number of users and unique items
num_users = 1000
num_items = 20

In [42]:
# 1.2 Generate random user IDs and item IDs
user_ids = np.arange(1, num_users + 1)
item_ids = np.arange(1, num_items + 1)
user_ids[:10]
item_ids[:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [44]:
# 1.3 Create random interaction data
data = {
    # Select randomly from user_ids, num_users * 10 values
    'user_id': np.random.choice(user_ids, size=num_users * 10),
    # Select randomly from item_ids, num_users * 10 values
    'item_id': np.random.choice(item_ids, size=num_users * 10),
}
data

{'user_id': array([713, 623, 554, ..., 772, 568, 732]),
 'item_id': array([ 6,  2,  7, ..., 10,  4, 19])}

In [49]:
# 1.4 Transform to pandas DataFrame:
df = pd.DataFrame(data).drop_duplicates()

# 1.5 Display the first few rows of the generated data
df.head()

Unnamed: 0,user_id,item_id
0,713,6
1,623,2
2,554,7
3,996,10
4,620,8


In [50]:
# 1.6 Grouping all interactions by a user as list
df = df.groupby(['user_id'])['item_id'].agg(list).reset_index()
df.head()

Unnamed: 0,user_id,item_id
0,1,"[2, 10, 7, 14, 18]"
1,2,"[20, 8, 10, 6]"
2,3,"[11, 10, 3, 4, 18]"
3,4,"[13, 12, 20, 17]"
4,5,"[13, 8, 2, 18, 4, 3, 11, 10]"


In [51]:
# 1.7 Creating OHE 
df['item_id'] = df['item_id'].transform(lambda x: [0 if y+1 not in x else y+1 for y in range(20)])
df.head()

Unnamed: 0,user_id,item_id
0,1,"[0, 2, 0, 0, 0, 0, 7, 0, 0, 10, 0, 0, 0, 14, 0..."
1,2,"[0, 0, 0, 0, 0, 6, 0, 8, 0, 10, 0, 0, 0, 0, 0,..."
2,3,"[0, 0, 3, 4, 0, 0, 0, 0, 0, 10, 11, 0, 0, 0, 0..."
3,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 0..."
4,5,"[0, 2, 3, 4, 0, 0, 0, 8, 0, 10, 11, 0, 13, 0, ..."


In [52]:
# 1.8 save csv
df.to_csv('dummy_data.csv',index=False)

In [53]:
# 2.0
from langchain_community.llms import Ollama
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS


In [54]:
# 2.1
llm = Ollama(model="llama3")
llm

Ollama(model='llama3')

In [9]:
# 2.2 For pritning markdown text
#     StackOverflow: https://stackoverflow.com/a/32035217
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

In [8]:
# 2.3
output = llm.invoke("how can langsmith help with testing?")
printmd(output)

Langsmith, as a language model, can be used in various ways to assist with testing:

1. **Automated Testing**: Langsmith can generate test data and scenarios for automated testing frameworks like Selenium or Cypress. This saves developers time and reduces the need for manual testing.
2. **Test Data Generation**: Langsmith can create realistic test data (e.g., user input, product descriptions) to help populate test cases. This ensures that tests cover a wide range of scenarios and edge cases.
3. **Natural Language Processing (NLP) Testing**: Langsmith's NLP capabilities can be used to test the accuracy of natural language processing models, such as text classification, sentiment analysis, or named entity recognition.
4. **Error Message Generation**: Langsmith can generate error messages that mimic real-world scenarios, helping developers test error handling and recovery mechanisms in their code.
5. **Test Case Development**: Langsmith's ability to generate text based on prompts can be used to develop test cases for software applications. For example, generating test cases for a chatbot or virtual assistant.
6. **Code Review**: Langsmith can help with code review by analyzing code snippets and providing feedback on syntax, semantics, and best practices, helping developers catch errors and improve their code quality.

To get started with using Langsmith for testing, you can:

* Use the Langsmith API to integrate it with your testing framework or script.
* Generate test data or scenarios using the Langsmith web interface or command-line tool.
* Train a custom Langsmith model for specific testing purposes (e.g., generating error messages or test cases).

By leveraging Langsmith's language processing capabilities, you can streamline your testing process, reduce errors, and improve code quality.

In [56]:
# 2.4 Data loader
loader = CSVLoader(file_path="dummy_data.csv")
data = loader.load()
data[:5]

[Document(page_content='user_id: 1\nitem_id: [0, 2, 0, 0, 0, 0, 7, 0, 0, 10, 0, 0, 0, 14, 0, 0, 0, 18, 0, 0]', metadata={'source': 'dummy_data.csv', 'row': 0}),
 Document(page_content='user_id: 2\nitem_id: [0, 0, 0, 0, 0, 6, 0, 8, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20]', metadata={'source': 'dummy_data.csv', 'row': 1}),
 Document(page_content='user_id: 3\nitem_id: [0, 0, 3, 4, 0, 0, 0, 0, 0, 10, 11, 0, 0, 0, 0, 0, 0, 18, 0, 0]', metadata={'source': 'dummy_data.csv', 'row': 2}),
 Document(page_content='user_id: 4\nitem_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 0, 0, 17, 0, 0, 20]', metadata={'source': 'dummy_data.csv', 'row': 3}),
 Document(page_content='user_id: 5\nitem_id: [0, 2, 3, 4, 0, 0, 0, 8, 0, 10, 11, 0, 13, 0, 0, 0, 0, 18, 0, 0]', metadata={'source': 'dummy_data.csv', 'row': 4})]

In [63]:
# 2.5 Data transformers
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(data)
texts[:10]

[Document(page_content='user_id: 1\nitem_id: [0, 2, 0, 0, 0, 0, 7, 0, 0, 10, 0, 0, 0, 14, 0, 0, 0, 18, 0, 0]', metadata={'source': 'dummy_data.csv', 'row': 0}),
 Document(page_content='user_id: 2\nitem_id: [0, 0, 0, 0, 0, 6, 0, 8, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20]', metadata={'source': 'dummy_data.csv', 'row': 1}),
 Document(page_content='user_id: 3\nitem_id: [0, 0, 3, 4, 0, 0, 0, 0, 0, 10, 11, 0, 0, 0, 0, 0, 0, 18, 0, 0]', metadata={'source': 'dummy_data.csv', 'row': 2}),
 Document(page_content='user_id: 4\nitem_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 0, 0, 17, 0, 0, 20]', metadata={'source': 'dummy_data.csv', 'row': 3}),
 Document(page_content='user_id: 5\nitem_id: [0, 2, 3, 4, 0, 0, 0, 8, 0, 10, 11, 0, 13, 0, 0, 0, 0, 18, 0, 0]', metadata={'source': 'dummy_data.csv', 'row': 4}),
 Document(page_content='user_id: 6\nitem_id: [1, 0, 3, 0, 5, 0, 0, 8, 9, 0, 0, 0, 13, 0, 0, 0, 17, 18, 0, 20]', metadata={'source': 'dummy_data.csv', 'row': 5}),
 Document(page_content='user_id:

In [64]:
# 2.6 Embeddings model, this can be a local LLM as well
embeddings = OllamaEmbeddings(model = 'llama3')
# 2.7 Vector DB
docsearch = FAISS.from_documents(texts, embeddings)

### Summarization
> Summarization is a critical aspect of natural language processing (NLP), enabling the condensation of large volumes of text into concise summaries. LangChain, a powerful tool in the NLP domain, offers three distinct summarization techniques: `stuff`, `map_reduce`, and `refine`. Each method has its unique advantages and limitations, making them suitable for different scenarios. 

### chain types   
See Summarization with LangChain [here](https://medium.com/@abonia/summarization-with-langchain-b3d83c030889)


1. Stuff Chain

> The stuff chain is particularly effective for handling large documents. It works by converting the document into smaller chunks, processing each chunk individually, and then combining the summaries to generate a final summary. This method is ideal for managing huge files and can be facilitated with the help of a recursive character text splitter.    

2. Map-Reduce Method

>The Map-Reduce method involves summarizing each document individually (map step) and then combining these summaries into a final summary (reduce step). This approach is more scalable and can handle larger volumes of text.The map_reduce technique is designed for summarizing large documents that exceed the token limit of the language model. It involves dividing the document into chunks, generating summaries for each chunk, and then combining these summaries to create a final summary. This method is efficient for handling large files and significantly reduces processing time.

In [66]:
# 3.0 Retriever
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type="stuff",
                                 retriever=docsearch.as_retriever()
                                )

qa

RetrievalQA(combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=Ollama(model='llama3')), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4e867741d0>))

In [67]:
# 3.1 Supply prompt and invoke chain
#     Takes time

ans = qa.run('Suggest two articles to user-id 78 using given data which it has not seen.\
Follow this approach: First, find similar user-ids that is those users having items similar to user-id 78. \
Then match all their items. Sugest those articles for user-id 78.\
which are present with others but not with user-id 78. \
Also give a reason for suggestion').split('.')

In [69]:
# 3.2 print ans
print(ans)

['I don\'t know how to determine what "similar user-ids" are, as there is no obvious similarity metric or criteria given in the context', ' However, I can try to provide an answer based on some intuitive reasoning', "\n\nTo find similar user-ids, I'll look for patterns and common items across different user-ids", ' One approach is to identify the most frequently occurring items across multiple user-ids', '\n\nAfter analyzing the data, I found that items 1, 2, 3, 4, 18, and 19 are present in at least two user-ids (20, 175, 352, and 210)', ' These items could be considered similar or shared interests among users', "\n\nNow, let's find articles that are present with these similar user-ids but not with user-id 78", ' After analyzing the data again, I found that:\n\n* User-id 20 has item 13, which is not present in user-id 78', '\n* User-id 175 has item 17, which is not present in user-id 78', '\n\nBased on this analysis, I suggest the following articles to user-id 78:\n\n1', " Article rela

In [70]:
# 3.3 For properly prining markdown text:
[printmd(ans[i]) for i in range(len(ans))]

I don't know how to determine what "similar user-ids" are, as there is no obvious similarity metric or criteria given in the context

 However, I can try to provide an answer based on some intuitive reasoning



To find similar user-ids, I'll look for patterns and common items across different user-ids

 One approach is to identify the most frequently occurring items across multiple user-ids



After analyzing the data, I found that items 1, 2, 3, 4, 18, and 19 are present in at least two user-ids (20, 175, 352, and 210)

 These items could be considered similar or shared interests among users



Now, let's find articles that are present with these similar user-ids but not with user-id 78

 After analyzing the data again, I found that:

* User-id 20 has item 13, which is not present in user-id 78


* User-id 175 has item 17, which is not present in user-id 78



Based on this analysis, I suggest the following articles to user-id 78:

1

 Article related to item 13 (assuming it's a topic of interest for similar users like user-id 20)

 Reason: Since user-id 78 doesn't have item 13, recommending an article about that topic might be of interest


2

 Article related to item 17 (assuming it's a topic of interest for similar users like user-id 175)

 Reason: As with the previous suggestion, since user-id 78 doesn't have item 17, recommending an article about that topic might be of interest



Please note that this answer is based on intuitive reasoning and may not be accurate or comprehensive



[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [40]:
# 4.0 Data extracts
df.loc[df['user_id'] == 78, :]
df.loc[df['user_id'] == 10, :]
df.loc[df['user_id'] == 98, :]


Unnamed: 0,user_id,item_id
77,78,"[1, 0, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 0, 0..."


Unnamed: 0,user_id,item_id
9,10,"[1, 2, 3, 4, 0, 6, 7, 8, 9, 0, 0, 0, 13, 0, 0,..."


Unnamed: 0,user_id,item_id
97,98,"[1, 2, 0, 4, 5, 6, 7, 8, 0, 0, 0, 12, 13, 0, 1..."


In [18]:
########### DONE ####################