In [115]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

from langchain import LLMChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import ConversationChain
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings 
import codecs
import nltk
import re

In [116]:
#API key congfiguration
# create .env file and give api key: OPENAI_API_KEY="sk-"
from decouple import config
import os 
%load_ext dotenv
%dotenv
openai_api_key=os.getenv("OPENAI_API_KEY")

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [117]:
from langchain.document_loaders.csv_loader import CSVLoader
loader = CSVLoader(file_path="data/jobs_data.csv",encoding="utf8")
docs = loader.load()

In [118]:
import tiktoken
tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
    tokens=tokenizer.encode(
    text,
    disallowed_special=()
    )
    return len(tokens)

In [122]:
tiktoken.encoding_for_model('gpt-3.5-turbo')
token_counts=[tiktoken_len(doc.page_content) for doc in docs]

In [123]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=['\n\n','\n',' ','']
)

In [124]:
docs=text_splitter.split_documents(docs)
print(len(docs))

25


In [125]:
embeddings=OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

vectorstore=FAISS.from_documents(docs,embeddings)

In [126]:
llm=ChatOpenAI(
    model_name="gpt-3.5-turbo", 
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    
)

In [129]:
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever(search_kwargs={'k':150}), input_key="question")


In [130]:
chat_history = []
while True:
    # this prints to the terminal, and waits to accept an input from the user
    query = input('Prompt: ')
    response = chain({"question":query})
    print(response)
    # give us a way to exit the script
    if query == "exit" or query == "quit" or query == "q":
        print('Exiting')
        sys.exit()
    result = chain({'question': query, 'chat_history': chat_history})
    print('Answer: ' + response['result'])
    chat_history.append((query, response['result']))

Prompt: what are different job categories in given data?
{'question': 'what are different job categories in given data?', 'result': 'Based on the given data, the different job categories are:\n\n1. Analyst\n2. Sales\n3. Consultant (associate or analyst)\n4. Manager\n5. HR\n6. SDE (Software Development Engineer)\n7. Customer Support\n8. Engineering\n9. Entry/Intern\n10. Customer Success/Support'}
Answer: Based on the given data, the different job categories are:

1. Analyst
2. Sales
3. Consultant (associate or analyst)
4. Manager
5. HR
6. SDE (Software Development Engineer)
7. Customer Support
8. Engineering
9. Entry/Intern
10. Customer Success/Support


KeyboardInterrupt: Interrupted by user

In [None]:
# what are different job categories in given data?