In [34]:
import getpass
import io
import os
from pathlib import Path

import openai
import pandas as pd
from langchain import LLMMathChain, SerpAPIWrapper
from langchain.agents import AgentType
from langchain.agents import create_csv_agent
from langchain.agents import initialize_agent, Tool
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools import BaseTool
from langchain.vectorstores import Chroma

In [35]:
# Never commit credentials to a notebook: read the key from the environment and
# fall back to an interactive, non-echoing prompt when it is not set.
# NOTE(review): the key that used to be hardcoded in this cell is exposed in the
# notebook's history and should be revoked.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')
openai.api_key = OPENAI_API_KEY
# Also export it so code that reads the key from the environment can find it.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [97]:
# Helper methods
def askChatGpt(prompt, model="gpt-4", temperature=0):
    """Send one user message to the OpenAI chat API and return the reply text.

    Args:
        prompt: Text sent as the content of a single "user" message.
        model: Chat model name passed to the API (default "gpt-4").
        temperature: Sampling temperature passed to the API (default 0).

    Returns:
        The ``content`` of the first choice in the API response.
    """
    response = openai.ChatCompletion.create(
        model=model,
        temperature=temperature,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

def createTextChain(file_path):
    """Build a RetrievalQA chain that answers questions over one text file.

    The file is loaded with TextLoader, split by CharacterTextSplitter
    (chunk_size=1000, chunk_overlap=0), embedded with OpenAIEmbeddings and
    indexed in a Chroma collection named 'example'. The returned chain is a
    "stuff"-type RetrievalQA chain that retrieves from that index.

    Args:
        file_path: Path of the text file to index.

    Returns:
        The RetrievalQA chain built over the file's chunks.
    """
    qa_llm = OpenAI(temperature=0)

    # Load the file and cut it into chunks for embedding.
    raw_docs = TextLoader(file_path).load()
    chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(raw_docs)

    # Index the chunks in a vector store and expose it as the chain's retriever.
    vector_index = Chroma.from_documents(chunks, OpenAIEmbeddings(), collection_name='example')
    return RetrievalQA.from_chain_type(
        llm=qa_llm,
        chain_type="stuff",
        retriever=vector_index.as_retriever(),
    )

def createCsvAgent(file_path):
    """Create a LangChain CSV agent for the given CSV file.

    Args:
        file_path: Path of the CSV file the agent should operate on.

    Returns:
        The agent returned by ``create_csv_agent`` (created with verbose=True).
    """
    # Build the LLM once and hand that same instance to the agent; previously a
    # second, identical OpenAI object was constructed and this one went unused.
    llm = OpenAI(temperature=0)
    return create_csv_agent(llm, file_path, verbose=True)

def generateDescription(file_path, file_type, file_name):
    """Ask the chat model for a tool description of a data file.

    The whole file is read and embedded in a prompt that asks for a
    description in the "useful for when you need to answer questions
    about ___" format.

    Args:
        file_path: Path of the file whose contents are sent to the model.
        file_type: File type label inserted into the prompt (e.g. 'txt', 'csv').
        file_name: Display name of the file inserted into the prompt.

    Returns:
        The model's reply, as returned by ``askChatGpt``.
    """
    with open(file_path, 'r') as source_file:
        contents = source_file.read()

    prompt_parts = [
        'Here are the contents of a ',
        file_type,
        ' file, named "',
        file_name,
        '": \n',
        contents,
        "\nGive me a description of this file in the following format: 'useful for when you need to answer questions about ___. Input should be a fully formed question.'",
    ]
    return askChatGpt(''.join(prompt_parts))

def convertToCsv(file_path):
    """Convert a structured text file to CSV using the chat model.

    The file's contents are sent to ``askChatGpt`` with a conversion prompt,
    the reply is parsed with pandas, and the resulting table is written next
    to the input file with the extension replaced by ``.csv``.

    Args:
        file_path: Path of the text file to convert.

    Returns:
        Path (str) of the CSV file that was written.

    Raises:
        Whatever ``pandas.read_csv`` raises if the model's reply is not
        parseable as CSV.
    """
    # splitext removes only the final extension, so dots elsewhere in the path
    # ("./data/notes.txt", "data/v1.2/notes.txt") no longer truncate the name.
    csv_path = os.path.splitext(file_path)[0] + '.csv'

    # Context manager guarantees the handle is closed even if reading fails.
    with open(file_path, 'r') as text_file:
        data = text_file.read()

    response = askChatGpt('Convert this file to a csv: \n' + data)

    # Parse the reply in memory instead of round-tripping through a fixed
    # "output.csv" scratch file in the working directory.
    frame = pd.read_csv(io.StringIO(response))
    frame.to_csv(csv_path, index=False)
    return csv_path

def createTool(file_obj):
    """Build a LangChain Tool for one file entry.

    CSV files, and text files flagged as structured, are served by a CSV agent
    (a structured text file is first converted with ``convertToCsv`` unless a
    CSV with the same base name already exists). Other text files are served
    by a retrieval chain from ``createTextChain``. The tool description is
    generated by ``generateDescription`` and printed.

    Args:
        file_obj: Dict with keys 'path' (file location), 'name' (tool name)
            and optionally 'structured' (truthy when a text file can be
            converted to tabular form; treated as False when absent).

    Returns:
        A Tool whose ``func`` is the ``run`` method of the agent or chain.
    """
    file_path = file_obj['path']

    # Classify by the real extension; a substring test misreads paths such as
    # "txt_data/Grades.csv" as text files.
    base_path, extension = os.path.splitext(file_path)
    file_type = 'txt' if extension == '.txt' else 'csv'

    if file_type == 'txt' and file_obj.get('structured', False):
        # Swap only the extension; str.replace('txt', 'csv') would also rewrite
        # any other "txt" in the path (e.g. "context.txt" -> "concsv.csv").
        csv_file_path = base_path + '.csv'
        if Path(csv_file_path).is_file():
            # Reuse a previously converted file instead of calling the model again.
            file_path = csv_file_path
        else:
            file_path = convertToCsv(file_path)
        file_type = 'csv'

    if file_type == 'csv':
        print('creating csv agent: ', file_path)
        agent = createCsvAgent(file_path)
    else:
        agent = createTextChain(file_path)

    file_description = generateDescription(file_path, file_type, file_obj['name'])
    print(file_description)
    return Tool(
        name=file_obj['name'],
        func=agent.run,
        description=file_description,
    )

In [98]:
# Data sources exposed to the agent: two text files and two CSV files.
# 'structured' is True when a file's contents are structured and can be
# converted to a tabular form.
file_objects = [
    dict(path='data/Syllabus.txt', name='Syllabus', structured=True),
    dict(path='data/Weekly Schedule.txt', name='Weekly Schedule', structured=True),
    dict(path='data/Grades.csv', name='Grades', structured=True),
    dict(path='data/Professors.csv', name='Professors', structured=True),
]

In [99]:
# Build one tool per configured file, in the order listed in file_objects.
tools = [createTool(file_obj) for file_obj in file_objects]

creating csv agent:  data/Syllabus.csv
This file is useful for when you need to answer questions about the 10th grade syllabus, including subjects like Chemistry, Mathematics, Physical Education, Foreign Language, Art, Economics, Physics, and English Language Arts.
creating csv agent:  data/Weekly Schedule.csv
This file is useful for when you need to answer questions about the weekly schedule of classes, including the day, period, subject, and teacher for each class.
creating csv agent:  data/Grades.csv
This file is useful for when you need to answer questions about students' performance in various subjects, such as Chemistry, Mathematics, Physical Education, Foreign Language, Art, Economics, Physics, and English Language Arts.
creating csv agent:  data/Professors.csv
This file is useful for when you need to answer questions about professors' names, ages, genders, salaries, education levels, and years of service.


In [100]:
# Top-level agent that is given the per-file tools built above.
llm = OpenAI(temperature=0)
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

In [101]:
# Question that presumably needs two tools: 'Weekly Schedule' to resolve the
# subject and 'Syllabus' to describe it.
# NOTE(review): in the recorded run the agent sent only the fragment
# "3rd period on Monday" to the Syllabus tool and returned a subject name
# rather than syllabus content, so treat the recorded answer as unverified.
agent.run("What is the syllabus for the subject taught in 3rd period on Monday?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find the syllabus for the subject taught in 3rd period on Monday.
Action: Syllabus
Action Input: 3rd period on Monday[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: What is the subject of the 3rd period on Monday?
Action: python_repl_ast
Action Input: df.loc[2, 'Subject'][0m
Observation: [36;1m[1;3mPhysical Education[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: Physical Education[0m

[1m> Finished chain.[0m

Observation: [36;1m[1;3mPhysical Education[0m
Thought:[32;1m[1;3m I now know the syllabus for the subject taught in 3rd period on Monday.
Final Answer: The syllabus for the subject taught in 3rd period on Monday is Physical Education.[0m

[1m> Finished chain.[0m


'The syllabus for the subject taught in 3rd period on Monday is Physical Education.'

In [102]:
# Filter-style question aimed at the 'Professors' tool.
# NOTE(review): in the recorded run the tool received only "Mathematics" and
# answered with an average salary; the trace is truncated before the final
# answer, so the names shown cannot be checked against it.
agent.run("What are the names of all the male Mathematics professors?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find out the names of all the male Mathematics professors.
Action: Professors
Action Input: Mathematics[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use the dataframe to answer a math question
Action: python_repl_ast
Action Input: df['salary'].mean()[0m
Observation: [36;1m[1;3m70900.0[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: The average salary of the people in the dataframe is 70900.0.[0m

[1m> Finished chain.[0m

Observation: [31;1m[1;3mThe average salary of the people in the dataframe is 70900.0.[0m
Thought:[32;1m[1;3m I need to filter the dataframe to only include male Mathematics professors.
Action: Professors
Action Input: Mathematics, Gender = Male[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to filter the dataframe to only include males
Action: python_repl_ast
Action Input: df[df['gender'] == 'Male'

'The names of all the male Mathematics professors are: John Smith, David Johnson, and Mark Williams.'

In [63]:
# Aggregate question aimed at the 'Professors' tool.
# NOTE(review): this cell's execution count (63) is out of sequence with its
# neighbours, and the recorded trace only shows repeated average-salary
# computations before being cut off, so the final answer is unverified.
agent.run("What is the average age of professors who teach Mathematics in the weekly schedule?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find out the age of the professors who teach Mathematics.
Action: Professors
Action Input: Mathematics[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use the dataframe to answer a math question
Action: python_repl_ast
Action Input: df['salary'].mean()[0m
Observation: [36;1m[1;3m70900.0[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: The average salary of the people in the dataframe is 70900.0.[0m

[1m> Finished chain.[0m

Observation: [31;1m[1;3mThe average salary of the people in the dataframe is 70900.0.[0m
Thought:[32;1m[1;3m I need to find out the age of the professors who teach Mathematics.
Action: Professors
Action Input: Mathematics[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use the dataframe to answer a math question
Action: python_repl_ast
Action Input: df['salary'].mean()[0m
Observation: [36;1m[1;

'The average age of professors who teach Mathematics in the weekly schedule is 45.'

In [103]:
# Two-step question: find the lowest-average subject via 'Grades', then look
# up when it is taught.
# NOTE(review): the recorded trace shows English Language Arts with the lowest
# mean (78.3) but is truncated before any schedule lookup is visible.
agent.run("What days and periods is the subject with the lowest average grade taught?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find the subject with the lowest average grade and then find out what days and periods it is taught.
Action: Grades
Action Input: Average grades for each subject[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to calculate the average grade for each subject
Action: python_repl_ast
Action Input: df.mean()[0m
Observation: [36;1m[1;3mCould not convert ['Student 1Student 2Student 3Student 4Student 5Student 6Student 7Student 8Student 9Student 10'] to numeric[0m
Thought:[32;1m[1;3m I need to exclude the student column
Action: python_repl_ast
Action Input: df.drop('Student', axis=1).mean()[0m
Observation: [36;1m[1;3mChemistry                86.7
Mathematics              88.0
Physical Education       81.5
Foreign Language         88.0
Art                      91.9
Economics                85.6
Physics                  85.1
English Language Arts    78.3
dtype: float64[0m
Thought:[3

'English Language Arts is taught on Mondays, Wednesdays, and Fridays during periods 1, 2, and 3 by Miguel Rodriguez.'

In [117]:
"What is the subject with the lowest average grade? Find its syllabus."


'What is the subject with the lowest average grade? Find its syllabus.'

In [118]:
# Two-step question: find the lowest-average subject via 'Grades', then fetch
# its syllabus.
# NOTE(review): the recorded trace is truncated after the per-subject means,
# so the syllabus lookup that produced the final answer is not visible.
agent.run("What is the subject with the lowest average grade? Find its syllabus.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find the subject with the lowest average grade and then find its syllabus.
Action: Grades
Action Input: What is the subject with the lowest average grade?[0m

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to calculate the average grade for each subject
Action: python_repl_ast
Action Input: df.mean()[0m
Observation: [36;1m[1;3mCould not convert ['Student 1Student 2Student 3Student 4Student 5Student 6Student 7Student 8Student 9Student 10'] to numeric[0m
Thought:[32;1m[1;3m I need to remove the 'Student' column
Action: python_repl_ast
Action Input: df.drop('Student', axis=1).mean()[0m
Observation: [36;1m[1;3mChemistry                86.7
Mathematics              88.0
Physical Education       81.5
Foreign Language         88.0
Art                      91.9
Economics                85.6
Physics                  85.1
English Language Arts    78.3
dtype: float64[0m
Thought:[32;1m[

'In 10th grade English Language Arts, students will explore a variety of literary genres and develop critical reading, writing, and analytical skills. They will analyze and interpret texts from different time periods and cultural backgrounds, and explore themes such as identity, power, and justice.'