In [1]:
import os

In [None]:
!pip install gitpython langchain openai chromadb tiktoken langchain-community

In [4]:
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

### Clone the GitHub repository

In [5]:
# Show the current working directory; the relative paths used below
# ("test_repo/", "./data") are resolved against it.
%pwd

'/content'

In [8]:
!mkdir test_repo

In [9]:
repo_path = "test_repo/"

# Clone only once: `git clone` refuses a non-empty target directory, so
# re-running this cell against an existing checkout would raise. When a
# checkout is already present, open it instead of cloning again.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(
        "https://github.com/kshitijkutumbe/Visa-Sanction-Prediction-ML.git",
        to_path=repo_path,
    )

# Display the Repo handle as the cell output.
repo

<git.repo.base.Repo '/content/test_repo/.git'>

In [11]:
# Directory inside the clone whose Python files are indexed. `repo_path` is
# defined in the clone cell; `os.path.join` avoids the doubled slash that
# plain concatenation with "test_repo/" produced.
pipeline_dir = os.path.join(repo_path, "us_visa_prediction", "pipeline")

# Load every `.py` file under `pipeline_dir` (recursively) as documents.
# NOTE(review): `parser_threshold=500` is presumably the minimum file length
# (in lines) before the language parser segments a file -- confirm against the
# LanguageParser documentation.
loader = GenericLoader.from_filesystem(
    pipeline_dir,
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

In [12]:
# Run the loader configured above and keep the resulting documents.
documents = loader.load()

In [13]:
# Inspect the loaded documents (source path, language metadata, page content).
# NOTE(review): this displays the entire list; consider `documents[:1]` or
# `len(documents)` to keep the saved output small.
documents

[Document(metadata={'source': 'test_repo/us_visa_prediction/pipeline/training_pipeline.py', 'language': <Language.PYTHON: 'python'>}, page_content='import sys\nfrom us_visa_prediction.exception import USvisaException\nfrom us_visa_prediction.logger import logging\n\nfrom us_visa_prediction.components.data_ingestion import DataIngestion\nfrom us_visa_prediction.components.data_validation import DataValidation\nfrom us_visa_prediction.components.data_transformation import DataTransformation\nfrom us_visa_prediction.components.model_trainer import ModelTrainer\nfrom us_visa_prediction.components.model_evaluation import ModelEvaluation\nfrom us_visa_prediction.components.model_pusher import ModelPusher\n\nfrom us_visa_prediction.entity.config_entity import (DataIngestionConfig,\n                                          DataValidationConfig,\n                                          DataTransformationConfig,\n                                          ModelTrainerConfig,\n                 

### Chunking

In [14]:
# Python-aware splitter: chunks of up to 2000 characters with a
# 200-character overlap between neighbouring chunks.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

In [15]:
# Split the loaded documents into chunks using the splitter configured above.
texts = documents_splitter.split_documents(documents)

In [16]:
# Number of chunks produced by the splitter.
len(texts)

9

In [20]:
# Peek at the first chunk to sanity-check the split.
texts[0]

Document(metadata={'source': 'test_repo/us_visa_prediction/pipeline/training_pipeline.py', 'language': <Language.PYTHON: 'python'>}, page_content='import sys\nfrom us_visa_prediction.exception import USvisaException\nfrom us_visa_prediction.logger import logging\n\nfrom us_visa_prediction.components.data_ingestion import DataIngestion\nfrom us_visa_prediction.components.data_validation import DataValidation\nfrom us_visa_prediction.components.data_transformation import DataTransformation\nfrom us_visa_prediction.components.model_trainer import ModelTrainer\nfrom us_visa_prediction.components.model_evaluation import ModelEvaluation\nfrom us_visa_prediction.components.model_pusher import ModelPusher\n\nfrom us_visa_prediction.entity.config_entity import (DataIngestionConfig,\n                                          DataValidationConfig,\n                                          DataTransformationConfig,\n                                          ModelTrainerConfig,\n                  

### Embedding model

In [17]:
import getpass
import os

# Prompt for the key only when no usable value is configured. An empty or
# whitespace-only OPENAI_API_KEY is treated as missing, so the user is asked
# here instead of a blank key being passed on to the OpenAI client.
# `getpass` keeps the key out of the notebook source and the saved output.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ··········


In [18]:
# Embedding model used to vectorize the code chunks.
# NOTE(review): `disallowed_special=()` presumably turns off the tokenizer's
# special-token check so source text containing such token strings does not
# raise -- confirm against the OpenAIEmbeddings documentation.
embeddings=OpenAIEmbeddings(disallowed_special=())

  warn_deprecated(


### Knowledge base (vector DB)

In [19]:
persist_dir = "./data"

# Reuse the persisted index when one already exists. Building it again with
# `from_documents` against the same `persist_directory` re-embeds every chunk
# and adds it to the stored collection a second time, so a re-run of this cell
# would leave duplicate entries (and pay for the embeddings again).
# Delete `persist_dir` to force a rebuild, e.g. after the repository changes.
if os.path.isdir(persist_dir) and os.listdir(persist_dir):
    vectordb = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory=persist_dir)
    # NOTE(review): the explicit persist() is presumably only required by older
    # chromadb releases (newer ones persist automatically and emit a
    # deprecation warning here); kept for backward compatibility.
    vectordb.persist()

  warn_deprecated(


### LLM Wrapper

In [21]:
# Chat model shared by the summary memory and the retrieval chain below.
# Uses the library's default model; to choose one explicitly pass `model_name`,
# e.g. ChatOpenAI(model_name="gpt-4").
llm = ChatOpenAI()

  warn_deprecated(


In [22]:
# Conversation memory that uses `llm` to summarize the dialogue; the summary is
# exposed to the chain under the "chat_history" key as message objects.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [23]:

# Retriever over the vector store: MMR search returning 3 chunks per query.
code_retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3},
)

# Question-answering chain combining the LLM, the retriever and the memory.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=code_retriever,
    memory=memory,
)

### Q&A

In [24]:
# Natural-language question to ask about the indexed code.
question = "what is happening in training pipeline?"

In [25]:
# Calling the chain object directly (`qa(question)`) is deprecated and emits a
# deprecation warning; `invoke` is the supported entry point. The input is
# passed under the chain's "question" key; "chat_history" is supplied by the
# attached memory.
result = qa.invoke({"question": question})

# `print` renders the answer's line breaks, which a bare repr would escape.
print(result["answer"])

  warn_deprecated(


In the training pipeline, the following steps are executed:

1. Data ingestion is started to bring in the necessary data.
2. Data validation is performed on the ingested data.
3. Data transformation is carried out on the validated data.
4. Model training is initiated using the transformed data.
5. Model evaluation is started to evaluate the trained model.
6. If the model is not accepted during evaluation, a message is logged and the process stops.
7. If the model is accepted, model pushing is started to push the model to a destination.

Each step in the pipeline is handled by specific methods in the `TrainPipeline` class, such as `start_data_ingestion()`, `start_data_validation()`, `start_data_transformation()`, `start_model_trainer()`, `start_model_evaluation()`, and `start_model_pusher()`. If any exception occurs during these steps, it is caught and handled by raising a `USvisaException`.
