In [1]:
# Move the working directory one level up before anything else runs.
# Later cells import from `src` and use relative `artifacts/...` and
# `research/...` paths — presumably the notebook is stored one level below the
# project root; TODO confirm.
# NOTE: the target is relative, so every re-execution of this cell moves up one
# more directory level.
import os
os.chdir("..")

In [2]:
import mimetypes
import os

from src.entity.artifact_entity import FileHandlerArtifact
from src.logger import get_logger


class ReadFiles:
    """List the files stored for a file-handler artifact and label each supported type.

    Class attributes:
        ALLOWED_FILE_TYPES: MIME type -> short label returned for that type.
        EXTENSION_FILE_TYPES: lower-case file extension -> the same labels. Used
            only as a fallback when ``mimetypes`` cannot resolve the file, because
            the MIME table depends on the Python version and on platform files.
    """

    # Built once at class creation instead of on every check_file_type() call.
    ALLOWED_FILE_TYPES = {
        'application/pdf': 'PDF',
        'application/vnd.ms-powerpoint': 'PPT',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'PPTX',
        'application/msword': 'DOC',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'DOCX',
        'image/png': 'PNG',
        'image/jpeg': 'JPG',
        'application/vnd.ms-excel': 'XLS',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'XLSX',
        'text/csv': 'CSV',
        'text/markdown': 'MD',
        'text/html': 'HTML'
    }

    # Fallback for environments whose MIME table lacks some of the types above
    # (e.g. the Office Open XML formats); the labels match ALLOWED_FILE_TYPES.
    EXTENSION_FILE_TYPES = {
        '.pdf': 'PDF',
        '.ppt': 'PPT',
        '.pptx': 'PPTX',
        '.doc': 'DOC',
        '.docx': 'DOCX',
        '.png': 'PNG',
        '.jpg': 'JPG',
        '.jpeg': 'JPG',
        '.xls': 'XLS',
        '.xlsx': 'XLSX',
        '.csv': 'CSV',
        '.md': 'MD',
        '.markdown': 'MD',
        '.html': 'HTML',
        '.htm': 'HTML'
    }

    # The annotation is a string so it is not evaluated when the class is defined.
    def __init__(self, file_handler_artifact: "FileHandlerArtifact"):
        """Store the artifact and create a module logger.

        :param file_handler_artifact: object exposing ``file_storage_dir``, the
            directory scanned by :meth:`get_file_names_and_types`.
        """
        self.file_handler_artifact = file_handler_artifact
        self.logger = get_logger(__name__)

    def check_file_type(self, file_path):
        """Return the short label ('PDF', 'DOCX', ...) for a supported file, else None.

        The decision is based on the file name only; the file is not opened.

        :param file_path: path or file name to classify.
        :return: a label from ALLOWED_FILE_TYPES, or None when the type is not supported.
        """
        mime_type, encoding = mimetypes.guess_type(file_path)

        # A non-None encoding means the name denotes a compressed wrapper
        # (e.g. "report.pdf.gz"); the payload is not directly a supported file.
        if encoding is not None:
            return None

        if mime_type in self.ALLOWED_FILE_TYPES:
            return self.ALLOWED_FILE_TYPES[mime_type]

        # MIME lookup gave nothing usable: fall back to the extension itself.
        extension = os.path.splitext(file_path)[1].lower()
        return self.EXTENSION_FILE_TYPES.get(extension)

    def get_file_names_and_types(self) -> list[dict]:
        """Describe every regular file directly inside the artifact's storage directory.

        Sub-directories are skipped and entries are returned in sorted name order,
        so the result does not depend on the order ``os.listdir`` happens to use.

        :return: list of dicts with keys ``filename``, ``full_path`` and
            ``file_type`` (None for unsupported files).
        """
        storage_dir = self.file_handler_artifact.file_storage_dir
        file_details = []
        for file in sorted(os.listdir(storage_dir)):
            file_full_path = os.path.join(storage_dir, file)
            if os.path.isfile(file_full_path):
                file_details.append({
                    'filename': file,
                    "full_path": file_full_path,
                    'file_type': self.check_file_type(file_full_path)
                })
        return file_details
        
    
        
        

In [None]:

# Point a FileHandlerArtifact at one stored upload directory (relative path) and
# wrap it in ReadFiles; `rf` is used by the next cell.
artifact = FileHandlerArtifact(
    file_storage_dir="artifacts/06_26_2024_12_48_34/file_storage/testmyne45"
)
rf = ReadFiles(artifact)

In [None]:
# Show one dict per regular file in the storage directory: filename, full_path, file_type.
rf.get_file_names_and_types()

In [17]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Load a sample PDF page-wise with PyPDFLoader; the path is relative to the
# current working directory.
pdf_path = 'artifacts/06_26_2024_12_48_34/file_storage/testmyne45/myne-project-plan.pdf'

# extract_images=True: also pull text out of images embedded in the PDF.
loader = PyPDFLoader(pdf_path, extract_images=True)
# `pages1` is consumed by the splitter cell that follows.
pages1 = loader.load()




In [2]:
# Split the loaded pages into chunks of up to 1000 characters with a 200-character
# overlap; separators are listed from coarsest (blank line) to finest (empty string).
# NOTE(review): the recorded output of this cell is a NameError for
# RecursiveCharacterTextSplitter — it needs the import cell (and `pages1`) to have
# been executed in the same kernel session first.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
    chunk_overlap=200, 
    separators=["\n\n", "\n", " ", ""])
splits = splitter.split_documents(pages1)

NameError: name 'RecursiveCharacterTextSplitter' is not defined

In [None]:
from langchain_community.document_loaders import AmazonTextractPDFLoader


# Run the Amazon Textract loader on a local PNG screenshot (relative path).
# NOTE(review): presumably requires AWS credentials/region to be configured in the
# environment — confirm. `loader` is also reassigned by several other cells.
loader = AmazonTextractPDFLoader("artifacts/06_26_2024_12_48_34/file_storage/testmyne45/screenshot-(3).png")
documents = loader.load()

In [3]:
import os, sys
from docx2pdf import convert

from src.exception import CustomException
from langchain_text_splitters.base import TextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredExcelLoader,
    UnstructuredCSVLoader,
    UnstructuredMarkdownLoader,
    UnstructuredPowerPointLoader,
    UnstructuredHTMLLoader,
    UnstructuredImageLoader,
    ImageCaptionLoader,
    Docx2txtLoader,
    AmazonTextractPDFLoader)
from langchain_text_splitters import RecursiveCharacterTextSplitter


def load_and_split_document(loader_class, file_path, splitter: TextSplitter, **loader_kwargs):
    """
    Load a file with the given loader class and split the result into chunks.

    The loader is instantiated as ``loader_class(file_path, **loader_kwargs)``,
    its ``load()`` result is passed to ``splitter.split_documents``.

    :param loader_class: Document loader class (or any callable) to instantiate.
    :param file_path: Path of the file handed to the loader.
    :param splitter: Text splitter whose ``split_documents`` produces the chunks.
    :param loader_kwargs: Extra keyword arguments forwarded to the loader.
    :return: Whatever ``splitter.split_documents`` returns for the loaded documents.
    :raises CustomException: wraps any exception raised while loading or splitting.
    """
    try:
        loaded_documents = loader_class(file_path, **loader_kwargs).load()
        return splitter.split_documents(loaded_documents)
    except Exception as err:
        # Every failure is re-raised as the project's CustomException.
        raise CustomException(err, sys)


def read_pdf_pypdf(pdf_path, splitter: TextSplitter):
    """Load a PDF via PyPDFLoader with image extraction (OCR) enabled and return its chunks."""
    return load_and_split_document(
        loader_class=PyPDFLoader,
        file_path=pdf_path,
        splitter=splitter,
        extract_images=True,
    )

def read_with_aws(file_path, splitter: TextSplitter, **kwargs):
    """Read a file through the Amazon Textract loader and return a list of chunks.

    Intended for PDFs (text and embedded images) as well as standalone images.
    Extra keyword arguments are forwarded to AmazonTextractPDFLoader.
    """
    textract_chunks = load_and_split_document(
        AmazonTextractPDFLoader, file_path, splitter, **kwargs
    )
    return textract_chunks


def read_txt(txt_path, splitter: TextSplitter):
    """Load a plain-text file with TextLoader and return its split chunks."""
    return load_and_split_document(
        loader_class=TextLoader,
        file_path=txt_path,
        splitter=splitter,
    )

# Additional functions for other document types can be implemented similarly.
def read_excel(excel_path, splitter: TextSplitter):
    """Load an Excel workbook with UnstructuredExcelLoader and return its split chunks."""
    return load_and_split_document(
        loader_class=UnstructuredExcelLoader,
        file_path=excel_path,
        splitter=splitter,
    )

def read_csv(csv_path, splitter: TextSplitter):
    """Load a CSV file with UnstructuredCSVLoader and return its split chunks."""
    return load_and_split_document(
        loader_class=UnstructuredCSVLoader,
        file_path=csv_path,
        splitter=splitter,
    )

def read_markdown(md_path, splitter: TextSplitter):
    """Load a Markdown file with UnstructuredMarkdownLoader and return its split chunks."""
    return load_and_split_document(
        loader_class=UnstructuredMarkdownLoader,
        file_path=md_path,
        splitter=splitter,
    )

def read_ppt(ppt_path, splitter: TextSplitter):
    """Load a PowerPoint file with UnstructuredPowerPointLoader and return its split chunks."""
    return load_and_split_document(
        loader_class=UnstructuredPowerPointLoader,
        file_path=ppt_path,
        splitter=splitter,
    )

def read_docx(docx_path, splitter: TextSplitter):
    """Read Word (.docx) files and return a list of chunks of documents.

    :param docx_path: Path to the .docx file.
    :param splitter: Text splitter used to chunk the loaded document.
    :return: Chunks produced by ``load_and_split_document`` with Docx2txtLoader.
    """
    # TODO: convert the .docx to PDF first (same directory and base name, ".pdf"
    # extension) so that text inside embedded images can be extracted as well.
    return load_and_split_document(Docx2txtLoader, docx_path, splitter)

def read_html(html_path, splitter: TextSplitter):
    """Load an HTML file with UnstructuredHTMLLoader and return its split chunks."""
    return load_and_split_document(
        loader_class=UnstructuredHTMLLoader,
        file_path=html_path,
        splitter=splitter,
    )

def read_image(image_path, splitter: TextSplitter):
    """Load an image file with UnstructuredImageLoader and return its split chunks."""
    return load_and_split_document(
        loader_class=UnstructuredImageLoader,
        file_path=image_path,
        splitter=splitter,
    )

def read_image_caption(image_path, splitter: TextSplitter):
    """Load an image with ImageCaptionLoader (caption-based content) and return its split chunks."""
    return load_and_split_document(
        loader_class=ImageCaptionLoader,
        file_path=image_path,
        splitter=splitter,
    )


In [5]:
# Sample PDF path plus the shared text splitter that later reader cells reuse.
pdf_path = 'artifacts/06_26_2024_12_48_34/file_storage/testmyne45/myne-project-plan.pdf'
# Chunks of up to 1000 characters with a 200-character overlap; separators go
# from coarsest (blank line) to finest (empty string).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""],
)
# splits = splitter.split_documents(pages1)

# NOTE(review): a later cell displays `docs`; on a fresh kernel the call below
# must be re-enabled first (the PDF reader in this notebook is `read_pdf_pypdf`).
# docs = read_pdf_pypdf(pdf_path=pdf_path,
#          splitter=splitter)

# File name without directory or extension. splitext removes only the final
# suffix, so names that contain additional dots are not truncated.
os.path.splitext(os.path.basename(pdf_path))[0]
        

'myne-project-plan'

In [6]:
# Display the current value of `docs`.
# NOTE(review): no preceding cell visible here assigns `docs` (the only earlier
# assignment is commented out) — this relies on state from another execution.
docs

[Document(page_content='M y n e\nW e b - A p p l i c a t i o n \n( C u s t o m A I\nD r i v e n \nM o d e l )\nM a y\n1 4 ,\n2 0 2 4\nO v e r v i e w\nM y n e\nW e b\nA p p l i c a t i o n\ni s\na\np l a t f o r m\nf r o m\nw h i c h\nd i f f e r e n t\nt y p e s\no f\nu s e r s\nc a n\nl o g i n\na n d\nu p l o a d\nd o c u m e n t s\nu s i n g\nO p e n\nA I ,\na n d\nt h e\ns y s t e m\nw i l l\nr e s p o n d\nt o\nc l i e n t\nq u e r i e s\nb a s e d\no n\nu p l o a d e d\nd o c u m e n t s .\nS o m e\np r e d e ﬁ n e d\nt e m p l a t e\nm o d e l s\nw i l l\nb e\nt h e r e\nt o\nr e p r e s e n t\nt h e\nq u e r y\nr e s p o n s e s .\nT h e r e\nw i l l\nb e\nf o l l o w i n g\nu s e r\nr o l e s\nf o r\nt h e\nw e b - p o r t a l :\n-\nSuper\nAdmin\n-controls\nthe\nentire\nsystem\nand\nhas\nthe\nability\nto\nturn\non/off,\nlimit\naccess.\nThese\nfunctions\nshould\nbe\nlinked\nto\nthe\nCXM.\n*Need\nto\nfigure\nhow\nto\nlink\nthis\naccess\nto\nthe\nnew\nCXM*\n-\nClient\nAdmin\n-\n

In [5]:
# Try the PowerPoint reader on a sample .ppt file using the shared `splitter`.
# The commented-out %pip lines name packages that are presumably prerequisites
# for this loader — confirm before relying on them.
# # Install packages
# %pip install unstructured
# %pip install python-magic
# %pip install python-pptx
read_ppt("research/test_docs/Test Case112923  Compared to Meta CSV File.ppt",
         splitter)

[Document(page_content='Test Results\n\nData\n\nRevenues\n\nThe following table sets forth our combined statements of income data (in thousands):\n\nWe had revenue of \x0b$ 116,609 in 2011\n\nWe had revenue of \n\n$ 117,029 in 2025\n\nWe had revenue of $85,965 in 2020', metadata={'source': 'research/test_docs/Test Case112923  Compared to Meta CSV File.ppt'})]

In [9]:
# Try the HTML reader on a sample page using the shared `splitter`.
read_html("research/test_docs/test.html", splitter)

[Document(page_content="Upload a File\n\nHey how are you doing? We are testing the html file loading. By the way,let's talk about langchain.\n\n        LangChain is a framework designed for developing applications powered by language models. It provides a suite of tools and abstractions to facilitate the creation and integration of language models into various applications, especially those involving complex workflows and tasks. LangChain helps streamline the process of connecting language models with data sources, APIs, and other components necessary for building sophisticated language-driven applications.\n\nKey features of LangChain include:\n\nIntegration with Various Language Models: LangChain supports integration with multiple language models, making it easier to switch between models or use multiple models within a single application.", metadata={'source': 'research/test_docs/test.html'}),
 Document(page_content='Integration with Various Language Models: LangChain supports integ

In [10]:
# Try the plain-text reader on a sample .txt file using the shared `splitter`.
read_txt("research/test_docs/Contractor Guidelines_ort.txt", splitter)

[Document(page_content='CONTRACTOR’S RESPONSIBILITY FOR PROJECT SAFETY\nGuidance\nContract \n1. Contractor recognizes the importance of performing the Work in a safe and responsible manner so as to prevent damage, injury, or loss to individuals, the environment, and the Work, including materials and equipment incorporated into the Work or stored on-site or off-site. Contractor assumes responsibility for implementing and monitoring all Environment, Health & Safety (EHS) precautions and programs related to the performance of the Work.\n2. Contractor and Subcontractors shall comply with all legal and Owner-specific reporting', metadata={'source': 'research/test_docs/Contractor Guidelines_ort.txt'}),
 Document(page_content='2. Contractor and Subcontractors shall comply with all legal and Owner-specific reporting\nrequirements relating to EHS set forth in the Contract Documents. Contractor will immediately report oral, and in writing within two (2) days, any EHS related injury, loss, damage

In [9]:
# Chunk a sample Word document and keep the result in `docs` for the next cells.
# NOTE(review): `read_docx` only exists as commented-out code in the definitions
# cell, so this call needs that function to be defined first.
docs = read_docx("research/test_docs/112423 Google Drive Folder Test Outline v1 (1).docx", splitter)

In [10]:
# Display the chunks assigned to `docs` by the read_docx cell.
docs

[Document(page_content='Tron Electrical & Automation: Google Review Campaign\n\nThe purpose of the Google Review Campaign was to gain immediate feedback from customers about the technicians’ service. The intrinsic benefit to the campaign is growing Tron’s email marketing list.\n\nCampaign Duration: May 7th, 2023 to May 31st, 2023. \n\nThe performance metrics available are listed below:\n\nNew Subscribers\n\n\n\nSubscriber1@email.com\n\nSubscriber2@email.com\n\nSubscriber3@email.com\n\nSubscriber4@email.com\n\n\n\nThe number of submissions during the Period is 17.\n\nThe technician that used the Google Review campaign on the DCard was John Ricupero with 82.4% of responses.\n\nAll of these email addresses alongside the name of the technician that submitted the information is stored in the Tron email marketing account.', metadata={'source': 'research/test_docs/112423 Google Drive Folder Test Outline v1 (1).docx'}),
 Document(page_content='All of these email addresses alongside the name of

In [11]:
# Print the raw text of the second chunk (index 1) so newlines render as line breaks.
print(docs[1].page_content)

All of these email addresses alongside the name of the technician that submitted the information is stored in the Tron email marketing account.

Recommendations.
Incentive Program. To encourage the other technicians to participate in the campaigns. Restructuring the current incentive package could help. Without direct encouragement, the Organization is unable to confirm whether the technician is properly introducing themselves before and after the service is provided. Strong customer service is the backbone of Tron’s success. As such, reinforcing this core value throughout its client engagements is essential.

Paid Account. Transitioning to a paid email account will increase the scope of the automations that can be leveraged on Tron’s behalf. Further, deeper metrics and learning is available to the marketing team.


Campaign Structure
Source: DCard





 



Contact Stored in Marketing 
Database for Retargeting

Contact Stored in Marketing 
Database for Retargeting


In [5]:
from langchain_community.document_loaders import S3DirectoryLoader

# Create a loader for the "testmyne45" S3 bucket (the same name appears in the
# s3:// URIs used elsewhere in this notebook).
# NOTE(review): presumably needs AWS credentials in the environment — confirm.
loader = S3DirectoryLoader("testmyne45")

In [6]:
# Load every object the S3DirectoryLoader finds.
# NOTE(review): the recorded run ended in a ValueError because the bucket holds
# an .mp4 file whose type is not supported by the partitioner; unsupported
# objects need to be excluded for this cell to succeed.
loader.load()

The MIME type of '/tmp/tmpwmdxv3nx/myne-user-onboarding.mp4' is 'video/webm'. This file type is not currently supported in unstructured.


ValueError: Invalid file /tmp/tmpwmdxv3nx/myne-user-onboarding.mp4. The FileType.UNK file type is not supported in partition.

In [3]:
!pip install amazon-textract-textractor 

Collecting amazon-textract-textractor
  Downloading amazon_textract_textractor-1.8.2-py3-none-any.whl.metadata (9.7 kB)
Collecting editdistance<0.9,>=0.6.2 (from amazon-textract-textractor)
  Downloading editdistance-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading amazon_textract_textractor-1.8.2-py3-none-any.whl (307 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.6/307.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m[31m1.9 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading editdistance-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (401 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.8/401.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m[31m5.8 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: editdistance, amazon-textract-textractor
Successfully installed amazon-textract-textractor-1.8.2 editdistance-0.8.1


In [13]:
from langchain_community.document_loaders import AmazonTextractPDFLoader

from docx2pdf import convert


# Run the Amazon Textract loader directly on a PDF addressed by an s3:// URI.
# NOTE(review): presumably requires AWS credentials and access to the bucket — confirm.
loader = AmazonTextractPDFLoader("s3://testmyne45/112423 Google Drive Folder Test Outline v1.pdf")
documents = loader.load()

In [18]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Load the local copy of the test-outline PDF with PyPDFLoader.
# NOTE: this reassigns `pdf_path`, `loader` and `pages1`, which earlier cells also set.
pdf_path = 'research/test_docs/112423 Google Drive Folder Test Outline v1.pdf'

# extract_images=True: also pull text out of images embedded in the PDF.
loader = PyPDFLoader(pdf_path, extract_images=True)
pages1 = loader.load()




In [29]:
# Chunk a PDF stored in S3 through the Textract-backed reader.
pdf_path_s3 = "s3://testmyne45/112423 Google Drive Folder Test Outline v1.pdf"
# The Textract reader defined in this notebook is `read_with_aws`; the previous
# name `read_pdf_aws` is not defined in the visible cells and raises NameError
# on a fresh kernel.
read_with_aws(pdf_path_s3, splitter)

[Document(page_content="WG\n\n\nTron Electrical & Automation: Google Review Campaign\n\n\nThe purpose of the Google Review Campaign was to gain immediate feedback from\n\n\ncustomers about the technicians' service. The intrinsic benefit to the campaign is\n\n\ngrowing Tron's email marketing list.\n\n\nCampaign Duration: May 7th, 2023 to May 31st, 2023.\n\n\nThe performance metrics available are listed below:\n\n\nNew Subscribers\n\n\nSubscriber1@email.com\n\n\nSubscriber2@email.com\n\n\nSubscriber3@email.com\n\n\nSubscriber4@email.com\n\n\nThe number of submissions during the Period is 17.\n\n\nThe technician that used the Google Review campaign on the DCard was John\n\n\nRicupero with 82.4% of responses.\n\n\nAll of these email addresses alongside the name of the technician that submitted\n\n\nthe information is stored in the Tron email marketing account.\n\n\nRecommendations.\n\n\nIncentive Program. To encourage the other technicians to participate in the\n\n\ncampaigns. Restructurin

In [10]:
# Chunk an image through the Textract-backed reader (`read_with_aws`; the old
# name `read_pdf_aws` is not defined in the visible cells).
# NOTE(review): `images_path` is not assigned in any cell visible here — define
# it before running this cell.
read_with_aws(images_path, splitter)

[Document(page_content='Contact\n\n\nHrisikesh Neogi\n\n\nhriskeshneogi@gmail.com\n\n\nSoftware Engineer@CN I Gen All I NLP I LLM I LangChain I Prompt\n\n\nwwwJinkedin.com/in/hrisikesh-\n\n\nEngineering I Machine Learning I YouTuber @HrisikeshUnleashesAl\n\n\nneogi (Linked In)\n\n\nBengaluru Karnataka India\n\n\nwww.facebook.cam/dishi0163\n\n\n(Personal)\n\n\nSummary\n\n\nTop Skills\n\n\nA combination of Arts and Science. a data enthusiast with a passion\n\n\nGt\n\n\nfor data-driven problem-solving. Prior a B.A. student, now A Data\n\n\nvector db\n\n\nGeek Where B.A gives pleasure. Data Science gives a joy of\n\n\nlangchain\n\n\ncreativity, the power of A.I. and both the Arts and Science, create a\n\n\nhuge pillar for building any solutions\n\n\nA Bengali by heart An Indian by culture.\n\n\nExperience\n\n\nCapital Numbers\n\n\nSoftware Engineer ( Gen Al/AIML)\n\n\nMay 2024 - Present (2 months)\n\n\nKolkata West Bengal India\n\n\nPW (PhysicsWallah)\n\n\nAssociate Manager and Data Scient