## **Google Drive Documents Question Answering**

# Install Dependencies


In [None]:
# Document parsing: DOCX reader, PDF reader, OCR bindings.
%pip install -q python-docx
%pip install -q PyMuPDF
# System OCR engine. -y answers apt's confirmation prompt, which a notebook
# cell has no way to answer interactively.
!sudo apt-get install -y -q tesseract-ocr
%pip install -q pytesseract
# The notebook imports from langchain.embeddings.openai / langchain.vectorstores
# and calls pinecone.init(), so keep both libraries on releases that still
# provide those entry points instead of pulling whatever is newest.
# NOTE(review): bounds chosen from the APIs used below; tighten to exact
# versions once a known-good environment has been recorded.
%pip install -q --upgrade "langchain<0.2" openai
%pip install -q tiktoken
%pip install -q "pinecone-client<3"
!apt-get install -y -q poppler-utils
%pip install -q unstructured
# Quoted so the shell cannot treat the [extras] brackets as a glob pattern.
%pip install -q "unstructured[local-inference]"
%pip install -q "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"

Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/239.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0
Collecting PyMuPDF
  Downloading PyMuPDF-1.23.22-cp310-none-manylinux2014_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.23.22 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.22-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.6/30.6 MB[0m [31m41.6 MB/s[0

# Authenticate with Google Drive in Colab




In [None]:
from google.colab import auth
auth.authenticate_user()

# Google Drive File Processing Script

 [This Python script interacts with the Google Drive API to retrieve and process various types of files from Google Drive. It exports content from Google Docs, extracts text from PDFs (or performs OCR if necessary), processes plain text files, extracts text from DOCX files, and retrieves and processes Python files.]


In [None]:
# from google.colab import auth
from googleapiclient.discovery import build
import io
from googleapiclient.http import MediaIoBaseDownload
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
from docx import Document

def get_folder_hierarchy(drive_service, folder_id):
    """Return the folder names leading from the top-most ancestor down to ``folder_id``.

    Starting at ``folder_id``, the function fetches each folder's ``name`` and
    ``parents`` and then moves on to the first listed parent, until a folder
    without parents is reached.

    Args:
        drive_service: Drive service object exposing ``files().get(...).execute()``.
        folder_id: ID of the folder to start from. A falsy value yields ``[]``.

    Returns:
        list: Folder names ordered from the top-most ancestor to the starting
        folder (the starting folder's own name is the last element).
    """
    names_bottom_up = []
    while folder_id:
        folder_info = drive_service.files().get(fileId=folder_id, fields="name, parents").execute()
        names_bottom_up.append(folder_info["name"])
        # A missing *or empty* parents list both mean there is nothing above.
        parents = folder_info.get("parents") or []
        folder_id = parents[0] if parents else None
    # Names were collected child-first; callers expect root-first.
    names_bottom_up.reverse()
    return names_bottom_up

# Authenticate with Google Colab
# auth.authenticate_user()

# Build the Drive API service
drive_service = build('drive', 'v3')


def _download_bytes(request):
    """Run a Drive download/export request chunk by chunk and return all bytes."""
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _status, done = downloader.next_chunk()
    return buffer.getvalue()


def _prefix_lines(text, prefix):
    """Return ``text`` with ``prefix`` prepended to every newline-separated line."""
    return "\n".join([f"{prefix}{line}" for line in text.split('\n')])


# Retrieve every file and folder without any query filter. A single list()
# call returns at most `pageSize` entries, so keep following nextPageToken
# until the listing is exhausted.
files = []
page_token = None
while True:
    response = drive_service.files().list(
        pageSize=1000,
        fields="nextPageToken, files(id, name, mimeType, parents)",
        pageToken=page_token
    ).execute()
    files.extend(response.get('files', []))
    page_token = response.get('nextPageToken')
    if not page_token:
        break

original_text_file_names = set()
text_sections = []   # one formatted section per ingested file, joined at the end
folder_lookup = {}   # parent folder id -> (folder name, " > "-joined folder path)
separator = "=" * 40 + "\n"

parent_folder_id = None

if not files:
    print("No files found in Google Drive.")
else:
    for file in files:
        file_name = file['name']
        file_id = file['id']
        mime_type = file.get('mimeType', '')
        parents = file.get('parents', [])

        # Decide the parent per file. Carrying the previous file's parent over
        # would label a parent-less file with an unrelated folder.
        parent_folder_id = parents[0] if parents else None

        if parent_folder_id:
            if parent_folder_id not in folder_lookup:
                # The hierarchy ends with the parent folder itself, so its last
                # element doubles as the folder name; caching avoids repeating
                # the same API walk for every file in a folder.
                folder_hierarchy = get_folder_hierarchy(drive_service, parent_folder_id)
                folder_lookup[parent_folder_id] = (
                    folder_hierarchy[-1] if folder_hierarchy else "",
                    " > ".join(folder_hierarchy),
                )
            folder_name, folder_path = folder_lookup[parent_folder_id]
            parent_folder_link = f"https://drive.google.com/drive/folders/{parent_folder_id}"
        else:
            # No parent folder: no name, no link, empty path.
            folder_name = None
            folder_path = ""
            parent_folder_link = None

        # Two-line header shared by every section, and the per-line path tag.
        header = (
            f"Folder Name: {folder_name} {parent_folder_link}\n"
            f"File Name: {file_name} https://drive.google.com/file/d/{file_id}\n"
        )
        line_prefix = f"Folder name : {folder_path} > "

        if mime_type == 'application/vnd.google-apps.document':
            # Google Docs have no binary body; export them as plain text.
            request = drive_service.files().export_media(fileId=file_id, mimeType='text/plain')
            text_content = _download_bytes(request).decode('utf-8')
            text_sections.append(
                header
                + "Google Docs Content:\n"
                + _prefix_lines(text_content, line_prefix) + "\n"
                + separator
            )

        elif mime_type == 'application/pdf':
            pdf_content = _download_bytes(drive_service.files().get_media(fileId=file_id))

            # The context manager closes the document once its pages are read.
            with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
                extracted_text = ""
                for page_num in range(pdf_document.page_count):
                    extracted_text += pdf_document.load_page(page_num).get_text()

                if extracted_text.strip():
                    # The PDF carries a text layer; use it directly.
                    section = (
                        header
                        + "PDF Text Content:\n"
                        + _prefix_lines(extracted_text, line_prefix) + "\n"
                        + separator
                    )
                else:
                    # No text layer: render each page and run OCR on the image.
                    ocr_result = ""
                    for page_num in range(pdf_document.page_count):
                        pixmap = pdf_document.load_page(page_num).get_pixmap()
                        page_image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
                        ocr_result += pytesseract.image_to_string(page_image)

                    section = (
                        header
                        + "Performing OCR on scanned PDF...\n"
                        + "OCR Result:\n"
                        + _prefix_lines(ocr_result, f"{folder_path} > ") + "\n"
                        + separator
                    )
            text_sections.append(section)

        elif mime_type == 'text/plain':
            # Only .txt files, and only the first file seen with a given name.
            if file_name.endswith('.txt') and file_name not in original_text_file_names:
                text_content = _download_bytes(drive_service.files().get_media(fileId=file_id)).decode('utf-8')
                text_sections.append(
                    header
                    + "Text File Content:\n"
                    + _prefix_lines(text_content, line_prefix) + "\n"
                    + separator
                )

                # Remember the full file name so a same-named file is not ingested again.
                original_text_file_names.add(file_name)

        elif mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            docx_content = _download_bytes(drive_service.files().get_media(fileId=file_id))
            document = Document(io.BytesIO(docx_content))
            extracted_text = [paragraph.text for paragraph in document.paragraphs]

            # DOCX paragraphs are stored without the per-line folder tag.
            text_sections.append(
                header
                + "DOCX Text Content:\n"
                + "\n".join(extracted_text) + "\n"
                + separator
            )

        elif file_name.endswith('.py'):
            python_content = _download_bytes(drive_service.files().get_media(fileId=file_id)).decode('utf-8')
            text_sections.append(
                header
                + "Python File Content:\n"
                + _prefix_lines(python_content, line_prefix) + "\n"
                + separator
            )

        # Add more MIME type processing here

all_text_content = "".join(text_sections)

# Print the accumulated text content for all files in Google Drive
print(all_text_content)
print(len(all_text_content))


Folder Name: Colab Notebooks https://drive.google.com/drive/folders/1PMTCq-RbU4U7dQ0uT-QxX3C6Fkn5RZ9Q
File Name: Food Menu Charcoal Surat (10J).pdf https://drive.google.com/file/d/1NtPt9J09_L5O5SC4exKMYkSK417f6KPs
PDF Text Content:
Folder name : My Drive > Colab Notebooks > 550
Folder name : My Drive > Colab Notebooks > 575
Folder name : My Drive > Colab Notebooks > K's Cheese Tortellini
Folder name : My Drive > Colab Notebooks > 550
Folder name : My Drive > Colab Notebooks > 550
Folder name : My Drive > Colab Notebooks > 550
Folder name : My Drive > Colab Notebooks > 550
Folder name : My Drive > Colab Notebooks > 550
Folder name : My Drive > Colab Notebooks > 550
Folder name : My Drive > Colab Notebooks > (Available in pomodoro, Basil Pesto, Alfredo,
Folder name : My Drive > Colab Notebooks > Lemon Butter Garlic)
Folder name : My Drive > Colab Notebooks > R I S O T T O
Folder name : My Drive > Colab Notebooks > Porcini Mushroom Risotto
Folder name : My Drive > Colab Notebooks > Risott

# Set OpenAI API Key
This code cell sets the OpenAI API key as an environment variable, which is required for authentication.

Replace the placeholder with your actual OpenAI API key.

In [None]:
import os
# Store the key in the OPENAI_API_KEY environment variable for this kernel.
# NOTE(review): replace the placeholder locally only; never save or commit the
# notebook with a real key in this cell (consider getpass or a secrets store).
os.environ["OPENAI_API_KEY"] = "*****" # write your own

# Creating OpenAI Text Embeddings

This code cell initializes OpenAI text embeddings using the "text-embedding-ada-002" model.

It also specifies a maximum number of retries for API requests.

Replace the model name and adjust the number of retries as needed for your use case.

In [None]:
# Creating Embeddings
import openai
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",max_retries = 5) # Maximum number of retries

# Chunking Documents for Text Analysis

This code cell defines a function to split a large text document into smaller chunks for text analysis.

It uses the RecursiveCharacterTextSplitter with specified chunk size and overlap parameters.

Replace the chunk_size and chunk_overlap values to customize the chunking process as needed.


In [None]:
# Chunking documents
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(all_text_content,chunk_size=2100,chunk_overlap=20):
  """Split one large string into chunks with RecursiveCharacterTextSplitter.

  Args:
    all_text_content: The text to split.
    chunk_size: Forwarded to the splitter as ``chunk_size``.
    chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

  Returns:
    Whatever ``split_text`` produces for the given text.
  """
  splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
  return RecursiveCharacterTextSplitter(**splitter_options).split_text(all_text_content)

docs = split_docs(all_text_content)
print(len(docs))

119


# Initialize Pinecone and Create an Index

This code cell initializes Pinecone with your API key and specifies the environment.

It also sets up an index named "trial" for use with Pinecone.

Replace the API key and environment as needed based on your Pinecone configuration.


In [None]:
import pinecone
from langchain.vectorstores import Pinecone
# initialize pinecone
# NOTE(review): pinecone.init() looks like the pinecone-client 2.x entry point;
# confirm the installed client version still provides it.
# NOTE(review): fill in the key locally only; do not save or commit the
# notebook with a real API key in this cell.
pinecone.init(
    api_key="***********************",  # find at app.pinecone.io ,WRITE YOUR OWN
    environment="asia-southeast1-gcp-free"  # next to api key in console,WRITE YOUR OWN
)

# Name of the Pinecone index used by the cells that follow.
index_name = "trial"

# Store Pinecone Index with Text Embeddings


In [None]:
index = Pinecone.from_texts(docs, embeddings, index_name=index_name)

# Initialize ChatOpenAI Language Model

 This code cell initializes a language model for chat-based interactions using the 'ChatOpenAI' class.

 You can specify the desired model name (e.g., "gpt-3.5-turbo" or "gpt-4").

 Ensure you have the required library or package (in this case, 'langchain') installed before running this cell.

 Model Options:
 - "text-davinci-003" (Davinci completion model; use `langchain.llms.OpenAI` instead of `ChatOpenAI` for this one)
 - "gpt-3.5-turbo" (GPT-3.5 Turbo model)
 - "gpt-4" (GPT-4 model)

In [None]:
# from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
# model_name = "text-davinci-003"
model_name = "gpt-3.5-turbo"
# model_name = "gpt-4"
llm = ChatOpenAI(model_name=model_name)

# Set up Question Answering with RetrievalQA

This code cell sets up a Question Answering (QA) system using the 'RetrievalQA' class.

It integrates your language model ('llm') and a Pinecone index ('docsearch') for document retrieval.

Make sure you have loaded the Pinecone index and initialized the language model before running this cell.


In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

# Create RetrievalQA object
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())


# Querying the Question Answering System

In [None]:
query="generate a detailed summary of LCV Ensembled Bagging model"
result = qa.run(query)
print(result)

The LCV Ensembled Bagging model is a technique used to improve the performance and accuracy of machine learning algorithms. It involves using two or more homogeneous models to predict a single outcome. In the case of the LCV Ensembled Bagging model, it was used for detecting plants with disease.

Initially, the model was able to detect 7 out of 10 plants in a frame, resulting in around 70% accuracy. However, it was unable to perfectly distinguish between the scores. To address this issue, a bagging technique was applied, where two detection models were used in a single inference. This improved the performance of the model, allowing it to detect all 10 plants in a frame, achieving 100% accuracy for plant detection.

Despite the improvement in plant detection, the model still struggled with distinguishing between scores. To further enhance its capabilities, a classification model was added after the detection process. This allowed the model to clearly classify between scores, improving i

In [None]:
query="State the strategies for fixing failing tests"
result = qa.run(query)
print(result)

The strategies for fixing failing tests include:

1. Changing hyper-parameters: Adjusting the settings or parameters of the test to minimize flakiness.
2. Updating assertion bounds: Modifying the acceptable range of values used in the test for comparing the end-result.
3. Refactoring the assertion or test: Modifying the assertion statement or the test itself to improve its reliability.
4. Fixing code under test: Debugging and addressing any bugs or issues in the code that may be causing the test failures.
5. Manual investigation: For tests that cannot be automatically fixed, manually investigating and determining alternative fixes or sending bug reports to developers.


In [None]:
query="A new Phenotyping Documentation file is there, in which folder should it be moved? "
result = qa.run(query)
print(result)

The new Phenotyping Documentation file should be moved to the "My Drive > Test > PDF" folder.


In [None]:
# The question text had been pasted after the qa.run(...) call, which is a
# syntax error and left the query blank; it belongs in the query string.
query = "what is the Folder name of Phenotyping Master Documentation (1).pdf?"
result = qa.run(query)
print(result)

The folder name of "Phenotyping Master Documentation (1).pdf" is "My Drive > Test > PDF".


In [None]:
query="A new Python documentation file is there, in which folder moved to?  "
result = qa.run(query)
print(result)

The new Python documentation file is located in the "My Drive > Test > DOCS" folder.


In [None]:
query= "generate a detailed summary of LCV Ensembled bagging model (V1.1) from documents"
result = qa.run(query)
print(result)

The LCV Ensembled bagging model (V1.1) is a model that was used to detect plants with diseases. It was able to detect 7 out of 10 plants in a frame, with an accuracy of around 70%. However, the model was unable to perfectly distinguish between scores. The bagging technique, which is an ensemble learning technique, was used to improve the performance and accuracy of the model. By using 2 or more homogeneous models to predict a single outcome, the bagging technique helped to increase the performance of the model, resulting in the detection of all 10 plants in a frame with 100% accuracy.


## **--------------- Start to move the files to folder --------------**




In [None]:
from google.colab import auth
auth.authenticate_user()

In [None]:
from googleapiclient.discovery import build

# Build a Drive API client for version 'v3'; edit the version string here if a
# different API version is needed.
# NOTE(review): no credentials are passed, so this presumably relies on the
# auth.authenticate_user() cell above having been run — confirm.
drive_service = build('drive', 'v3')

# List Files in the Root Directory of Google Drive


In [None]:
# Ask Drive for the direct children of the root folder (one page, up to 50 entries).
results = drive_service.files().list(q="'root' in parents", pageSize=50).execute()
items = results.get('files', [])

if items:
    print('Files directly in the root directory:')
    for item in items:
        # Folders are skipped; only non-folder entries are reported.
        if item['mimeType'] == 'application/vnd.google-apps.folder':
            continue
        print(f"File: {item['name']} ({item['mimeType']})")
else:
    print('No files or folders found in the root directory.')


Files directly in the root directory:
File: Week7_Industry Mentor Log Book.docx (application/vnd.openxmlformats-officedocument.wordprocessingml.document)


# Collect the Root-Directory File Names from Google Drive into a List

In [None]:
# Same root-folder query as above (one page, up to 20 entries), but the
# report lines are collected in a list instead of being printed one by one.
results = drive_service.files().list(q="'root' in parents", pageSize=20).execute()
items = results.get('files', [])

file_name = []

if items:
    file_name.append('Files directly in the root directory:')
    for item in items:
        # Folders are skipped; only non-folder entries are recorded.
        if item['mimeType'] == 'application/vnd.google-apps.folder':
            continue
        file_name.append(f"File: {item['name']} ({item['mimeType']})")
else:
    file_name.append('No files or folders found in the root directory.')

print(file_name)

['Files directly in the root directory:', 'File: Week7_Industry Mentor Log Book.docx (application/vnd.openxmlformats-officedocument.wordprocessingml.document)']


# Suggest Folders for Files in Google Drive and Extract Content (Chat-based QA)

This code cell suggests folders in Google Drive for each file in the root directory.

It utilizes a Chat-based Question Answering (QA) system to make folder suggestions based on file names.

If the user chooses to move a file, it can be moved to the suggested folder.

Additionally, it extracts content from various file types, including Google Docs, PDFs, plain text, DOCX, and more.

 Make sure to configure your Google Drive API service, Pinecone index, and language model ('llm').




In [None]:
from langchain.chains import RetrievalQA
from googleapiclient.discovery import build
import os
import re
from langchain.prompts import PromptTemplate
import io
from googleapiclient.http import MediaIoBaseDownload
from docx import Document
from PIL import Image
import pytesseract
import fitz

def suggest_folders_for_files(drive_service, index_name, llm):
    """Suggest a destination folder for each root-level file and optionally move it.

    For every non-folder entry directly under the Drive root, a RetrievalQA
    chain backed by the Pinecone index is asked for a folder name and URL.
    The suggestion is printed and the user is prompted; on "yes" the folder ID
    is parsed from the suggested URL, the file is moved, and its content is
    extracted.

    Relies on the module-level names ``embeddings``, ``Pinecone``,
    ``move_file_to_folder`` and ``get_file_content``.

    Args:
        drive_service: Drive service object used for listing and moving files.
        index_name: Name of the existing Pinecone index to retrieve from.
        llm: Language model handed to ``RetrievalQA.from_chain_type``.

    Returns:
        str: Accumulated context (file name, suggestion and extracted content)
        for every file that was moved; an empty string when nothing was moved
        or the root directory is empty.
    """
    # List files in Google Drive
    results = drive_service.files().list(q="'root' in parents", pageSize=20).execute()
    items = results.get('files', [])

    if not items:
        print('No files or folders found in the root directory.')
        # Return an empty string (not None) so callers can always treat the
        # result as text.
        return ""

    # Initialize Pinecone index
    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

    template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Don't try to create a new URL, just suggest a URL from the context.
    {context}
    Question: {question}
    Helpful Answer:"""

    # The prompt and the chain do not depend on the file, so build them once.
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=docsearch.as_retriever(),
        chain_type_kwargs={"prompt": PromptTemplate.from_template(template)}
    )

    # Accumulated context of every moved file
    all_file_contexts = ""

    # Set to store original text file names (to avoid processing multiple times)
    original_text_file_names = set()

    for item in items:
        if item['mimeType'] == 'application/vnd.google-apps.folder':
            continue

        file_id = item['id']
        file_name = item['name']

        question = f"Suggest a Folder name and its URL for the file '{file_name}'"
        suggestion = qa_chain({"query": question})['result']

        print(f"File: {file_name}")
        print(f"Suggested Folder: {suggestion}")

        # Ask the user whether to move the file to the suggested folder
        move_file = input("Do you want to move this file to the suggested folder? (yes/no): ").strip().lower()
        if move_file != 'yes':
            print(f"Skipping '{file_name}'...\n")
            continue

        # Pull the folder ID out of the suggested URL. Restricting the match to
        # letters, digits, '_' and '-' stops at a newline or trailing punctuation.
        match = re.search(r'folders/([A-Za-z0-9_-]+)', suggestion)
        if not match:
            # Without a folder ID there is nothing safe to move to; skipping
            # avoids reusing an ID left over from a previous file.
            print("No match found.")
            print(f"Skipping '{file_name}'...\n")
            continue

        folder_id = match.group(1)
        print(folder_id)

        # Record the file's context before moving it
        all_file_contexts += f"File: {file_name}\nSuggested Folder: {suggestion}\n"

        move_file_to_folder(drive_service, file_id, folder_id)
        print(f"Moved '{file_name}' to folder '{suggestion}'\n")

        # Retrieve the content of the file and add it to the accumulated context
        file_content = get_file_content(drive_service, file_id, file_name, suggestion, original_text_file_names)
        all_file_contexts += file_content

    return all_file_contexts

def move_file_to_folder(drive_service, file_id, folder_id):
    """Move a Drive file into ``folder_id``, detaching it from its current parents.

    Args:
        drive_service: Drive service object exposing ``files().get`` / ``files().update``.
        file_id: ID of the file to move.
        folder_id: ID of the destination folder.

    Returns:
        bool: True when the update request completed, False when any step
        raised (the error is printed, not re-raised).
    """
    try:
        file = drive_service.files().get(fileId=file_id, fields='parents').execute()
        # A file may come back without a parents entry; treat that as "nothing
        # to remove" instead of failing before the move is even attempted.
        previous_parents = ",".join(file.get('parents') or [])

        update_args = {'fileId': file_id, 'addParents': folder_id}
        if previous_parents:
            update_args['removeParents'] = previous_parents

        drive_service.files().update(**update_args).execute()
        return True
    except Exception as e:
        print(f"An error occurred while moving the file: {str(e)}")
        return False

def _fetch_drive_bytes(request):
    """Run a Drive download/export request chunk by chunk and return all bytes."""
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _status, done = downloader.next_chunk()
    return buffer.getvalue()


def _tag_lines(text, prefix):
    """Return ``text`` with ``prefix`` prepended to every newline-separated line."""
    return "\n".join([f"{prefix}{line}" for line in text.split('\n')])


def get_file_content(drive_service, file_id, file_name, folder_path, original_text_file_names):
    """Extract a Drive file's text and wrap it in the notebook's section format.

    Supported inputs: Google Docs (exported as plain text), PDFs (text layer,
    or OCR when the text layer is empty), ``text/plain`` files named ``*.txt``,
    DOCX files, and files whose name ends in ``.py``.

    Args:
        drive_service: Drive service object used to fetch metadata and content.
        file_id: ID of the file to read.
        file_name: Display name of the file; also used for the ``.txt`` / ``.py`` checks.
        folder_path: Text written after "Folder Name:" and in each line's tag.
        original_text_file_names: Set of ``.txt`` names already processed; the
            name is added to it when a text file is read.

    Returns:
        str: The formatted section; ``""`` when the file type is unsupported
        or a ``.txt`` name was already processed; the literal
        ``"Error getting file content."`` when any step raises (the error is
        printed, not re-raised).
    """
    try:
        file = drive_service.files().get(fileId=file_id).execute()
        mime_type = file['mimeType']

        # Pieces shared by every section type.
        header = (
            f"Folder Name: {folder_path}\n"
            f"File Name: {file_name} https://drive.google.com/file/d/{file_id}\n"
        )
        line_prefix = f"Folder name : {folder_path} > "
        separator = "=" * 40 + "\n"

        file_content = ""

        if mime_type == 'application/vnd.google-apps.document':
            # Google Docs have no binary body; export them as plain text.
            request = drive_service.files().export_media(fileId=file_id, mimeType='text/plain')
            text_content = _fetch_drive_bytes(request).decode('utf-8')
            file_content += header
            file_content += "Google Docs Content:\n"
            file_content += _tag_lines(text_content, line_prefix) + "\n"
            file_content += separator

        elif mime_type == 'application/pdf':
            pdf_content = _fetch_drive_bytes(drive_service.files().get_media(fileId=file_id))

            # The context manager closes the document once its pages are read.
            with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
                extracted_text = ""
                for page_num in range(pdf_document.page_count):
                    extracted_text += pdf_document.load_page(page_num).get_text()

                if extracted_text.strip():
                    # The PDF carries a text layer; use it directly.
                    file_content += header
                    file_content += "PDF Text Content:\n"
                    file_content += _tag_lines(extracted_text, line_prefix) + "\n"
                    file_content += separator
                else:
                    # No text layer: render each page and run OCR on the image.
                    file_content += header
                    file_content += "Performing OCR on scanned PDF...\n"

                    ocr_result = ""
                    for page_num in range(pdf_document.page_count):
                        pixmap = pdf_document.load_page(page_num).get_pixmap()
                        page_image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
                        ocr_result += pytesseract.image_to_string(page_image)

                    file_content += f"OCR Result:\n"
                    file_content += _tag_lines(ocr_result, f"{folder_path} > ") + "\n"
                    file_content += separator

        elif mime_type == 'text/plain':
            # Only .txt files, and only the first file seen with a given name.
            if file_name.endswith('.txt') and file_name not in original_text_file_names:
                text_content = _fetch_drive_bytes(drive_service.files().get_media(fileId=file_id)).decode('utf-8')
                file_content += header
                file_content += "Text File Content:\n"
                file_content += _tag_lines(text_content, line_prefix) + "\n"
                file_content += separator

                # Remember the full file name so a same-named file is not read again.
                original_text_file_names.add(file_name)

        elif mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            docx_content = _fetch_drive_bytes(drive_service.files().get_media(fileId=file_id))
            document = Document(io.BytesIO(docx_content))
            extracted_text = [paragraph.text for paragraph in document.paragraphs]

            # DOCX paragraphs are stored without the per-line folder tag.
            file_content += header
            file_content += "DOCX Text Content:\n"
            file_content += "\n".join(extracted_text) + "\n"
            file_content += separator

        elif file_name.endswith('.py'):
            python_content = _fetch_drive_bytes(drive_service.files().get_media(fileId=file_id)).decode('utf-8')
            file_content += header
            file_content += "Python File Content:\n"
            file_content += _tag_lines(python_content, line_prefix) + "\n"
            file_content += separator

        # Add more MIME type processing here

        return file_content

    except Exception as e:
        print(f"An error occurred while getting the file content: {str(e)}")
        return "Error getting file content."

# Usage example:
# Replace these placeholders with actual values and configure your Google Drive API service
# drive_service = build('drive', 'v3', ...)
# index_name = 'your_pinecone_index_name'
# llm = 'your_llm_model_name'

# Call the function and store the accumulated file contexts
all_file_contexts = suggest_folders_for_files(drive_service, index_name, llm)

# Print all the files' content which were moved
print("All File Contents for Moved Files:")
print(all_file_contexts)


File: Week7_Industry Mentor Log Book.docx
Suggested Folder: Folder Name: WEEKLY_REPORTS 
URL: https://drive.google.com/drive/folders/1xFY4AdtqN4BXVLaIFrqTtH9Qc3EuCCoK
Do you want to move this file to the suggested folder? (yes/no): yes
1xFY4AdtqN4BXVLaIFrqTtH9Qc3EuCCoK
Moved 'Week7_Industry Mentor Log Book.docx' to folder 'Folder Name: WEEKLY_REPORTS 
URL: https://drive.google.com/drive/folders/1xFY4AdtqN4BXVLaIFrqTtH9Qc3EuCCoK'

All File Contents for Moved Files:
File: Week7_Industry Mentor Log Book.docx
Suggested Folder: Folder Name: WEEKLY_REPORTS 
URL: https://drive.google.com/drive/folders/1xFY4AdtqN4BXVLaIFrqTtH9Qc3EuCCoK
Folder Name: Folder Name: WEEKLY_REPORTS 
URL: https://drive.google.com/drive/folders/1xFY4AdtqN4BXVLaIFrqTtH9Qc3EuCCoK
File Name: Week7_Industry Mentor Log Book.docx https://drive.google.com/file/d/1fmjLmE8_MN_E2XlK1jaqlPBCHei07mGI
DOCX Text Content:
SVKM’s NMIMS University 
Mukesh Patel School of Technology Management & Engineering 
Department of Computer Engi

# Store the Moved Files' Text in Pinecone (same process as above)

In [None]:
# Creating Embeddings for moved files
import openai
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",max_retries = 5)

In [None]:
# Chunking documents for the moved files.
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(all_file_contexts,chunk_size=2000,chunk_overlap=20):
  """Split the moved files' combined text into chunks with RecursiveCharacterTextSplitter.

  Args:
    all_file_contexts: The text to split.
    chunk_size: Forwarded to the splitter as ``chunk_size``.
    chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

  Returns:
    Whatever ``split_text`` produces for the given text.
  """
  splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
  return RecursiveCharacterTextSplitter(**splitter_options).split_text(all_file_contexts)

docs = split_docs(all_file_contexts)
print(len(docs))

1


In [None]:
# Embed the moved files' chunks (docs) and upload them to the Pinecone index
# named by index_name.
index = Pinecone.from_texts(docs, embeddings, index_name=index_name)