In [None]:
import io
import os
import pickle
import tempfile

import pandas as pd
from docx import Document
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from langchain_community.document_loaders import PyPDFLoader
from pptx import Presentation

# OAuth scope requested by authenticate(): read-only access to Google Drive.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

def authenticate():
    """Authenticate with Google Drive and return a Drive v3 API client.

    Cached credentials are loaded from ``token.pickle`` in the current
    working directory when that file exists. If no credentials are cached,
    or the cached ones are not valid, they are either refreshed (expired
    credentials that carry a refresh token) or obtained through the
    installed-app OAuth flow configured by ``credentials.json`` using
    ``SCOPES``. The resulting credentials are then written back to
    ``token.pickle`` for the next run.

    Returns:
        The client object created by ``build('drive', 'v3', ...)``.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens.
    # SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
    # token.pickle must only ever be a file written by this function; never
    # load one obtained from an untrusted source.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            # Request is google.auth.transport.requests.Request, imported at
            # the top of the file; without that import this branch raised
            # NameError as soon as a cached token had expired.
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    # Build the Drive API client
    service = build('drive', 'v3', credentials=creds)
    return service

def list_files_in_folder(service, folder_id):
    """List all files in a specific Google Drive folder.

    The Drive ``files.list`` endpoint returns results one page at a time, so
    this follows ``nextPageToken`` until the listing is exhausted instead of
    returning only the first page.

    Args:
        service: Drive API client exposing ``files().list(...).execute()``.
        folder_id: ID of the folder whose direct children are listed.

    Returns:
        A list of dicts with ``id`` and ``name`` keys, in the order the API
        returned them. Empty when the folder has no files.
    """
    query = f"'{folder_id}' in parents"
    items = []
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,  # None on the first request
        ).execute()
        items.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return items

def _read_pdf_text(buffer):
    """Return the text of the PDF held in *buffer*, pages joined by newlines."""
    # PyPDFLoader takes a filesystem path rather than a file object, so the
    # downloaded bytes are written to a temporary file first. delete=False plus
    # an explicit close lets the loader reopen the file on every platform.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with tmp:
            tmp.write(buffer.getvalue())
        # load() yields one document per page; load_and_split() re-chunks the
        # text with an overlap, which would duplicate text once joined.
        pages = PyPDFLoader(tmp.name).load()
        return "\n".join(page.page_content for page in pages)
    finally:
        os.remove(tmp.name)


def read_file_content(file_id, file_name, service):
    """Read the content of a file from Google Drive based on the file type.

    The file is downloaded fully into memory and then converted to text
    according to its extension (matched case-insensitively):

    * ``.pdf``  - page texts joined by newlines
    * ``.docx`` - paragraph texts joined by newlines
    * ``.txt``  - bytes decoded as UTF-8
    * ``.xlsx`` / ``.xls`` - ``pandas.read_excel`` result rendered with
      ``to_string(index=False)``
    * ``.pptx`` - text of every shape that has a text frame, joined by
      newlines

    Args:
        file_id: Drive ID of the file to download.
        file_name: File name; only its extension is used to pick a parser.
        service: Drive API client exposing ``files().get_media(...)``.

    Returns:
        The extracted text, or an empty string for unsupported extensions.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    request = service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    fh.seek(0)  # Go back to the beginning of the file

    # Compare on a lowercased name so "REPORT.PDF" is handled like "report.pdf".
    lowered = file_name.lower()
    if lowered.endswith(".pdf"):
        return _read_pdf_text(fh)
    if lowered.endswith(".docx"):
        doc = Document(fh)
        return "\n".join(p.text for p in doc.paragraphs)
    if lowered.endswith(".txt"):
        return fh.read().decode('utf-8')
    if lowered.endswith((".xlsx", ".xls")):
        return pd.read_excel(fh).to_string(index=False)
    if lowered.endswith(".pptx"):
        presentation = Presentation(fh)
        return "\n".join(
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if shape.has_text_frame
        )

    # Unsupported extension: nothing to extract.
    return ""

def get_all_texts_from_drive_folder(service, folder_id):
    """Collect the extracted text of every file in a Google Drive folder.

    Each file returned by ``list_files_in_folder`` is announced on stdout and
    then passed to ``read_file_content``.

    Args:
        service: Drive API client handed through to the helper functions.
        folder_id: ID of the Drive folder to read.

    Returns:
        A list with one text string per file, in listing order.
    """
    collected = []
    for entry in list_files_in_folder(service, folder_id):
        name = entry['name']
        print(f"Reading {name}...")
        collected.append(read_file_content(entry['id'], name, service))
    return collected

# Authenticate and create the Drive API service. This runs at module level,
# i.e. as soon as the cell/script is executed.
service = authenticate()

# ID of the Drive folder to read (e.g. the 'project_documents' folder).
# The value below is a placeholder: replace it with your actual folder ID.
folder_id = 'YOUR_GOOGLE_DRIVE_FOLDER_ID'

# Extract the text of every file in the folder, one string per file.
all_texts = get_all_texts_from_drive_folder(service, folder_id)

# all_texts now holds the text of each file, in the order the files were listed.
