# glob.glob

The function `glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)` will return a list of file paths that match the specified pattern. It will return all files in the source directory (`source_dir`) and its subdirectories that have the specified file extension (`ext`).

If there are subdirectories within the source directory, the function also searches them recursively. This means that files in nested subdirectories are included in the returned list if they have the specified file extension.

For example, let's say you have the following directory structure:

```
source_dir/
├── file1.txt
├── subdirectory1/
│   ├── file2.txt
│   └── subdirectory2/
│       └── file3.txt
└── subdirectory3/
    └── file4.txt
```

If you use `glob.glob(os.path.join(source_dir, "**/*.txt"), recursive=True)`, it will return a list containing the file paths for `file1.txt`, `subdirectory1/file2.txt`, `subdirectory1/subdirectory2/file3.txt`, and `subdirectory3/file4.txt` (each prefixed with `source_dir`). It includes all the files with the `.txt` extension in the source directory and its subdirectories, including nested subdirectories.

So, to summarize, the function returns all files in the source directory and its subdirectories that match the specified file extension.

In [3]:
## Example from privateGPT
import os
import glob
from typing import List

## Good use of a dictionary as a factory

# Map file extensions to document loaders and their arguments.
# Each key is a file extension including the leading dot; each value is a
# (loader_class, loader_kwargs) tuple. load_single_document() looks up the
# extension here and builds the loader as loader_class(file_path, **loader_kwargs);
# load_documents() iterates the keys to build one glob pattern per extension.
# NOTE(review): the loader classes are not imported in the visible part of this
# file -- presumably they come from the document-loader library used by
# privateGPT; confirm the import before running.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # Alternative .docx loader, currently disabled:
    # ".docx": (Docx2txtLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (UnstructuredEmailLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    # Text files are the only entry that passes extra keyword arguments.
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}
class Document:
    """Placeholder type so the ``Document`` annotations in this file resolve.

    NOTE(review): presumably stands in for the real document class returned
    by the loaders -- confirm and replace with the proper import.
    """
    hi = 2  # dummy attribute; not read anywhere in the visible code
def load_documents(source_dir: str) -> List[str]:
    """
    Recursively collect the paths of all supported files under ``source_dir``.

    A file is considered supported when its name ends with one of the
    extensions used as keys in ``LOADER_MAPPING``. The files themselves are
    NOT loaded here; pass each returned path to ``load_single_document`` to
    turn it into a document.

    Args:
        source_dir: Directory to search. It is treated as a literal path,
            so glob metacharacters in its name are not expanded.

    Returns:
        A list of matching paths, each prefixed with ``source_dir``. Paths
        are grouped by extension in ``LOADER_MAPPING`` order; the order
        within one extension is whatever ``glob.glob`` yields.
    """
    # Escape the directory so characters such as "[", "*" or "?" in its name
    # are matched literally instead of being interpreted as a glob pattern.
    search_root = glob.escape(source_dir)

    all_files: List[str] = []
    for ext in LOADER_MAPPING:
        # With recursive=True, "**" matches zero or more directory levels, so
        # files directly inside source_dir are found as well as nested ones.
        pattern = os.path.join(search_root, f"**/*{ext}")
        all_files.extend(glob.glob(pattern, recursive=True))
    return all_files


## Compare this with loading a single document

def load_single_document(file_path: str) -> Document:
    """
    Load one file with the loader registered for its extension.

    The extension is looked up in ``LOADER_MAPPING`` to obtain a loader class
    and its keyword arguments; the loader is built as
    ``loader_class(file_path, **loader_args)`` and the first item returned by
    its ``load()`` method is returned.

    Args:
        file_path: Path of the file to load. The extension is matched
            case-insensitively, so ``REPORT.PDF`` uses the ``.pdf`` loader.

    Returns:
        The first element of the list returned by ``loader.load()``.

    Raises:
        ValueError: If the file has no extension or the extension is not a
            key of ``LOADER_MAPPING``.
    """
    # splitext only inspects the final path component, so a dot in a directory
    # name (e.g. "my.dir/README") is never mistaken for a file extension.
    raw_ext = os.path.splitext(file_path)[1]
    ext = raw_ext.lower()

    if ext not in LOADER_MAPPING:
        raise ValueError(f"Unsupported file extension '{raw_ext}'")

    loader_class, loader_args = LOADER_MAPPING[ext]
    loader = loader_class(file_path, **loader_args)
    # NOTE(review): assumes load() returns a non-empty sequence; an empty
    # result would raise IndexError here -- confirm against the loaders.
    return loader.load()[0]