langchain[minor]: add azure ai data document loader (#13404)

This PR adds an "Azure AI data" document loader, which allows Azure AI users to load their registered data assets as a document object in langchain. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
langchain-ai · Dec 2, 2023 · fd781c8 · fd781c8
1 parent 24385a0
commit fd781c8
Show file tree

Hide file tree

Showing 4 changed files with 222 additions and 0 deletions.
diff --git a/docs/docs/integrations/document_loaders/azure_ai_data.ipynb b/docs/docs/integrations/document_loaders/azure_ai_data.ipynb
@@ -0,0 +1,174 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a634365e",
+   "metadata": {},
+   "source": [
+    "# Azure AI Data\n",
+    "\n",
+    ">[Azure AI Studio](https://ai.azure.com/) provides the capability to upload data assets to cloud storage and register existing data assets from the following sources:\n",
+    "\n",
+    "- Microsoft OneLake\n",
+    "- Azure Blob Storage\n",
+    "- Azure Data Lake gen 2\n",
+    "\n",
+    "The benefit of this approach over `AzureBlobStorageContainerLoader` and `AzureBlobStorageFileLoader` is that authentication is handled seamlessly to cloud storage. You can use either *identity-based* data access control to the data or *credential-based* (e.g. SAS token, account key). In the case of credential-based data access you do not need to specify secrets in your code or set up key vaults - the system handles that for you.\n",
+    "\n",
+    "This notebook covers how to load document objects from a data asset in AI Studio."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49815096",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install azureml-fsspec, azure-ai-generative"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "2f0cd6a5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from azure.ai.resources.client import AIClient\n",
+    "from azure.identity import DefaultAzureCredential\n",
+    "from langchain.document_loaders import AzureAIDataLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08d40b11-e87a-426e-a6b0-89f24e47ce2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a connection to your project\n",
+    "client = AIClient(\n",
+    "    credential=DefaultAzureCredential(),\n",
+    "    subscription_id=\"<subscription_id>\",\n",
+    "    resource_group_name=\"<resource_group_name>\",\n",
+    "    project_name=\"<project_name>\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "321cc7f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get the latest version of your data asset\n",
+    "data_asset = client.data.get(name=\"<data_asset_name>\", label=\"latest\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25d91cea-c5f2-4a53-ac19-442810451ec6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load the data asset\n",
+    "loader = AzureAIDataLoader(url=data_asset.path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "2b11d155",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpaa9xl6ch/fake.docx'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "loader.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0690c40a",
+   "metadata": {},
+   "source": [
+    "## Specifying a glob pattern\n",
+    "You can also specify a glob pattern for more finegrained control over what files to load. In the example below, only files with a `pdf` extension will be loaded."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "72d44781",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = AzureAIDataLoader(url=data_asset.path, glob=\"*.pdf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2d3c32db",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': '/var/folders/y6/8_bzdg295ld6s1_97_12m4lr0000gn/T/tmpujbkzf_l/fake.docx'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "885dc280",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py
@@ -34,6 +34,9 @@
 from langchain.document_loaders.assemblyai import AssemblyAIAudioTranscriptLoader
 from langchain.document_loaders.async_html import AsyncHtmlLoader
 from langchain.document_loaders.azlyrics import AZLyricsLoader
+from langchain.document_loaders.azure_ai_data import (
+    AzureAIDataLoader,
+)
 from langchain.document_loaders.azure_blob_storage_container import (
     AzureBlobStorageContainerLoader,
 )
@@ -226,6 +229,7 @@
     "ArxivLoader",
     "AssemblyAIAudioTranscriptLoader",
     "AsyncHtmlLoader",
+    "AzureAIDataLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
     "BSHTMLLoader",

diff --git a/libs/langchain/langchain/document_loaders/azure_ai_data.py b/libs/langchain/langchain/document_loaders/azure_ai_data.py
@@ -0,0 +1,43 @@
+from typing import Iterator, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import UnstructuredFileIOLoader
+
+
+class AzureAIDataLoader(BaseLoader):
+    """Load from Azure AI Data."""
+
+    def __init__(self, url: str, glob: Optional[str] = None):
+        """Initialize with URL to a data asset or storage location
+        ."""
+        self.url = url
+        """URL to the data asset or storage location."""
+        self.glob_pattern = glob
+        """Optional glob pattern to select files. Defaults to None."""
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        return list(self.lazy_load())
+
+    def lazy_load(self) -> Iterator[Document]:
+        """A lazy loader for Documents."""
+        try:
+            from azureml.fsspec import AzureMachineLearningFileSystem
+        except ImportError as exc:
+            raise ImportError(
+                "Could not import azureml-fspec package."
+                "Please install it with `pip install azureml-fsspec`."
+            ) from exc
+
+        fs = AzureMachineLearningFileSystem(self.url)
+
+        if self.glob_pattern:
+            remote_paths_list = fs.glob(self.glob_pattern)
+        else:
+            remote_paths_list = fs.ls()
+
+        for remote_path in remote_paths_list:
+            with fs.open(remote_path) as f:
+                loader = UnstructuredFileIOLoader(file=f)
+                yield from loader.load()
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_imports.py b/libs/langchain/tests/unit_tests/document_loaders/test_imports.py
@@ -22,6 +22,7 @@
     "ArxivLoader",
     "AssemblyAIAudioTranscriptLoader",
     "AsyncHtmlLoader",
+    "AzureAIDataLoader",
     "AzureBlobStorageContainerLoader",
     "AzureBlobStorageFileLoader",
     "BSHTMLLoader",