Skip to content

Commit

Permalink
Add back compatibilities
Browse files Browse the repository at this point in the history
  • Loading branch information
pprados committed Aug 16, 2023
1 parent 1bfe4f7 commit 9e68719
Show file tree
Hide file tree
Showing 9 changed files with 707 additions and 194 deletions.
280 changes: 250 additions & 30 deletions docs/extras/integrations/document_loaders/google_drive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "6e40071c-3a65-4e26-b497-3e2be0bd86b9",
"metadata": {},
"outputs": [],
Expand All @@ -44,18 +44,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "9bcb6cb1",
"metadata": {},
"outputs": [],
"source": [
"folder_id='root'\n",
"#folder_id='1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5'"
"#folder_id='1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5'\n",
"#folder_id='18A21b37hPISOQtStQ_irQLYS3hlVEsBH'"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "878928a6-a5ae-4f74-b351-64e3b01733fe",
"metadata": {
"tags": []
Expand All @@ -67,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "2216c83f-68e4-4d2f-8ea2-5878fb18bbe7",
"metadata": {
"tags": []
Expand Down Expand Up @@ -111,22 +112,69 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "1bca45c9",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: unstructured in /home/pprados/.pythonapp/lib/python3.10/site-packages (0.9.1)\n",
"Requirement already satisfied: python-magic in /home/pprados/.pythonapp/lib/python3.10/site-packages (from unstructured) (0.4.27)\n",
"Requirement already satisfied: tabulate in /home/pprados/.pythonapp/lib/python3.10/site-packages (from unstructured) (0.9.0)\n",
"Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from unstructured) (2.25.1)\n",
"Requirement already satisfied: filetype in /home/pprados/.pythonapp/lib/python3.10/site-packages (from unstructured) (1.2.0)\n",
"Requirement already satisfied: nltk in /home/pprados/.pythonapp/lib/python3.10/site-packages (from unstructured) (3.8.1)\n",
"Requirement already satisfied: lxml in /home/pprados/.pythonapp/lib/python3.10/site-packages (from unstructured) (4.9.3)\n",
"Requirement already satisfied: chardet in /usr/lib/python3/dist-packages (from unstructured) (4.0.0)\n",
"Requirement already satisfied: click in /usr/lib/python3/dist-packages (from nltk->unstructured) (8.0.3)\n",
"Requirement already satisfied: joblib in /home/pprados/.pythonapp/lib/python3.10/site-packages (from nltk->unstructured) (1.3.1)\n",
"Requirement already satisfied: tqdm in /home/pprados/.pythonapp/lib/python3.10/site-packages (from nltk->unstructured) (4.65.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in /home/pprados/.pythonapp/lib/python3.10/site-packages (from nltk->unstructured) (2023.8.8)\n"
]
}
],
"source": [
"!pip install unstructured"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "8f3b6aa0-b45d-4e37-8c50-5bebe70fdb9d",
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---\n",
"[\n",
"\n",
"NOM\n",
"\n",
"PROJET + Référence Mission]\n",
"\n",
"\n",
"\n",
"Une fois la fiche ter...\n",
"---\n",
"[\n",
"\n",
"NOM\n",
"\n",
"PROJET + Référence Mission]\n",
"\n",
"\n",
"\n",
"Une fois la fiche ter...\n"
]
}
],
"source": [
"for doc in loader.load():\n",
" print(\"---\")\n",
Expand All @@ -153,7 +201,7 @@
"| gdrive-all-in-folder | Return all compatible files from a `folder_id` |\n",
"| gdrive-query | Search `query` in all drives |\n",
"| gdrive-by-name | Search file with name `query` |\n",
"| gdrive-query-in-folder | Search `query` in `folder_id` (and sub-folders in `_recursive=true`) |\n",
"| gdrive-query-in-folder | Search `query` in `folder_id` (and sub-folders if `recursive=true`) |\n",
"| gdrive-mime-type | Search a specific `mime_type` |\n",
"| gdrive-mime-type-in-folder | Search a specific `mime_type` in `folder_id` |\n",
"| gdrive-query-with-mime-type | Search `query` with a specific `mime_type` |\n",
Expand All @@ -162,10 +210,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "0a47175f",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---\n",
"A Document with the word machine learning.\n",
"\n",
"\n",
"\n",
"Another paragr...\n",
"---\n",
"Autre document sur le machine learning...\n"
]
}
],
"source": [
"loader = GoogleDriveLoader(\n",
" folder_id=folder_id,\n",
Expand All @@ -174,16 +237,7 @@
" query=\"machine learning\",\n",
" num_results=2, # Maximum number of file to load\n",
" supportsAllDrives=False, # GDrive `list()` parameter\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "100cf361",
"metadata": {},
"outputs": [],
"source": [
")\n",
"for doc in loader.load():\n",
" print(\"---\")\n",
" print(doc.page_content.strip()[:60]+\"...\")"
Expand All @@ -199,10 +253,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "dcf07ff7",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---\n",
"Je vous invite à lire\n",
"\n",
"cette page\n",
"\n",
"pour suivre les recommand...\n",
"---\n",
"The Springer Series on Challenges in Machine Learning\n",
"\n",
"Frank...\n"
]
}
],
"source": [
"from langchain.prompts.prompt import PromptTemplate\n",
"loader = GoogleDriveLoader(\n",
Expand Down Expand Up @@ -230,7 +301,7 @@
"\n",
"The parameter `mode` accept differents values:\n",
"- `\"document\"`: return the body of each documents\n",
"- `\"snippets\"`: return the `description` of each files.\n",
"- `\"snippets\"`: return the `description` of each files (set in metadata of google drive files).\n",
"\n",
"\n",
"The parameter `gslide_mode` accept differents values:\n",
Expand All @@ -241,10 +312,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "b33d1a53",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---\n",
"Expériences :\n",
"\n",
"UX Researcher et UX Analyst chez OCTO Techno...\n",
"---\n",
"Mini-bio : FEBO\n",
"\n",
"\n",
"\n",
"En tant que\n",
"\n",
"UX researcher\n",
"\n",
"et\n",
"\n",
"UX analyt...\n"
]
}
],
"source": [
"loader = GoogleDriveLoader(\n",
" template=\"gdrive-mime-type\",\n",
Expand All @@ -269,10 +363,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "884c4ca6",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---\n",
"Instructions for use: 1. Make a copy of this document; do no...\n",
"---\n",
"Instructions for use: 2. Check you're in the new copy and it...\n"
]
}
],
"source": [
"loader = GoogleDriveLoader(\n",
" template=\"gdrive-mime-type\",\n",
Expand All @@ -284,6 +389,121 @@
" print(\"---\")\n",
" print(doc.page_content.strip()[:60]+\"...\")"
]
},
{
"cell_type": "markdown",
"id": "1f0f8982",
"metadata": {},
"source": [
"# Advanced usage\n",
"- All google file have a 'description' in the metadata. This field can be use to memorize a summary of the document or others indexed tags (See method `lazy_update_description_with_summary()`).\n",
"- If you use the `mode=\"snippet\"`, only the description will be used for the body. Else, the `metadata['summary']` has the field.\n",
"- Sometime, a specific filter can be used to extract some information from the filename, to select some files with specific criteria. You can use a `filter`.\n",
"- Sometimes, many documents are returned. It's not necessary to have all documents in memory at the same time. You can use the *lazy* versions of methods, to get one document at a time.\n",
"It's better to use a complex query in place of a recursive search. For each folder, a query must be apply if you activate `recursive=True`."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "23e07be2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Ignore 'Dossier - 01 - Introduction au Deep Learning.zip' with type 'application/zip'\n",
"Ignore 'Actualité - 01 - Deep learning et humanités numériques.zip' with type 'application/zip'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"---\n",
"A Document with the word machine learning.\n",
"\n",
"\n",
"\n",
"Another paragr...\n",
"---\n",
"Autre document sur le machine learning...\n",
"---\n",
"Matrice Cynefin x\n",
"\n",
"Machine Learning\n",
"\n",
"- Aller vite en product...\n",
"---\n",
"Deep Learning humanités numériques\n",
"\n",
"“les gens qui se forment...\n",
"---\n",
"Eli Stevens Luca Antiga Thomas Viehmann Foreword by Soumith ...\n",
"---\n",
"Deep Learning et humanités numériques\n",
"\n",
"Karim Sayadi\n",
"\n",
"Data Sc...\n",
"---\n",
"Le machine learning portable avec Go\n",
"\n",
"\n",
"\n",
"Dans cet article, je...\n",
"---\n",
"Deep Learning humanités numériques\n",
"\n",
"\n",
"\n",
"“Les gens qui se forme...\n",
"---\n",
"01\n",
"\n",
"R&D collective ?\n",
"\n",
"\n",
"\n",
"<PAGE BREAK>\n",
"\n",
"Synthèse d’une discuss...\n",
"---\n",
"L’i-PPR n°02\n",
"\n",
"\n",
"\n",
"Deep learning of Python\n",
"\n",
"V1.0 -\n",
"\n",
"5 Avril 201...\n"
]
}
],
"source": [
"import os\n",
"loader = GoogleDriveLoader(\n",
" gdrive_api_file=os.environ[\"GOOGLE_ACCOUNT_FILE\"],\n",
" num_results=2,\n",
" template=\"gdrive-query\",\n",
" filter=lambda search, file: \"#test\" not in file.get('description',''),\n",
" query='machine learning',\n",
" supportsAllDrives=False,\n",
" )\n",
"for doc in loader.load():\n",
" print(\"---\")\n",
" print(doc.page_content.strip()[:60]+\"...\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3dbb5609",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -302,7 +522,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.11.0rc1"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 9e68719

Please sign in to comment.