huggingface · allisonwang-db · Nov 26, 2024 · Nov 25, 2024 · Nov 26, 2024
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,4 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
diff --git a/demo.ipynb b/demo.ipynb
@@ -0,0 +1,297 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "19b1960e-9e0a-401f-be15-d343902eaa21",
+   "metadata": {},
+   "source": [
+    "# Spark HuggingFace Connector Demo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c9a7bf1d-c208-4873-9e06-5db981f8eeaa",
+   "metadata": {},
+   "source": [
+    "## Create a Spark Session"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "620d3ecb-b9cb-480c-b300-69198cce7a9c",
+   "metadata": {},
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "\n",
+    "spark = SparkSession.builder.getOrCreate()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f876028-2af5-4e63-8e9d-59afc0959267",
+   "metadata": {},
+   "source": [
+    "## Load a dataset as a Spark DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b8580bde-3f64-4c71-a087-8b3f71099aee",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-11-26T08:54:32.132099Z",
+     "start_time": "2024-11-26T08:54:28.903653Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "3bbf61d1-4c2c-40e7-9790-2722637aac9d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "root\n",
+      " |-- text: string (nullable = true)\n",
+      " |-- label: long (nullable = true)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.printSchema()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "7f7b9a2b-8733-499a-af56-3c51196d060f",
+   "metadata": {},
+   "source": [
+    "# Cache the DataFrame so that later actions do not re-download the data\n",
+    "df.cache()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "df121dba-2e1e-4206-b2bf-db156c298ee1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "8530"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# cache() is lazy: run an action to materialize the cached data\n",
+    "df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "8866bdfb-0782-4430-8b1e-09c65e699f41",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Row(text='the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', label=1)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "0d9d3112-d19b-4fa8-a6fc-ba40816d1d11",
+   "metadata": {},
+   "source": [
+    "df.show(n=5)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "225bbbef-4164-424d-a701-c6c74494ef81",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4265"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# The loaded data is a regular Spark DataFrame, so standard operations apply\n",
+    "df.filter(df.label == 0).count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3932f1fd-a324-4f15-86e1-bbe1064d707a",
+   "metadata": {},
+   "source": [
+    "## Load a different split\n",
+    "Use the `split` data source option to load a split other than the default one:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "a16e9270-eb02-4568-8739-db4dc715c274",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_df = (\n",
+    "    spark.read.format(\"huggingface\")\n",
+    "    .option(\"split\", \"test\")\n",
+    "    .load(\"rotten_tomatoes\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "3aec5719-c3a1-4d18-92c8-2b0c2f4bb939",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DataFrame[text: string, label: bigint]"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_df.cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "d605289d-361d-4a6c-9b70-f7ccdff3aa9d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                "
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "1066"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test_df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "df1ad003-1476-4557-811b-31c3888c0030",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+-----+\n",
+      "|                text|label|\n",
+      "+--------------------+-----+\n",
+      "|lovingly photogra...|    1|\n",
+      "|consistently clev...|    1|\n",
+      "|it's like a \" big...|    1|\n",
+      "|the story gives a...|    1|\n",
+      "|red dragon \" neve...|    1|\n",
+      "+--------------------+-----+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_df.show(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7f14b91-059e-4894-83d2-4ed74e0adaf9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pyspark_huggingface",
+   "language": "python",
+   "name": "pyspark_huggingface"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyspark_huggingface/__init__.py b/pyspark_huggingface/__init__.py
@@ -0,0 +1 @@
+from pyspark_huggingface.huggingface import HuggingFaceDatasets as DefaultSource
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from pyspark_huggingface.huggingface import HuggingFaceDatasets as DefaultSource