Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
297 changes: 297 additions & 0 deletions demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
{
"cells": [
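{
"cell_type": "markdown",
"id": "19b1960e-9e0a-401f-be15-d343902eaa21",
"metadata": {},
"source": [
"# Spark HuggingFace Connector Demo\n",
"\n",
"This notebook shows how to load a Hugging Face dataset into a Spark DataFrame with the `huggingface` data source, inspect it, and read a different split."
]
},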
{
"cell_type": "markdown",
"id": "c9a7bf1d-c208-4873-9e06-5db981f8eeaa",
"metadata": {},
"source": [
"## Create a Spark Session"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "620d3ecb-b9cb-480c-b300-69198cce7a9c",
 "metadata": {},
 "outputs": [],
 "source": [
  "from pyspark.sql import SparkSession\n",
  "\n",
  "spark = SparkSession.builder.getOrCreate()"
 ]
},
{
"cell_type": "markdown",
"id": "6f876028-2af5-4e63-8e9d-59afc0959267",
"metadata": {},
"source": [
"## Load a dataset as a Spark DataFrame"
]
},
{
 "cell_type": "code",
 "execution_count": 2,
 "id": "b8580bde-3f64-4c71-a087-8b3f71099aee",
 "metadata": {},
 "outputs": [],
 "source": [
  "df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")"
 ]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3bbf61d1-4c2c-40e7-9790-2722637aac9d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- text: string (nullable = true)\n",
" |-- label: long (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "7f7b9a2b-8733-499a-af56-3c51196d060f",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Cache the dataframe to avoid re-downloading data\n",
  "df.cache()"
 ]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "df121dba-2e1e-4206-b2bf-db156c298ee1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8530"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Trigger the cache computation\n",
"df.count()"
]
},
{
 "cell_type": "code",
 "execution_count": 13,
 "id": "8866bdfb-0782-4430-8b1e-09c65e699f41",
 "metadata": {},
 "outputs": [
  {
   "data": {
    "text/plain": [
     "Row(text='the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', label=1)"
    ]
   },
   "execution_count": 13,
   "metadata": {},
   "output_type": "execute_result"
  }
 ],
 "source": [
  "df.head()"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "0d9d3112-d19b-4fa8-a6fc-ba40816d1d11",
 "metadata": {},
 "outputs": [],
 "source": [
  "df.show(n=5)"
 ]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "225bbbef-4164-424d-a701-c6c74494ef81",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4265"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Then you can operate on this dataframe\n",
"df.filter(df.label == 0).count()"
]
},
{
"cell_type": "markdown",
"id": "3932f1fd-a324-4f15-86e1-bbe1064d707a",
"metadata": {},
"source": [
"## Load a different split\n",
"You can specify the `split` data source option:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a16e9270-eb02-4568-8739-db4dc715c274",
"metadata": {},
"outputs": [],
"source": [
"test_df = (\n",
" spark.read.format(\"huggingface\")\n",
" .option(\"split\", \"test\")\n",
" .load(\"rotten_tomatoes\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3aec5719-c3a1-4d18-92c8-2b0c2f4bb939",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[text: string, label: bigint]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_df.cache()"
]
},
{
 "cell_type": "code",
 "execution_count": 16,
 "id": "d605289d-361d-4a6c-9b70-f7ccdff3aa9d",
 "metadata": {},
 "outputs": [
  {
   "data": {
    "text/plain": [
     "1066"
    ]
   },
   "execution_count": 16,
   "metadata": {},
   "output_type": "execute_result"
  }
 ],
 "source": [
  "test_df.count()"
 ]
},
{
 "cell_type": "code",
 "execution_count": 18,
 "id": "df1ad003-1476-4557-811b-31c3888c0030",
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "+--------------------+-----+\n",
    "|                text|label|\n",
    "+--------------------+-----+\n",
    "|lovingly photogra...|    1|\n",
    "|consistently clev...|    1|\n",
    "|it's like a \" big...|    1|\n",
    "|the story gives a...|    1|\n",
    "|red dragon \" neve...|    1|\n",
    "+--------------------+-----+\n",
    "only showing top 5 rows\n",
    "\n"
   ]
  }
 ],
 "source": [
  "test_df.show(n=5)"
 ]
}
],
"metadata": {
"kernelspec": {
"display_name": "pyspark_huggingface",
"language": "python",
"name": "pyspark_huggingface"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions pyspark_huggingface/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pyspark_huggingface.huggingface import HuggingFaceDatasets as DefaultSource
Loading