In [1]:
import os
import sys
from typing import Iterator

# Absolute path two directory levels above the notebook's working directory.
project_root = os.path.abspath("../..")

# Append it to the import path once, so re-running the cell adds no duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
import os

# Process-wide switches set before the ML libraries are imported:
#   CUDA_VISIBLE_DEVICES="-1"       -> expose no CUDA devices (CPU-only run)
#   TRANSFORMERS_NO_TORCHVISION="1" -> presumably keeps transformers from
#                                      pulling in torchvision; TODO confirm the
#                                      installed version honours this variable
#   TOKENIZERS_PARALLELISM="false"  -> turn off tokenizer-level parallelism
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "-1",
        "TRANSFORMERS_NO_TORCHVISION": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [3]:
from datapipeline.utils.spark_session import get_spark_session

# Obtain the session through the project helper, named after this notebook's job.
spark = get_spark_session("Gold_NER_Embedding")

# Session-level settings applied in order:
#   * Delta schema auto-merge, so a MERGE may evolve the target table's schema
#   * vectorized Parquet reader switched off
for _conf_key, _conf_value in (
    ("spark.databricks.delta.schema.autoMerge.enabled", "true"),
    ("spark.sql.parquet.enableVectorizedReader", "false"),
):
    spark.conf.set(_conf_key, _conf_value)

In [4]:
# Location of the gold-layer Delta table read by this notebook (path is
# relative to the notebook's working directory).
gold_lang_path = "../../sanewsstorage/gold/articles_lang"

# Read the table as a Spark DataFrame using the Delta source.
gold_df = spark.read.load(gold_lang_path, format="delta")

In [5]:
# Redistribute the rows over four partitions before the UDF stages run.
gold_df = gold_df.repartition(numPartitions=4)

In [6]:
import pandas as pd
import spacy

from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\torch\lib\c10.dll" or one of its dependencies.

In [None]:
# Return type of the NER UDF: an array of structs, each holding the entity
# text and its label as nullable strings.
ner_schema = ArrayType(
    StructType()
    .add("entity", StringType(), True)
    .add("label", StringType(), True)
)

In [None]:
@pandas_udf(ner_schema)
def ner_udf(batches: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Extract named entities from text with spaCy's ``xx_ent_wiki_sm`` model.

    Declared as an iterator-of-Series pandas UDF so the spaCy pipeline is
    loaded once per invocation and reused for every batch the iterator
    yields, instead of being reloaded for each batch.

    Args:
        batches: Iterator of pandas Series holding the input texts. Entries
            may be null.

    Yields:
        One Series per input batch, aligned row-for-row with it. Each element
        is a list of ``(entity_text, entity_label)`` tuples, in the field
        order of ``ner_schema``. Null, non-string and blank inputs produce an
        empty list.
    """
    # Expensive: do it once, outside the per-batch loop.
    nlp = spacy.load("xx_ent_wiki_sm")

    def extract_entities(text):
        # Guard on type as well as emptiness so a missing value that arrives
        # as something other than None (e.g. NaN) cannot reach ``.strip()``.
        if not isinstance(text, str) or not text.strip():
            return []
        return [(ent.text, ent.label_) for ent in nlp(text).ents]

    for texts in batches:
        # dtype=object keeps the list-valued rows intact, including for a
        # batch with no rows.
        yield pd.Series(
            [extract_entities(text) for text in texts],
            dtype=object,
        )

In [10]:
# Add an "entities" column computed by the NER UDF over "clean_text".
gold_df = gold_df.withColumn("entities", ner_udf(col("clean_text")))

In [11]:
from sentence_transformers import SentenceTransformer
from pyspark.sql.types import ArrayType, FloatType

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Return type of the embedding UDF: an array of 32-bit floats per row.
embedding_schema = ArrayType(elementType=FloatType())

In [13]:
@pandas_udf(embedding_schema)
def embedding_udf(batches: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Encode text into sentence embeddings with a multilingual MiniLM model.

    Declared as an iterator-of-Series pandas UDF so the SentenceTransformer
    model is constructed once per invocation and reused for every batch the
    iterator yields, instead of being rebuilt for each batch.

    Args:
        batches: Iterator of pandas Series holding the input texts. Null
            entries are encoded as the empty string.

    Yields:
        One Series per input batch, aligned row-for-row with it. Each element
        is the embedding of the corresponding text as a list of floats.
    """
    # Expensive: do it once, outside the per-batch loop.
    model = SentenceTransformer(
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

    for texts in batches:
        # The encoder needs real strings, so nulls become "".
        text_list = texts.fillna("").tolist()

        vectors = model.encode(
            text_list,
            batch_size=32,
            show_progress_bar=False
        )

        # Convert each vector to a plain Python list for the array column.
        yield pd.Series(
            [vec.tolist() for vec in vectors],
            dtype=object,
        )

In [14]:
# Add an "embedding" column computed by the embedding UDF over "clean_text".
gold_df = gold_df.withColumn("embedding", embedding_udf(col("clean_text")))

In [15]:
# Mark the enriched DataFrame (entities + embedding columns) for caching.
# NOTE(review): presumably so the write/merge below does not re-run both UDFs
# each time it reads the source; confirm.
gold_df = gold_df.cache()
# count() is an action: it forces the plan, including both UDFs, to execute.
# As the last expression in the cell, its row count is shown as the output.
gold_df.count()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\spacy\__init__.py", line 6, in <module>
    from .errors import setup_default_warnings
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\spacy\errors.py", line 3, in <module>
    from .compat import Literal
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\spacy\compat.py", line 5, in <module>
    from thinc.util import copy_array
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\thinc\__init__.py", line 5, in <module>
    from .config import registry
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\thinc\config.py", line 5, in <module>
    from .types import Decorator
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\thinc\types.py", line 27, in <module>
    from .compat import cupy, has_cupy
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\thinc\compat.py", line 48, in <module>
    except ImportError:  # pragma: no cover
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\torch\__init__.py", line 280, in <module>
    _load_dll_libraries()
  File "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\torch\__init__.py", line 263, in _load_dll_libraries
    raise err
OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "c:\Users\Echelon\Desktop\re\sa-news\venv\lib\site-packages\torch\lib\c10.dll" or one of its dependencies.


In [None]:
from delta.tables import DeltaTable

# Destination Delta table for the enriched articles (path is relative to the
# notebook's working directory).
gold_ml_path = "../../sanewsstorage/gold/articles_enriched"

if not DeltaTable.isDeltaTable(spark, gold_ml_path):
    # No Delta table at the destination yet: write the DataFrame as a new one.
    gold_df.write.format("delta").mode("overwrite").save(gold_ml_path)
else:
    # Table already exists: upsert into it, matching rows on bronze_hash.
    # Matched rows are updated with all source columns, unmatched rows are
    # inserted.
    delta_table = DeltaTable.forPath(spark, gold_ml_path)

    (
        delta_table.alias("t")
        .merge(gold_df.alias("s"), "t.bronze_hash = s.bronze_hash")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
