<a href="https://colab.research.google.com/github/lesterrsantos/WebExample/blob/main/Transformacion_de_datos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pyspark==3.2.0

In [None]:
import pyspark
from pyspark.sql import SparkSession, SQLContext
# Notebook-wide Spark entry point. Note that `sc` is a SparkSession here
# (not a SparkContext); every later cell builds its DataFrames through it.
sc = (
    SparkSession.builder
    .appName("OperacionesDatos")
    .getOrCreate()
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
import numpy as np

# Toy corpus: four labelled sentences, reused by the tokenizer cell below.
sentenceData = sc.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Spark could use case classes"),
        (1.0, "Logistic regression models are neat"),
        (1.0, "I love Apache Spark and ML models"),
    ],
    ["label", "sentence"],
)
sentenceData.show(truncate=False)

# TF-IDF stages: sentence -> words -> hashed term frequencies (5 buckets)
# -> IDF-weighted feature vector.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
vectorizer = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=5)
idf = IDF(inputCol="rawFeatures", outputCol="features")

tfidf_stages = [tokenizer, vectorizer, idf]
pipeline = Pipeline(stages=tfidf_stages)

# Fit the whole pipeline on the corpus, then display every intermediate
# column it adds ("words", "rawFeatures", "features").
model = pipeline.fit(sentenceData)
featurized = model.transform(sentenceData)
featurized.show(truncate=False)


+-----+-----------------------------------+
|label|sentence                           |
+-----+-----------------------------------+
|0.0  |Hi I heard about Spark             |
|0.0  |I wish Spark could use case classes|
|1.0  |Logistic regression models are neat|
|1.0  |I love Apache Spark and ML models  |
+-----+-----------------------------------+

+-----+-----------------------------------+-------------------------------------------+-------------------------------+-------------------------------------------------------------+
|label|sentence                           |words                                      |rawFeatures                    |features                                                     |
+-----+-----------------------------------+-------------------------------------------+-------------------------------+-------------------------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]               |(5,[1,3],[3.0,

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Plain tokenizer: turns each sentence into an array of words.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Regex tokenizer: here the pattern describes the *delimiters* (any
# non-word character), so the text between matches becomes the tokens.
# Alternatively, use pattern="\\w+" together with gaps=False so that the
# pattern matches the tokens themselves instead of the gaps between them.
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")

# UDF that returns the number of entries in a "words" array.
countTokens = udf(lambda words: len(words), IntegerType())

# Show the sentence, its tokens and the token count for the plain tokenizer.
tokenized = tokenizer.transform(sentenceData)
(
    tokenized.select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

# Same view for the regex tokenizer. It was previously configured but never
# applied, which left it as dead code and the comparison incomplete.
regexTokenized = regexTokenizer.transform(sentenceData)
(
    regexTokenized.select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)


+-----------------------------------+-------------------------------------------+------+
|sentence                           |words                                      |tokens|
+-----------------------------------+-------------------------------------------+------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]               |5     |
|I wish Spark could use case classes|[i, wish, spark, could, use, case, classes]|7     |
|Logistic regression models are neat|[logistic, regression, models, are, neat]  |5     |
|I love Apache Spark and ML models  |[i, love, apache, spark, and, ml, models]  |7     |
+-----------------------------------+-------------------------------------------+------+



In [None]:
from pyspark.ml.feature import StopWordsRemover

# Two already-tokenized sentences; "raw" holds each sentence's word list.
sentenceData3 = sc.createDataFrame(
    [
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    ["id", "raw"],
)

# Filter stop words out of "raw" using the remover's default word list
# (no custom stopWords are passed).
# NOTE(review): the output column name "removeded" looks like a typo for
# "removed"; it is kept unchanged because it is the emitted column name.
remover = StopWordsRemover(inputCol="raw", outputCol="removeded")
without_stopwords = remover.transform(sentenceData3)
without_stopwords.show(truncate=False)

+---+----------------------------+--------------------+
|id |raw                         |removeded           |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



In [None]:
from pyspark.ml.feature import Normalizer, StandardScaler, MinMaxScaler, MaxAbsScaler

from pyspark.ml.linalg import Vectors

# Three rows, each carrying a 3-dimensional dense feature vector.
dataFrame4 = sc.createDataFrame(
    [
        (0, Vectors.dense([1.0, 4, 2])),
        (1, Vectors.dense([2.0, 1.0, 1.0])),
        (2, Vectors.dense([4.0, 10.0, 2.0])),
    ],
    ["id", "features"],
)

# Min-max scaling: the fit step learns each feature's min and max from the
# data, the transform step rescales every feature (default range [0, 1]).
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(dataFrame4)
scaledData = scaler_model.transform(dataFrame4)
scaledData.show(truncate=False)

+---+--------------+----------------------------+
|id |features      |scaledFeatures              |
+---+--------------+----------------------------+
|0  |[1.0,4.0,2.0] |[0.0,0.3333333333333333,1.0]|
|1  |[2.0,1.0,1.0] |[0.3333333333333333,0.0,0.0]|
|2  |[4.0,10.0,2.0]|[1.0,1.0,1.0]               |
+---+--------------+----------------------------+



In [None]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# Three 5-dimensional feature vectors (one sparse, two dense), each wrapped
# in a 1-tuple so that it becomes a single-column row.
data = [
    (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
]
df = sc.createDataFrame(data, ["features"])

# Fit a PCA that keeps the first k=2 principal components.
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

# Project the vectors and display only the reduced 2-dimensional column.
projected = model.transform(df)
result = projected.select("pcaFeatures")
result.show(truncate=False)

+----------------------------------------+
|pcaFeatures                             |
+----------------------------------------+
|[1.6485728230883814,-4.0132827005162985]|
|[-4.645104331781533,-1.1167972663619048]|
|[-6.428880535676488,-5.337951427775359] |
+----------------------------------------+



In [None]:
# Shut down the SparkSession created at the top of the notebook; cells that
# use `sc` cannot be run after this until the session cell is executed again.
sc.stop()