In [1]:
import os

import findspark

# Resolve the Spark installation directory. Prefer the SPARK_HOME environment
# variable so the notebook is not tied to one machine; fall back to the
# original local install path when the variable is unset or empty.
SPARK_HOME = os.environ.get("SPARK_HOME") or "/home/jean/spark-2.4.4-bin-hadoop2.7"

# findspark.init is called before the pyspark import, same ordering as before.
findspark.init(SPARK_HOME)
from pyspark.sql import SparkSession

# Get the active session if one exists, otherwise create it under this app name.
spark = SparkSession.builder.appName('logisticregression').getOrCreate()

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [3]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [4]:
# Three sample sentences keyed by id; the last one is comma-separated with no
# spaces, which is what distinguishes the two tokenizers used further down.
send_df = spark.createDataFrame(
    data=[
        (0, "Hi I am testing spark"),
        (1, "Python is better than Java for machine learning"),
        (2, "Logistic,regression,testing"),
    ],
    schema=["id", "sentence"],
)

In [5]:
# Preview the input frame (defaults spelled out: up to 20 rows, long cells truncated).
send_df.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I am testing s...|
|  1|Python is better ...|
|  2|Logistic,regressi...|
+---+--------------------+



In [6]:
# Plain tokenizer: reads "sentence" and writes the token array to "words".
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")

In [7]:
# Regex-driven tokenizer over the same columns; the pattern matches any
# non-word character (\W).
regex_tokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

In [8]:
def _count_words(words):
    """Return the number of elements in ``words``, or None when ``words`` is None.

    A null array value reaches the Python function as None; calling ``len`` on
    it would raise TypeError and abort the job, so the null is propagated.
    """
    if words is None:
        return None
    return len(words)


# Column-level UDF around the counter; results are typed as IntegerType.
count_tokens = udf(_count_words, IntegerType())

In [9]:
# Apply the plain tokenizer to the sample sentences.
tokenized = tokenizer.transform(dataset=send_df)

In [10]:
# Show each sentence with its token array and the number of tokens produced.
(
    tokenized
    .withColumn("tokens", count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I am testing s...|[hi, i, am, testi...|     5|
|  1|Python is better ...|[python, is, bett...|     8|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [11]:
# Apply the regex tokenizer to the same sample sentences.
rg_tokenized = regex_tokenizer.transform(dataset=send_df)

In [12]:
# Same token-count view, this time for the regex-tokenized frame.
(
    rg_tokenized
    .withColumn("tokens", count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I am testing s...|[hi, i, am, testi...|     5|
|  1|Python is better ...|[python, is, bett...|     8|
|  2|Logistic,regressi...|[logistic, regres...|     3|
+---+--------------------+--------------------+------+



In [13]:
from pyspark.ml.feature import StopWordsRemover

In [16]:
# Stop-word filter: reads the "words" token array and writes the result to "removed".
remover = StopWordsRemover().setInputCol("words").setOutputCol("removed")

In [17]:
# Display the regex-tokenized frame alongside its stop-word-filtered tokens.
(
    remover
    .transform(rg_tokenized)
    .show()
)

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|             removed|
+---+--------------------+--------------------+--------------------+
|  0|Hi I am testing s...|[hi, i, am, testi...|[hi, testing, spark]|
|  1|Python is better ...|[python, is, bett...|[python, better, ...|
|  2|Logistic,regressi...|[logistic, regres...|[logistic, regres...|
+---+--------------------+--------------------+--------------------+



In [18]:
from pyspark.ml.feature import NGram

In [25]:
# Bigram builder (n=2) over the stop-word-filtered tokens in "removed".
ngram = NGram(
    n=2,
    inputCol="removed",
    outputCol="ngram",
)

In [28]:
# Filter stop words, build bigrams, then show the original tokens next to the
# bigrams without truncating cell contents.
(
    ngram
    .transform(remover.transform(rg_tokenized))
    .select("words", "ngram")
    .show(truncate=False)
)

+--------------------------------------------------------+------------------------------------------------------------+
|words                                                   |ngram                                                       |
+--------------------------------------------------------+------------------------------------------------------------+
|[hi, i, am, testing, spark]                             |[hi testing, testing spark]                                 |
|[python, is, better, than, java, for, machine, learning]|[python better, better java, java machine, machine learning]|
|[logistic, regression, testing]                         |[logistic regression, regression testing]                   |
+--------------------------------------------------------+------------------------------------------------------------+



In [29]:
from pyspark.ml.feature import HashingTF, IDF

In [31]:
# Re-display the regex-tokenized frame that feeds the TF-IDF steps below.
rg_tokenized.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I am testing s...|[hi, i, am, testi...|
|  1|Python is better ...|[python, is, bett...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [32]:
# Hashing term-frequency transformer: "words" -> "raw_features".
hashing_tf = HashingTF(
    inputCol="words",
    outputCol="raw_features",
)

In [33]:
# Add the hashed term-frequency vectors to the regex-tokenized frame.
featurized_data = hashing_tf.transform(dataset=rg_tokenized)

In [34]:
# Inspect the frame now carrying the "raw_features" column.
featurized_data.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|        raw_features|
+---+--------------------+--------------------+--------------------+
|  0|Hi I am testing s...|[hi, i, am, testi...|(262144,[24417,49...|
|  1|Python is better ...|[python, is, bett...|(262144,[1836,158...|
|  2|Logistic,regressi...|[logistic, regres...|(262144,[13671,76...|
+---+--------------------+--------------------+--------------------+



In [35]:
# IDF estimator: consumes "raw_features" and will emit rescaled "features".
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)

In [36]:
# Fit the IDF estimator on the term-frequency frame to obtain a model.
idf_model = idf.fit(dataset=featurized_data)

In [37]:
# Apply the fitted IDF model, adding the "features" column.
rescaled_data = idf_model.transform(dataset=featurized_data)

In [38]:
# Final view: raw term-frequency vectors next to their IDF-rescaled counterparts.
rescaled_data.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+--------------------+
| id|            sentence|               words|        raw_features|            features|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|Hi I am testing s...|[hi, i, am, testi...|(262144,[24417,49...|(262144,[24417,49...|
|  1|Python is better ...|[python, is, bett...|(262144,[1836,158...|(262144,[1836,158...|
|  2|Logistic,regressi...|[logistic, regres...|(262144,[13671,76...|(262144,[13671,76...|
+---+--------------------+--------------------+--------------------+--------------------+

