In [1]:
import findspark
findspark.init()

import pyspark
import random
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

In [2]:
import os

# The master URL can be overridden with the SPARK_MASTER_URL environment
# variable so the notebook is not tied to one cluster; the default is the
# original standalone master.
master_url = os.environ.get("SPARK_MASTER_URL", "spark://10.102.2.122:7077")
conf = SparkConf().setAppName("Vectorizer").setMaster(master_url)

# getOrCreate() reuses an already-running SparkContext, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once".
# Note: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

# SQL entry point used by later cells to build DataFrames.
sqlContext = SQLContext(sc)

print("Spark Version: " + sc.version)
print("PySpark Version: " + pyspark.__version__)

Spark Version: 2.3.2
PySpark Version: 2.3.2


In [12]:
from pyspark.ml.feature import CountVectorizer

# Toy corpus: every row pairs a document ID with that document's bag of words.
sentences = ["this is a test", "this is another test two"]
corpus_rows = [(doc_id, text.split(" ")) for doc_id, text in enumerate(sentences)]
df = sqlContext.createDataFrame(corpus_rows, ["id", "words"])

In [13]:
# Preview up to five rows of the toy corpus.
df.show(n=5)

+---+--------------------+
| id|               words|
+---+--------------------+
|  0| [this, is, a, test]|
|  1|[this, is, anothe...|
+---+--------------------+



In [18]:
# Estimator settings: read tokens from "words", write term counts to
# "bag_of_words_vector", keep at most 5 vocabulary terms, and require a term
# to appear in at least 1 document.
# NOTE: the toy corpus has six distinct tokens, so vocabSize=5 leaves one of
# them out of the learned vocabulary.
vectorizer_params = {
    "inputCol": "words",
    "outputCol": "bag_of_words_vector",
    "vocabSize": 5,
    "minDF": 1.0,
}
cv = CountVectorizer(**vectorizer_params)

# Learn the vocabulary from the corpus, then encode every row with it.
model = cv.fit(df)
result = model.transform(df)

# truncate=False prints the full vectors instead of abbreviating them.
result.show(truncate=False)

+---+------------------------------+-------------------------------+
|id |words                         |bag_of_words_vector            |
+---+------------------------------+-------------------------------+
|0  |[this, is, a, test]           |(5,[0,1,2,3],[1.0,1.0,1.0,1.0])|
|1  |[this, is, another, test, two]|(5,[0,1,2,4],[1.0,1.0,1.0,1.0])|
+---+------------------------------+-------------------------------+



In [19]:
# Default display: up to 20 rows, with long cell values abbreviated.
result.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|               words| bag_of_words_vector|
+---+--------------------+--------------------+
|  0| [this, is, a, test]|(5,[0,1,2,3],[1.0...|
|  1|[this, is, anothe...|(5,[0,1,2,4],[1.0...|
+---+--------------------+--------------------+



In [20]:
# Terms learned by the fitted model; a term's position in this list is the
# index it occupies in the bag_of_words_vector column.
model.vocabulary

['is', 'this', 'test', 'a', 'two']

In [21]:
# Convert the transformed Spark DataFrame to a pandas DataFrame for display.
# toPandas() loads every row into the driver, so use it only on small results.
result.toPandas()  

Unnamed: 0,id,words,bag_of_words_vector
0,0,"[this, is, a, test]","(1.0, 1.0, 1.0, 1.0, 0.0)"
1,1,"[this, is, another, test, two]","(1.0, 1.0, 1.0, 0.0, 1.0)"


In [53]:
import numpy as np

# Bring the vector column back to the driver (fine for this toy corpus).
vector_rows = result.select('bag_of_words_vector').collect()

# Densify each vector explicitly with toArray() instead of leaving NumPy to
# work out how to unpack Row/SparseVector objects, then flatten every
# document's counts into a single 1-D array (document order is preserved).
X = np.array(
    [row['bag_of_words_vector'].toArray() for row in vector_rows]
).ravel()

In [61]:
import os

# Output location can be overridden with the X_OUT_PATH environment variable
# so the notebook also runs on machines without this exact directory; the
# default is the original path.
out_path = os.environ.get(
    'X_OUT_PATH',
    'C:/MYLOCALFILES/JUPYTER_NOTEBOOKS/BASIC_NBS/PY_SPARK/X.out',
)

# X is flattened to 1-D upstream, so savetxt writes one value per line and the
# delimiter only comes into play if X is ever made 2-D.
np.savetxt(out_path, X, delimiter=',')