In [2]:
import pyspark
from pyspark import SparkContext, SparkConf
import numpy as np

In [3]:
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql import functions as fn

In [7]:
# Spark configuration: application name plus a local master using all cores.
conf = SparkConf()
conf.setAppName('hello_world')
conf.setMaster('local[*]')

In [8]:
# Reuse an already-running context when this cell is re-executed; constructing
# a second SparkContext in the same process raises ValueError.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# RDD.toDF() is only attached to RDDs once a SQLContext/SparkSession has been
# created, so the SQL entry point must exist before any RDD is converted to a
# DataFrame. A plain SQLContext is enough here (no Hive features are used).
sqlContext = SQLContext(sc)

In [10]:
# Toy document rows, one list per document.
document_rows = [
    [1, 'cats are cute', 0],
    [2, 'dogs are playfull', 0],
    [3, 'lions are big', 1],
    [4, 'cars are fast', 1],
]

# Toy user rows, one list per user.
user_rows = [
    [0, 'Alice', 20],
    [1, 'Bob', 23],
    [2, 'Charles', 32],
]

# Distribute both collections as RDDs.
documents_rdd = sc.parallelize(document_rows)
users_rdd = sc.parallelize(user_rows)

In [11]:
# Convert the RDDs to DataFrames by attaching column names.
document_columns = ['doc_id', 'text', 'user_id']
user_columns = ['user_id', 'name', 'age']

df_d = documents_rdd.toDF(document_columns)
df_u = users_rdd.toDF(user_columns)

In [12]:
# Print the column names, types and nullability of the documents DataFrame.
df_d.printSchema()

root
 |-- doc_id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: long (nullable = true)



In [16]:
# Average age over all users, returned as a single-row DataFrame.
mean_age_col = fn.avg('age')
user_age_df = df_u.select(mean_age_col)
user_age_df.show()

+--------+
|avg(age)|
+--------+
|    25.0|
+--------+



In [18]:
# Left join keeps every user, including users who have no documents
# (their document columns come back as null).
df_all = df_u.join(
    other=df_d,
    on='user_id',
    how='left',
)
df_all.show()

+-------+-------+---+------+-----------------+
|user_id|   name|age|doc_id|             text|
+-------+-------+---+------+-----------------+
|      0|  Alice| 20|     1|    cats are cute|
|      0|  Alice| 20|     2|dogs are playfull|
|      1|    Bob| 23|     3|    lions are big|
|      1|    Bob| 23|     4|    cars are fast|
|      2|Charles| 32|  null|             null|
+-------+-------+---+------+-----------------+



In [21]:
# Number of documents per user name. count('text') skips nulls, so a user
# with no documents is reported with a count of 0.
(
    df_all
    .groupBy('name')
    .agg(fn.count('text'))
    .show()
)

+-------+-----------+
|   name|count(text)|
+-------+-----------+
|Charles|          0|
|    Bob|          2|
|  Alice|          2|
+-------+-----------+



In [22]:
# Add a derived column holding the character length of each user's name.
(
    df_u
    .withColumn('name_length', fn.length('name'))
    .show()
)

+-------+-------+---+-----------+
|user_id|   name|age|name_length|
+-------+-------+---+-----------+
|      0|  Alice| 20|          5|
|      1|    Bob| 23|          3|
|      2|Charles| 32|          7|
+-------+-------+---+-----------+



# Tokenize

In [23]:
from pyspark.ml.feature import Tokenizer

In [30]:
# Tokenizer that reads the 'text' column and writes its tokens to 'word_tokens'.
tk = Tokenizer(inputCol='text', outputCol='word_tokens')

In [43]:
# Apply the tokenizer to the documents and display full (untruncated) cells.
(
    tk
    .transform(df_d)
    .show(truncate=False)
)

+------+-----------------+-------+---------------------+
|doc_id|text             |user_id|word_tokens          |
+------+-----------------+-------+---------------------+
|1     |cats are cute    |0      |[cats, are, cute]    |
|2     |dogs are playfull|0      |[dogs, are, playfull]|
|3     |lions are big    |1      |[lions, are, big]    |
|4     |cars are fast    |1      |[cars, are, fast]    |
+------+-----------------+-------+---------------------+



In [32]:
from pyspark.ml.feature import CountVectorizer

In [33]:
# Count vectorizer that reads 'word_tokens' and writes term-count vectors
# to 'features'.
cv = CountVectorizer(inputCol='word_tokens', outputCol='features')

In [42]:
# Tokenize once and reuse the result for both fitting and transforming,
# instead of building the tokenizer transform twice in one expression.
tokens_df = tk.transform(df_d)

# Fitting learns the vocabulary; the fitted model then maps each token list
# to a sparse term-count vector.
cv_model = cv.fit(tokens_df)
cv_model.transform(tokens_df).show(truncate=False)

+------+-----------------+-------+---------------------+-------------------------+
|doc_id|text             |user_id|word_tokens          |features                 |
+------+-----------------+-------+---------------------+-------------------------+
|1     |cats are cute    |0      |[cats, are, cute]    |(9,[0,5,8],[1.0,1.0,1.0])|
|2     |dogs are playfull|0      |[dogs, are, playfull]|(9,[0,2,7],[1.0,1.0,1.0])|
|3     |lions are big    |1      |[lions, are, big]    |(9,[0,3,6],[1.0,1.0,1.0])|
|4     |cars are fast    |1      |[cars, are, fast]    |(9,[0,1,4],[1.0,1.0,1.0])|
+------+-----------------+-------+---------------------+-------------------------+



In [44]:
# Fit a fresh CountVectorizer model on the tokenized documents and display the
# vocabulary it learned (list position == feature index).
# NOTE(review): this is a separate fit from any other cell; the outputs in
# this notebook show the index assigned to a term can differ between fits,
# so only compare these indices with vectors produced by the same fitted model.
vocabulary_model = cv.fit(tk.transform(df_d))
vocabulary_model.vocabulary

['are', 'fast', 'playfull', 'big', 'cars', 'cute', 'lions', 'dogs', 'cats']

# Pipeline

In [48]:
from pyspark.ml import Pipeline


# Chain the tokenizer and the count vectorizer into a single estimator.
pipeline_stages = [tk, cv]
pipeline_cv = Pipeline(stages=pipeline_stages)

# Fitting the pipeline yields a fitted model that can transform DataFrames.
pipeline_cv_transformer = pipeline_cv.fit(df_d)

# Run the documents through both stages and display full (untruncated) cells.
(
    pipeline_cv_transformer
    .transform(df_d)
    .show(truncate=False)
)

+------+-----------------+-------+---------------------+-------------------------+
|doc_id|text             |user_id|word_tokens          |features                 |
+------+-----------------+-------+---------------------+-------------------------+
|1     |cats are cute    |0      |[cats, are, cute]    |(9,[0,1,3],[1.0,1.0,1.0])|
|2     |dogs are playfull|0      |[dogs, are, playfull]|(9,[0,2,5],[1.0,1.0,1.0])|
|3     |lions are big    |1      |[lions, are, big]    |(9,[0,4,8],[1.0,1.0,1.0])|
|4     |cars are fast    |1      |[cars, are, fast]    |(9,[0,6,7],[1.0,1.0,1.0])|
+------+-----------------+-------+---------------------+-------------------------+

