In [2]:
# Print the installed Scala version.
# NOTE(review): presumably checked so it can be matched against the `_2.12`
# suffix of the connector jar configured for the Spark session - confirm.
!scala -version

Scala code runner version 2.12.10 -- Copyright 2002-2019, LAMP/EPFL and Lightbend, Inc.


In [3]:
from google.cloud import bigquery
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [4]:
# Build the Spark session; `spark.jars` points at the BigQuery connector jar
# so that the 'bigquery' read format is available.
spark = (
    SparkSession.builder
    .appName('1.3. BigQuery Storage &  Spark MLlib - Python')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar')
    .getOrCreate()
)

spark.version

'2.4.5'

In [5]:
# Turn on eager evaluation so a DataFrame left as a cell's last expression is rendered.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Retrieve Reddit Data from BigQuery

In [28]:
# All columns of the 2018 reddit post tables, restricted to the 'technology'
# subreddit and to posts with a score above 10.
QUERY = (
    "\n"
    "SELECT *\n"
    "FROM `fh-bigquery.reddit_posts.2018_*`\n"
    "WHERE subreddit = 'technology' AND score >10\n"
)

In [29]:
# Get a Spark session (getOrCreate reuses one that is already running) and
# create the BigQuery client used to submit the query.
spark = (
    SparkSession.builder
    .appName('Query Results')
    .getOrCreate()
)
bq = bigquery.Client()

In [30]:
print('Querying BigQuery')
# Submit QUERY as a BigQuery job; the job's destination table is what Spark
# reads in the next cell.
query_job = bq.query(QUERY)
# Left as the cell's last expression, so the returned RowIterator is displayed.
# NOTE(review): presumably called to wait for the job to finish before the
# destination table is read - confirm against the client documentation.
query_job.result()

Querying BigQuery


<google.cloud.bigquery.table.RowIterator at 0x7fc78519d2d0>

In [31]:
# Read the query job's destination table into a Spark DataFrame through the
# 'bigquery' data source.
df = (
    spark.read.format('bigquery')
    .option('dataset', query_job.destination.dataset_id)
    .load(query_job.destination.table_id)
)

# Remove Special Characters

In [10]:
def ascii_ignore(x):
    """Return ``x`` with every non-ASCII character removed.

    Args:
        x: The text to clean, or ``None``.

    Returns:
        ``x`` with all characters that cannot be encoded as ASCII dropped,
        or ``None`` when ``x`` is ``None``.
    """
    # A null column value is handed to a Python UDF as None; pass it through
    # instead of failing with AttributeError on `None.encode`.
    if x is None:
        return None
    return x.encode('ascii', 'ignore').decode('ascii')

# Wrap ascii_ignore as a Spark UDF; the return type is spelled out (string is
# also what udf() uses when no return type is given).
ascii_udf = udf(ascii_ignore, returnType="string")

In [32]:
# Strip non-ASCII characters from each title, trim surrounding whitespace and
# drop titles whose cleaned text is 10 characters or shorter.
# The cleaned text is kept under the original column name "title": selecting
# the raw `title` column here would discard the cleaning, leaving the special
# characters in the data that later stages read.
df_titles = (
    df.withColumn("title_no_ascii", ascii_udf("title"))
    .withColumn("title_no_spaces", trim(col("title_no_ascii")))
    .filter("length(title_no_spaces) > 10")
    .select(col("title_no_spaces").alias("title"))
    .cache()
)

In [21]:
df_titles.count()

126742

# Text Prepping

In [35]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler

In [39]:
# Split each title into a "words" array.
tokenizer = Tokenizer(inputCol="title", outputCol="words")

# Remove stop words from "words", writing the remainder to "filtered".
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

pipeline = Pipeline(stages=[tokenizer, remover])

# Fit and apply on df_titles, the single-column ("title") frame built earlier
# in this notebook. The previous input name, `df_1`, is not defined in any
# cell, so this cell failed with NameError on a fresh kernel.
model = pipeline.fit(df_titles)
df_2 = model.transform(df_titles)

In [41]:
# Print the first two rows without truncating long cell values.
df_2.show(n=2, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                                                                                                                                              

In [42]:
from pyspark.ml.clustering import LDA

In [None]:
# LDA reads a vector column (named "features" by default), but df_2 only holds
# string/array columns and no `dataset` variable is defined in this notebook.
# Build the LDA input from the stop-word-filtered tokens with HashingTF, which
# is already imported in the "Text Prepping" section.
# NOTE(review): HashingTF is an assumption about the intended vectorizer -
# confirm (a vocabulary-based vectorizer would make topics easier to inspect).
hashing_tf = HashingTF(inputCol="filtered", outputCol="features")
dataset = hashing_tf.transform(df_2)

# 10 topics, 10 iterations; a fixed seed keeps the fitted topics reproducible
# across re-runs.
lda = LDA(k=10, maxIter=10, seed=42)

model = lda.fit(dataset)

In [None]:
# Names of the columns handled as categorical. The pay_* series goes
# pay_0, pay_2 .. pay_6 (there is no 'pay_1' in the list).
categorical_columns = ['sex', 'education_level', 'marital_status', 'pay_0'] + [
    'pay_{0}'.format(i) for i in range(2, 7)
]

# Names of the columns handled as numeric: age, bill_amt_1..6, pay_amt_1..6.
numeric_col = (
    ['age']
    + ['bill_amt_{0}'.format(i) for i in range(1, 7)]
    + ['pay_amt_{0}'.format(i) for i in range(1, 7)]
)


# One StringIndexer per categorical column, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name))
    for name in categorical_columns
]

# One OneHotEncoder per indexed column, writing to "<indexed column>_encoded".
# dropLast=False is passed so the last category is not dropped.
encoders = [
    OneHotEncoder(
        dropLast=False,
        inputCol=indexed_name,
        outputCol="{0}_encoded".format(indexed_name),
    )
    for indexed_name in (indexer.getOutputCol() for indexer in indexers)
]


# Scale every numeric column on its own: wrap it in a one-element vector
# ("<column>_vec"), then standardize that vector into "<column>_scaled".
# The loop variable is called `name` so it does not read like the `col`
# function from pyspark.sql.functions.
scaled_numeric_col = [name + "_scaled" for name in numeric_col]
numeric_assemblers = [
    VectorAssembler(inputCols=[name], outputCol=name + "_vec")
    for name in numeric_col
]
scalers = [
    StandardScaler(inputCol=name + "_vec", outputCol=name + "_scaled")
    for name in numeric_col
]

# Combine the scaled numeric columns and the one-hot encoded columns into a
# single "features" vector.
assembler = VectorAssembler(
    inputCols=scaled_numeric_col + [encoder.getOutputCol() for encoder in encoders],
    outputCol="features",
)

# Stage order: index -> one-hot encode -> per-column vectors -> scale -> assemble.
pipeline = Pipeline(
    stages=indexers + encoders + numeric_assemblers + scalers + [assembler]
)

# NOTE(review): `df_cc_data2` is not defined in any visible cell of this
# notebook - confirm where it comes from before running this cell.
model = pipeline.fit(df_cc_data2)
df_assembler_output = model.transform(df_cc_data2)

df_assembler_output