In [1]:
# A simple demo for working with SparkSQL and Tweets
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json

# Build a HiveContext on top of the SparkContext `sc`.
# `sc` is not defined in this notebook; presumably it is injected by the
# Spark kernel/session before the first cell runs.
hiveCtx = HiveContext(sc)
# Load the tweets from the JSON file.
# NOTE: the name `input` shadows Python's builtin of the same name; it is
# kept unchanged here in case other cells refer to it.
input = hiveCtx.read.json("tweet.json")
# Expose the loaded data to SQL under the table name "tweets"
input.registerTempTable("tweets")

# Ten most-retweeted tweets. DESC is required: ORDER BY sorts ascending by
# default, which would return the *least* retweeted rows instead.
topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount DESC LIMIT 10")
print(topTweets.collect())

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
39,application_1499674741924_0043,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
[Row(text=u'RT @PostGradProblem: In preparation for the NFL lockout, I will be spending twice as much time analyzing my fantasy baseball team during ...', retweetCount=41)]

In [2]:
# Pull the `text` field out of every Row of topTweets, working on the
# RDD that backs the query result.
tweetRows = topTweets.rdd
topTweetText = tweetRows.map(lambda tweet: tweet.text)
print(topTweetText.collect())

[u'RT @PostGradProblem: In preparation for the NFL lockout, I will be spending twice as much time analyzing my fantasy baseball team during ...']

In [3]:
# Register a Python UDF, callable from SQL as strLenPython, that returns
# len() of the value it is given, typed as an integer column.
# A SQL NULL reaches the Python function as None, and len(None) raises
# TypeError, so NULL input is mapped to NULL output instead of failing the job.
hiveCtx.registerFunction(
    "strLenPython",
    lambda x: len(x) if x is not None else None,
    IntegerType(),
)
# Apply the UDF to the text column of up to 10 rows of the tweets table
lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython(text) FROM tweets LIMIT 10")
print(lengthSchemaRDD.collect())

[Row(strLenPython(text)=140)]