# Players Example

In [24]:
# Batch-read (spark.read, not a stream) the "players" Kafka topic, from the
# earliest available offset up to the latest one.
raw_players = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "players")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

In [25]:
# Mark the Kafka-backed DataFrame for caching before it is reused below.
# The call returns a DataFrame, which is what this cell displays.
raw_players.cache()

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [26]:
# Print the column names and types of the raw Kafka records.
raw_players.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [27]:
# The Kafka payload column is binary: keep only it, cast to a string,
# then preview the first four messages.
players = raw_players.select(raw_players["value"].cast("string"))
players.show(n=4)

+--------------------+
|               value|
+--------------------+
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
+--------------------+
only showing top 4 rows



In [28]:
# Persist the decoded messages as parquet. Overwrite any earlier output so the
# cell can be re-run: the default save mode raises an error when /tmp/players
# already exists.
players.write.mode("overwrite").parquet("/tmp/players")

In [29]:
# Load the messages back from the parquet directory and preview 20 rows
read_players = spark.read.parquet("/tmp/players")
read_players.show(20)

+--------------------+
|               value|
+--------------------+
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
|{"Competition":"W...|
+--------------------+
only showing top 20 rows



In [42]:
# Parse the JSON strings into a struct column
from pyspark.sql.functions import from_json, col

# Infer the schema by letting Spark read the message strings as JSON documents.
json_schema = spark.read.json(
    read_players.rdd.map(lambda record: record.value)
).schema

# Add a 'json' struct column parsed from 'value' using the inferred schema.
read_players = read_players.withColumn(
    'json',
    from_json(col('value'), json_schema),
)

# Show two of the nested fields.
read_players.select(
    read_players.json.Team,
    read_players.json.FullName,
).show()

+---------+--------------------+
|json.Team|       json.FullName|
+---------+--------------------+
|Argentina|        Ãngel Bossio|
|Argentina|        Juan Botasso|
|Argentina|      Roberto Cherro|
|Argentina|   Alberto Chividini|
|Argentina|                    |
|Argentina|                    |
|Argentina|       Juan Evaristo|
|Argentina|      Mario Evaristo|
|Argentina|     Manuel Ferreira|
|Argentina|          Luis Monti|
|Argentina|                    |
|Argentina|   Rodolfo Orlandini|
|Argentina|Fernando Paternoster|
|Argentina|   Natalio Perinetti|
|Argentina|     Carlos Peucelle|
|Argentina|     Edmundo Piaggio|
|Argentina|  Alejandro Scopelli|
|Argentina|      Carlos Spadaro|
|Argentina|                    |
|Argentina|                    |
+---------+--------------------+
only showing top 20 rows



# Commits Example

In [1]:
import json
import pandas as pd
from pyspark.sql.functions import explode, split
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
import warnings

In [2]:
# Batch-read (spark.read, not a stream) the "commits" Kafka topic, from the
# earliest available offset up to the latest one.
raw_commits = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "commits")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

In [3]:
# Mark the Kafka-backed DataFrame for caching before it is reused below.
# The call returns a DataFrame, which is what this cell displays.
raw_commits.cache()

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [4]:
# Keep only the binary Kafka payload column, cast to a string.
commits = raw_commits.select(raw_commits["value"].cast("string"))

In [5]:
# Preview the decoded messages (long values are truncated in the display).
commits.show()

+--------------------+
|               value|
+--------------------+
|{"sha":"bd34b8dd2...|
|{"sha":"61edf3fa9...|
|{"sha":"8eff744ee...|
|{"sha":"b4742c125...|
|{"sha":"9a4576e75...|
|{"sha":"b5560d842...|
|{"sha":"79ece3598...|
|{"sha":"f06deb828...|
|{"sha":"0c9eaceda...|
|{"sha":"476b36770...|
|{"sha":"239278fd3...|
|{"sha":"98b36e74b...|
|{"sha":"c3cbbdd8a...|
|{"sha":"18879fb99...|
|{"sha":"bdaddcf10...|
|{"sha":"7b81a836c...|
|{"sha":"c538237f4...|
|{"sha":"4a6241be0...|
|{"sha":"1900c7bca...|
|{"sha":"578d53623...|
+--------------------+
only showing top 20 rows



In [6]:
# Look at one complete, untruncated message. first() fetches a single row
# instead of collecting the entire topic onto the driver just to index
# element 0. Note: it returns None (rather than raising IndexError) when the
# DataFrame has no rows.
commits.first()

Row(value='{"sha":"bd34b8dd2e441440928bfd51bfbfa5facb424557","commit":{"author":{"name":"Paul Chvostek","email":"paul+gitkraken@it.ca","date":"2018-01-04T19:35:46Z"},"committer":{"name":"Nico Williams","email":"nico@cryptonector.com","date":"2018-01-04T21:50:20Z"},"message":"fix FreeBSD install instructions","tree":{"sha":"9df965dfa5f4a92a4aaca3fb14586ab4813c9bf7","url":"https://api.github.com/repos/stedolan/jq/git/trees/9df965dfa5f4a92a4aaca3fb14586ab4813c9bf7"},"url":"https://api.github.com/repos/stedolan/jq/git/commits/bd34b8dd2e441440928bfd51bfbfa5facb424557","comment_count":0,"verification":{"verified":false,"reason":"unsigned","signature":null,"payload":null}},"url":"https://api.github.com/repos/stedolan/jq/commits/bd34b8dd2e441440928bfd51bfbfa5facb424557","html_url":"https://github.com/stedolan/jq/commit/bd34b8dd2e441440928bfd51bfbfa5facb424557","comments_url":"https://api.github.com/repos/stedolan/jq/commits/bd34b8dd2e441440928bfd51bfbfa5facb424557/comments","author":null,"comm

## Spark Inferred Schema

In [7]:
# Decode each message with json.loads and let toDF() infer a schema from the
# resulting Python dicts.
# NOTE(review): nested objects are inferred as map columns (see the printed
# schema), which presumably is why commit.message comes back null when queried.
extracted_commits = (
    commits.rdd
    .map(lambda record: json.loads(record.value))
    .toDF()
)



In [8]:
# Preview the DataFrame built with the inferred schema.
extracted_commits.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              author|        comments_url|              commit|           committer|            html_url|             parents|                 sha|                 url|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                null|https://api.githu...|Map(author -> Map...|Map(gists_url -> ...|https://github.co...|[Map(sha -> 61edf...|bd34b8dd2e4414409...|https://api.githu...|
|Map(gists_url -> ...|https://api.githu...|Map(author -> Map...|Map(gists_url -> ...|https://github.co...|[Map(sha -> 8eff7...|61edf3fa93f6177ef...|https://api.githu...|
|Map(gists_url -> ...|https://api.githu...|Map(author -> Map...|Map(gists_url -> ...|https://github.co...|[Map(sha -> b4742...|8eff744eecb9ab2f4...|ht

In [10]:
# Expose the DataFrame to Spark SQL under the name 'commits'.
# createOrReplaceTempView is the non-deprecated replacement for
# registerTempTable and is safe to re-run.
extracted_commits.createOrReplaceTempView('commits')

In [11]:
# Query top-level and nested fields of the 'commits' table through SQL.
spark.sql("select sha, commit.message, commit.author.name from commits").show()

+--------------------+-------+----------------+
|                 sha|message|            name|
+--------------------+-------+----------------+
|bd34b8dd2e4414409...|   null|   Paul Chvostek|
|61edf3fa93f6177ef...|   null|     Larry Aasen|
|8eff744eecb9ab2f4...|   null|Nicolas Williams|
|b4742c12570481786...|   null|    David Fetter|
|9a4576e7567dd38b9...|   null|Nicolas Williams|
|b5560d8420d330c4f...|   null|Nicolas Williams|
|79ece359819cdd7d0...|   null|William Langford|
|f06deb828a318536b...|   null|William Langford|
|0c9eacedaae1e0d53...|   null|William Langford|
|476b36770d9337381...|   null| Eric Bréchemier|
|239278fd3a02dc1ae...|   null|    Andrew Speed|
|98b36e74b8174da66...|   null|William Langford|
|c3cbbdd8a201aafdc...|   null|William Langford|
|18879fb99367924cd...|   null|Nicolas Williams|
|bdaddcf10730e2a26...|   null|         trantor|
|7b81a836c31500e68...|   null|William Langford|
|c538237f4e4c381d3...|   null|Nicolas Williams|
|4a6241be0697bbe4e...|   null|Nicolas Wi

In [10]:
# Inspect the schema that toDF() inferred from the decoded dicts.
extracted_commits.printSchema()

root
 |-- author: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- comments_url: string (nullable = true)
 |-- commit: map (nullable = true)
 |    |-- key: string
 |    |-- value: map (valueContainsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)
 |-- committer: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- html_url: string (nullable = true)
 |-- parents: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)
 |-- sha: string (nullable = true)
 |-- url: string (nullable = true)



# Force the Schema

In [12]:
# Explicit schema covering only the fields used below: four top-level string
# columns plus a nested commit struct (message, author{name, email, date}).
# All fields are nullable strings.
final_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ('sha', 'url', 'html_url', 'comments_url')
    ]
    + [
        StructField('commit', StructType([
            StructField('message', StringType(), True),
            StructField('author', StructType([
                StructField(author_field, StringType(), True)
                for author_field in ('name', 'email', 'date')
            ])),
        ])),
    ]
)

In [13]:
# Decode each message with json.loads, then build the DataFrame against the
# explicit schema instead of letting Spark infer one.
focused_extracted_commits = (
    commits.rdd
    .map(lambda record: json.loads(record.value))
    .toDF(schema=final_schema)
)

In [14]:
# Preview the first five rows built with the explicit schema.
focused_extracted_commits.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 sha|                 url|            html_url|        comments_url|              commit|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|bd34b8dd2e4414409...|https://api.githu...|https://github.co...|https://api.githu...|[fix FreeBSD inst...|
|61edf3fa93f6177ef...|https://api.githu...|https://github.co...|https://api.githu...|[Updated the comp...|
|8eff744eecb9ab2f4...|https://api.githu...|https://github.co...|https://api.githu...|[Update AUTHORS,[...|
|b4742c12570481786...|https://api.githu...|https://github.co...|https://api.githu...|[Added rawfile

I...|
|9a4576e7567dd38b9...|https://api.githu...|https://github.co...|https://api.githu...|[Revert "reduce: ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [20]:
# Drill into the nested struct of the first row to read the author's name.
focused_extracted_commits.take(1)[0].commit.author.name

'Paul Chvostek'

In [21]:
# Expose the DataFrame to Spark SQL under the name 'focused_commits'.
# createOrReplaceTempView is the non-deprecated replacement for
# registerTempTable and is safe to re-run.
focused_extracted_commits.createOrReplaceTempView('focused_commits')

In [22]:
# Confirm that the nested fields are structs (not maps) under the explicit schema.
focused_extracted_commits.printSchema()

root
 |-- sha: string (nullable = true)
 |-- url: string (nullable = true)
 |-- html_url: string (nullable = true)
 |-- comments_url: string (nullable = true)
 |-- commit: struct (nullable = true)
 |    |-- message: string (nullable = true)
 |    |-- author: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- date: string (nullable = true)



In [23]:
# Same query as against 'commits', now run on the table with the explicit schema.
spark.sql("select sha, commit.message, commit.author.name from focused_commits").show()

+--------------------+--------------------+----------------+
|                 sha|             message|            name|
+--------------------+--------------------+----------------+
|bd34b8dd2e4414409...|fix FreeBSD insta...|   Paul Chvostek|
|61edf3fa93f6177ef...|Updated the compi...|     Larry Aasen|
|8eff744eecb9ab2f4...|      Update AUTHORS|Nicolas Williams|
|b4742c12570481786...|Added rawfile

In...|    David Fetter|
|9a4576e7567dd38b9...|Revert "reduce: h...|Nicolas Williams|
|b5560d8420d330c4f...|Fix import as $da...|Nicolas Williams|
|79ece359819cdd7d0...|Fix hang for slur...|William Langford|
|f06deb828a318536b...|           Fix #1534|William Langford|
|0c9eacedaae1e0d53...|Actually fix the ...|William Langford|
|476b36770d9337381...|Keep object keys ...| Eric Bréchemier|
|239278fd3a02dc1ae...|Use rvm to instal...|    Andrew Speed|
|98b36e74b8174da66...|Fix strptime test...|William Langford|
|c3cbbdd8a201aafdc...|Fix memory leaks ...|William Langford|
|18879fb99367924cd...|Fi