In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Connect to the Spark master at spark://master:7077. getOrCreate() returns
# the already-active session when one exists, so re-running this cell does
# not build a second one.
spark = (
    SparkSession.builder
    .master('spark://master:7077')
    .appName("Test Spark Session")
    .getOrCreate()
)

# Display the session that the rest of the notebook uses. Calling
# spark.newSession() here would only create a separate session object that
# no later cell references.
spark

In [3]:
# Disabled alternative: create a bare SparkContext from an explicit SparkConf
# (same master URL, 1g executor memory) instead of going through SparkSession.
# conf = (SparkConf()
#          .setMaster("spark://master:7077")
#          .setAppName("Test Directory List")
#          .set("spark.executor.memory", "1g"))
# sc = SparkContext(conf = conf)

In [4]:
# /opt/data/song_data/
# └── A
#     ├── A
#     │   ├── A
#     │   │   ├── TRAAAAK128F9318786.json
#     │   │   ├── TRAAAAV128F421A322.json
#     │   │   ├── TRAAABD128F429CF47.json
#     │   │   ├── TRAAACN128F9355673.json

# Explicit schema: four nullable string columns taken from each song record.
song_fields = ("artist_id", "artist_name", "song_id", "title")
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in song_fields]
)

# Load the JSON files found in each sub-directory of A/A with that schema.
song_glob = '/opt/data/song_data/A/A/*/*.json'
df = spark.read.json(song_glob, schema=schema)
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)



In [5]:
# Preview a few rows only: toPandas() collects the entire result into driver
# memory, so cap the row count before converting.
df.limit(10).toPandas()

Unnamed: 0,artist_id,artist_name,song_id,title
0,ARSUVLW12454A4C8B8,Royal Philharmonic Orchestra/Sir Thomas Beecham,SOBTCUI12A8AE48B70,Faust: Ballet Music (1959 Digital Remaster): V...
1,ARXQC081187FB4AD42,William Shatner_ David Itkin_ The Arkansas Sym...,SOXRPUH12AB017F769,Exodus: Part I: Moses and Pharaoh
2,ARWUNH81187FB4A3E0,Trick Daddy,SOVNKJI12A8C13CB0D,Take It To Da House (Featuring The Slip N' Sli...
3,ARTC1LV1187B9A4858,The Bonzo Dog Band,SOAFBCP12A8C13CC7D,King Of Scurf (2007 Digital Remaster)
4,ARA23XO1187B9AF18F,The Smithereens,SOKTJDS12AF72A25E5,Drown In My Own Tears (24-Bit Digitally Remast...
5,ARLRWBW1242077EB29,Mikhail Pletnev,SOYVBGZ12A6D4F92A8,Piano Sonata No. 21 in C 'Waldstein' Op. 53: I...
6,ARV3PXE1187B98E680,John Brown's Body,SOQAUGD12A58A7A92D,The Gold (Dubmatix Runnin' Remix) [Bonus Track]
7,AR5LZJD1187FB4C5E5,Britt Nicole,SOGXFIF12A58A78CC4,Hanging On (Medium Key Performance Track Witho...
8,AR6PJ8R1187FB5AD70,Shakira Featuring Wyclef Jean,SOEHWGF12A6D4F8B2B,Hips Don't Lie (featuring Wyclef Jean)
9,ARWDPT81187B99C656,The Kooks,SOYQDUJ12A8C13F773,Shine On (Acoustic Version From Q101_ Chicago)


In [7]:
# /opt/data/log_data/
# └── 2018
#     └── 11
#         ├── 2018-11-01-events.json
#

# No schema is passed here, so Spark infers the column types from the files.
log_glob = '/opt/data/log_data/*/*/*.json'
log_reader = spark.read
df = log_reader.json(log_glob)
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [8]:
# Preview a few rows only: toPandas() collects the entire result into driver
# memory, so cap the row count before converting.
df.limit(10).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.541017e+12,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.541017e+12,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,Train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.541017e+12,583,Marry Me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1.540872e+12,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.541060e+12,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12
5,Sony Wonder,Logged In,Samuel,M,0,Gonzalez,218.06975,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.540493e+12,597,Blackbird,200,1542253449796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",61
6,,Logged In,Samuel,M,1,Gonzalez,,free,"Houston-The Woodlands-Sugar Land, TX",GET,About,1.540493e+12,597,,200,1542253460796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",61
7,,Logged Out,,,0,,,paid,,PUT,Login,,602,,307,1542260074796,,
8,,Logged In,Tegan,F,1,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.540794e+12,602,,200,1542260277796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",80
9,Van Halen,Logged In,Tegan,F,2,Levine,289.38404,paid,"Portland-South Portland, ME",PUT,NextSong,1.540794e+12,602,Best Of Both Worlds (Remastered Album Version),200,1542260935796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",80


In [10]:
# Shut the session down now that the exploration is finished; any later Spark
# call needs the session-creation cell to be run again first.
spark.stop()