In [1]:
import findspark
# Run before the pyspark imports in the next cell. findspark.init() is expected
# to make the local Spark installation importable — NOTE(review): confirm that
# SPARK_HOME (or a discoverable install) is set in the target environment.
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, split, explode
from pyspark.sql.types import StringType, DateType, TimestampType, ShortType
from pyspark.sql.types import IntegerType, LongType, StructField, StructType

In [3]:
# Create the session named "mainApp", or reuse one already active in this
# kernel (getOrCreate), so re-running the cell does not build a second session.
spark = (
    SparkSession.builder
    .appName("mainApp")
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [4]:
# Explicit schemas used when reading the CSV inputs. Each field is declared as
# (column name, Spark type, nullable); only the `id` columns are non-nullable.

artists_schema = (
    StructType()
    .add('id', LongType(), False)
    .add('name', StringType(), True)
)

chart_artist_mapping_schema = (
    StructType()
    .add('id', LongType(), False)
    .add('artistId', LongType(), True)
)

charts_schema = (
    StructType()
    .add('id', LongType(), False)
    .add('title', StringType(), True)
    .add('position', ShortType(), True)
    .add('date', DateType(), True)
    .add('countryId', LongType(), True)
    .add('chartName', StringType(), True)
    .add('movement', StringType(), True)
    .add('streams', LongType(), True)
)

regions_schema = (
    StructType()
    .add('id', LongType(), False)
    .add('countryName', StringType(), True)
)

# (file stem, schema) pairs: `files/<stem>.csv` is read with the given schema
# and written back out as `files/<stem>.parquet`.
it = [
    ('artists', artists_schema),
    ('chart_artist_mapping', chart_artist_mapping_schema),
    ('charts', charts_schema),
    ('regions', regions_schema),
]

for stem, schema in it:
    print(stem)

    csv_path = 'files/{}.csv'.format(stem)
    parquet_path = 'files/{}.parquet'.format(stem)

    # header=False: the first line is treated as data; column names and types
    # come from the explicit schema rather than from the file.
    df = spark.read.csv(csv_path, sep=',', header=False, schema=schema)

    # Overwrite any previous export so this cell can be re-run (e.g. Restart &
    # Run All); the default save mode raises if the target path already exists.
    # The format is named explicitly instead of relying on the session default.
    df.write.mode('overwrite').parquet(parquet_path)

    # Read the written Parquet back and preview two rows as a sanity check.
    spark.read.parquet(parquet_path).show(2)
    print('=' * 40)

artists
+---+-----------------+
| id|             name|
+---+-----------------+
|  1|             null|
|  2|อุ๋ย Buddha Bless|
+---+-----------------+
only showing top 2 rows

chart_artist_mapping
+------+--------+
|    id|artistId|
+------+--------+
| 15117|       1|
|186929|       2|
+------+--------+
only showing top 2 rows

charts
+-----+---------------+--------+----------+---------+---------+---------+-------+
|   id|          title|position|      date|countryId|chartName| movement|streams|
+-----+---------------+--------+----------+---------+---------+---------+-------+
|60788|Grab the Moment|      37|2017-05-10|       44|   top200|  MOVE_UP|  49783|
|60788|Grab the Moment|      41|2017-06-11|       44|   top200|MOVE_DOWN|  38374|
+-----+---------------+--------+----------+---------+---------+---------+-------+
only showing top 2 rows

regions
+---+-----------+
| id|countryName|
+---+-----------+
|  1|    Andorra|
|  2|  Argentina|
+---+-----------+
only showing top 2 rows



In [None]:
# Shut down the Spark session created earlier in the notebook; `spark` (and the
# `sc` context taken from it) should not be used after this cell runs.
spark.stop()