In [16]:

from pyspark.sql import SparkSession
from pyspark.sql.types import *

def readdata(spark, base_path='gs://dataproc-staging-us-central1-558853654924-mop7md33/notebooks/jupyter'):
  """Demonstrate several ways of creating and loading Spark DataFrames.

  Each step prints its result (rows and/or schema) to stdout; nothing is
  returned.

  Args:
    spark: An active SparkSession used for every read.
    base_path: Directory (without trailing slash) that holds ``player.csv``,
      ``people.json`` and ``player_attributes.csv``. Defaults to the
      notebook's original GCS staging location.
  """
  base_path = base_path.rstrip('/')
  player_csv = base_path + '/player.csv'

  # Create a dataframe manually, from a list of tuples. No column names are
  # supplied, so Spark assigns the default names (_1, _2).
  df = spark.createDataFrame([(1,'Nguyen'),(2,'Dinh'),(3,'Vinh')])
  print(type(df))
  df.show()
  df.printSchema()

  # Loading DataFrames: CSV method 1 using spark.read.csv('path')
  player_df = spark.read.option('header', True).csv(player_csv)
  player_df.show(5, False)
  player_df.printSchema()

  # Loading DataFrames: CSV method 2 using spark.read.format('csv').load('path')
  # The header option is set here as well: this is the same file as method 1,
  # and without it the header line would be read as a data row.
  player_df = spark.read.format("csv").option('header', True).load(player_csv)
  player_df.show(5, False)
  player_df.printSchema()

  # Loading JSON
  people_jsondf = spark.read.json(base_path + '/people.json')
  people_jsondf.printSchema()
  people_jsondf.show(5)

  # Other formats, such as Parquet, can be loaded in the same way.

  # Infer the schema: Spark makes an extra pass over the data to guess types.
  player_headersdF = (
    spark.read
    .option("inferSchema", "true")
    .option('header', True)
    .csv(player_csv)
  )
  player_headersdF.printSchema()
  player_headersdF.show(5)

  # Best practice is to define the schema explicitly.
  playerSchema = StructType([
    StructField("id", IntegerType()),
    StructField("player_api_id", IntegerType()),
    StructField("player_name", StringType()),
    StructField("player_fifa_api_id", IntegerType()),
    StructField("birthday", TimestampType()),
    StructField("height", FloatType()),
    StructField("weight", FloatType()),
  ])

  # NOTE(review): the schema above lists the player.csv columns, yet the file
  # read here is player_attributes.csv, and no 'header' option is set.
  # Confirm the intended file and whether it has a header row.
  player_schemadf = spark.read.schema(playerSchema).csv(base_path + '/player_attributes.csv')
  player_schemadf.printSchema()
  # Bare attribute expressions display nothing inside a function, so print
  # them explicitly.
  print(player_schemadf.schema)
  print(player_schemadf.dtypes)
  print(player_schemadf.columns)



if __name__ == "__main__":
  # Script entry point: create (or reuse) a session, run the demo, and always
  # stop the session afterwards, even if one of the reads fails.
  spark = SparkSession.builder.appName("Simple with Session").getOrCreate()
  try:
    readdata(spark)
  finally:
    spark.stop()

<class 'pyspark.sql.dataframe.DataFrame'>
+---+-----+
| _1|   _2|
+---+-----+
|  1| Andy|
|  2|mandy|
|  3|sandy|
+---+-----+

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)

+---+-------------+------------------+------------------+-------------------+------+------+
|id |player_api_id|player_name       |player_fifa_api_id|birthday           |height|weight|
+---+-------------+------------------+------------------+-------------------+------+------+
|1  |505942       |Aaron Appindangoye|218353            |1992-02-29 00:00:00|182.88|187   |
|2  |155782       |Aaron Cresswell   |189615            |1989-12-15 00:00:00|170.18|146   |
|3  |162549       |Aaron Doran       |186170            |1991-05-13 00:00:00|170.18|163   |
|4  |30572        |Aaron Galindo     |140161            |1982-05-08 00:00:00|182.88|198   |
|5  |23780        |Aaron Hughes      |17725             |1979-11-08 00:00:00|182.88|154   |
+---+-------------+------------------+------------------+--------