In [1]:
import pyspark
from pyspark.sql.session import SparkSession
from pyspark.sql.context import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
# Create (or reuse) the Spark session used by the whole notebook.
spark = SparkSession.builder.appName("practise").getOrCreate()
# SQLContext expects a SparkContext as its first argument, not a SparkSession.
# Pass the session's context and the session itself so both share one state.
sqlContext = SQLContext(spark.sparkContext, spark)



In [3]:
# Explicit schema for movies.csv: an integer id plus two nullable string columns.
schema = (
    StructType()
    .add("Id", IntegerType(), nullable=True)
    .add("title", StringType(), nullable=True)
    .add("genres", StringType(), nullable=True)
)

In [4]:
# Load the movies dataset, using the first row as the header and the explicit
# schema defined above instead of schema inference.
moviesDF = sqlContext.read.csv(
    path="G://Mi unidad//Big data//Spark//Advance_spark//movies.csv",
    schema=schema,
    header=True,
)
moviesDF.show()

+---+--------------------+--------------------+
| Id|               title|              genres|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Adventure|Animati...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|Comedy|Drama|Romance|
|  5|Father of the Bri...|              Comedy|
|  6|         Heat (1995)|Action|Crime|Thri...|
|  7|      Sabrina (1995)|      Comedy|Romance|
|  8| Tom and Huck (1995)|  Adventure|Children|
|  9| Sudden Death (1995)|              Action|
| 10|    GoldenEye (1995)|Action|Adventure|...|
| 11|American Presiden...|Comedy|Drama|Romance|
| 12|Dracula: Dead and...|       Comedy|Horror|
| 13|        Balto (1995)|Adventure|Animati...|
| 14|        Nixon (1995)|               Drama|
| 15|Cutthroat Island ...|Action|Adventure|...|
| 16|       Casino (1995)|         Crime|Drama|
| 17|Sense and Sensibi...|       Drama|Romance|
| 18|   Four Rooms (1995)|              

In [5]:
# Print the DataFrame schema as a tree (column name, type, nullability) to
# confirm that the explicit schema was applied when reading the CSV.
moviesDF.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [6]:
# List the column names of the DataFrame.
moviesDF.columns

['Id', 'title', 'genres']

In [7]:
# Project only the title and genres columns and display the first rows.
moviesDF.select("title", "genres").show()

+--------------------+--------------------+
|               title|              genres|
+--------------------+--------------------+
|    Toy Story (1995)|Adventure|Animati...|
|      Jumanji (1995)|Adventure|Childre...|
|Grumpier Old Men ...|      Comedy|Romance|
|Waiting to Exhale...|Comedy|Drama|Romance|
|Father of the Bri...|              Comedy|
|         Heat (1995)|Action|Crime|Thri...|
|      Sabrina (1995)|      Comedy|Romance|
| Tom and Huck (1995)|  Adventure|Children|
| Sudden Death (1995)|              Action|
|    GoldenEye (1995)|Action|Adventure|...|
|American Presiden...|Comedy|Drama|Romance|
|Dracula: Dead and...|       Comedy|Horror|
|        Balto (1995)|Adventure|Animati...|
|        Nixon (1995)|               Drama|
|Cutthroat Island ...|Action|Adventure|...|
|       Casino (1995)|         Crime|Drama|
|Sense and Sensibi...|       Drama|Romance|
|   Four Rooms (1995)|              Comedy|
|Ace Ventura: When...|              Comedy|
|  Money Train (1995)|Action|Com

In [8]:
# Summary statistics for every column. String columns are summarised too:
# they get a count and min/max, while mean and stddev come out as null.
moviesDF.describe().show()

+-------+------------------+--------------------+------------------+
|summary|                Id|               title|            genres|
+-------+------------------+--------------------+------------------+
|  count|              9742|                9742|              9742|
|   mean|42200.353623485935|                null|              null|
| stddev| 52160.49485443825|                null|              null|
|    min|                 1|"11'09""01 - Sept...|(no genres listed)|
|    max|            193609|À nous la liberté...|           Western|
+-------+------------------+--------------------+------------------+



In [9]:
# Add a derived column holding twice the movie Id.
moviesDF = moviesDF.withColumn(colName="New Col", col=moviesDF.Id * 2)
moviesDF.show()

+---+--------------------+--------------------+-------+
| Id|               title|              genres|New Col|
+---+--------------------+--------------------+-------+
|  1|    Toy Story (1995)|Adventure|Animati...|      2|
|  2|      Jumanji (1995)|Adventure|Childre...|      4|
|  3|Grumpier Old Men ...|      Comedy|Romance|      6|
|  4|Waiting to Exhale...|Comedy|Drama|Romance|      8|
|  5|Father of the Bri...|              Comedy|     10|
|  6|         Heat (1995)|Action|Crime|Thri...|     12|
|  7|      Sabrina (1995)|      Comedy|Romance|     14|
|  8| Tom and Huck (1995)|  Adventure|Children|     16|
|  9| Sudden Death (1995)|              Action|     18|
| 10|    GoldenEye (1995)|Action|Adventure|...|     20|
| 11|American Presiden...|Comedy|Drama|Romance|     22|
| 12|Dracula: Dead and...|       Comedy|Horror|     24|
| 13|        Balto (1995)|Adventure|Animati...|     26|
| 14|        Nixon (1995)|               Drama|     28|
| 15|Cutthroat Island ...|Action|Adventure|...| 

In [10]:
# Remove the demo column again; the result of drop() is rebound to moviesDF.
moviesDF = moviesDF.drop("New Col")
moviesDF.show()

+---+--------------------+--------------------+
| Id|               title|              genres|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Adventure|Animati...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|Comedy|Drama|Romance|
|  5|Father of the Bri...|              Comedy|
|  6|         Heat (1995)|Action|Crime|Thri...|
|  7|      Sabrina (1995)|      Comedy|Romance|
|  8| Tom and Huck (1995)|  Adventure|Children|
|  9| Sudden Death (1995)|              Action|
| 10|    GoldenEye (1995)|Action|Adventure|...|
| 11|American Presiden...|Comedy|Drama|Romance|
| 12|Dracula: Dead and...|       Comedy|Horror|
| 13|        Balto (1995)|Adventure|Animati...|
| 14|        Nixon (1995)|               Drama|
| 15|Cutthroat Island ...|Action|Adventure|...|
| 16|       Casino (1995)|         Crime|Drama|
| 17|Sense and Sensibi...|       Drama|Romance|
| 18|   Four Rooms (1995)|              

In [11]:
# Rename columns: genres -> Genre, then title -> Title (same order as before).
moviesDF = (
    moviesDF
    .withColumnRenamed("genres", "Genre")
    .withColumnRenamed("title", "Title")
)
moviesDF.show()

+---+--------------------+--------------------+
| Id|               Title|               Genre|
+---+--------------------+--------------------+
|  1|    Toy Story (1995)|Adventure|Animati...|
|  2|      Jumanji (1995)|Adventure|Childre...|
|  3|Grumpier Old Men ...|      Comedy|Romance|
|  4|Waiting to Exhale...|Comedy|Drama|Romance|
|  5|Father of the Bri...|              Comedy|
|  6|         Heat (1995)|Action|Crime|Thri...|
|  7|      Sabrina (1995)|      Comedy|Romance|
|  8| Tom and Huck (1995)|  Adventure|Children|
|  9| Sudden Death (1995)|              Action|
| 10|    GoldenEye (1995)|Action|Adventure|...|
| 11|American Presiden...|Comedy|Drama|Romance|
| 12|Dracula: Dead and...|       Comedy|Horror|
| 13|        Balto (1995)|Adventure|Animati...|
| 14|        Nixon (1995)|               Drama|
| 15|Cutthroat Island ...|Action|Adventure|...|
| 16|       Casino (1995)|         Crime|Drama|
| 17|Sense and Sensibi...|       Drama|Romance|
| 18|   Four Rooms (1995)|              

In [13]:
"""drop na"""
#moviesDF.na.drop() 
#moviesDF.na.drop(how ="any", subset = ["Genre"]) It only drop if find NA in a particular column

"""Filling missing values"""
#moviesDF.na.fill(moviesDF.agg({"Id":"mean"}),"Id")

'drop na'

45.:!0