In [11]:
from pyspark.sql import *

In [12]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example2") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [15]:
hero = spark.read.csv('/home/hasan/DATA SET/superhero-set/heroes_information.csv', header=True)


In [16]:
hero.show(5)

+---+-----------+------+---------+-----------------+----------+------+-----------------+----------+---------+------+
|_c0|       name|Gender|Eye color|             Race|Hair color|Height|        Publisher|Skin color|Alignment|Weight|
+---+-----------+------+---------+-----------------+----------+------+-----------------+----------+---------+------+
|  0|     A-Bomb|  Male|   yellow|            Human|   No Hair| 203.0|    Marvel Comics|         -|     good| 441.0|
|  1| Abe Sapien|  Male|     blue|    Icthyo Sapien|   No Hair| 191.0|Dark Horse Comics|      blue|     good|  65.0|
|  2|   Abin Sur|  Male|     blue|          Ungaran|   No Hair| 185.0|        DC Comics|       red|     good|  90.0|
|  3|Abomination|  Male|    green|Human / Radiation|   No Hair| 203.0|    Marvel Comics|         -|      bad| 441.0|
|  4|    Abraxas|  Male|     blue|    Cosmic Entity|     Black| -99.0|    Marvel Comics|         -|      bad| -99.0|
+---+-----------+------+---------+-----------------+----------+-

In [19]:
#column of the dataset
hero.columns

['_c0',
 'name',
 'Gender',
 'Eye color',
 'Race',
 'Hair color',
 'Height',
 'Publisher',
 'Skin color',
 'Alignment',
 'Weight']

In [20]:
#summary of the dataset
hero.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Eye color: string (nullable = true)
 |-- Race: string (nullable = true)
 |-- Hair color: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Skin color: string (nullable = true)
 |-- Alignment: string (nullable = true)
 |-- Weight: string (nullable = true)



In [22]:
#filtering male hero
hero.filter(hero.Gender=='Male').count()

505

In [23]:
#filtering female hero
hero.filter(hero.Gender=='Female').count()

200

### Grouping

In [24]:
#grouping with Race
race_group = hero.groupby('Race')\
            .count()\
            .show()


+------------------+-----+
|              Race|count|
+------------------+-----+
|          Neyaphem|    1|
|          Symbiote|    9|
|        Kryptonian|    7|
|           Ungaran|    1|
|           Martian|    1|
|         Strontian|    1|
| Human / Radiation|   11|
|   Kakarantharaian|    1|
|         Metahuman|    2|
|Dathomirian Zabrak|    1|
|    Human / Cosmic|    2|
|   Xenomorph XX121|    1|
|            Animal|    4|
|       Frost Giant|    2|
|            Cyborg|   11|
|           Inhuman|    4|
|     God / Eternal|   14|
|    Yoda's species|    1|
|     Icthyo Sapien|    1|
|            Rodian|    1|
+------------------+-----+
only showing top 20 rows



In [26]:
#grouping with Skin_color
race_group = hero.groupby('Skin color')\
            .count()\
            .show()


+--------------+-----+
|    Skin color|count|
+--------------+-----+
|        orange|    1|
|    blue-white|    1|
|          grey|    5|
|         green|   21|
|        yellow|    2|
|orange / white|    1|
|        silver|    5|
|        purple|    3|
|         white|    7|
|          gray|    1|
|   red / black|    1|
|          pink|    2|
|           red|    9|
|          gold|    3|
|             -|  662|
|         black|    1|
|          blue|    9|
+--------------+-----+



### Sorting

In [42]:
#sorting with weight column descending order
hero_sort = hero.sort((hero.Weight).desc())\
            .show(5)


+---+--------------+------+---------+-----+-------------+------+-------------+----------+---------+------+
|_c0|          name|Gender|Eye color| Race|   Hair color|Height|    Publisher|Skin color|Alignment|Weight|
+---+--------------+------+---------+-----+-------------+------+-------------+----------+---------+------+
|137|Brother Voodoo|  Male|    brown|Human|Brown / White| 183.0|Marvel Comics|         -|     good|  99.0|
|187|   Cottonmouth|  Male|    brown|Human|        Black| 183.0|Marvel Comics|         -|      bad|  99.0|
|257|      Firelord|     -|    white|    -|       Yellow| 193.0|Marvel Comics|         -|     good|  99.0|
|279|   Ghost Rider|  Male|      red|Demon|      No Hair| 188.0|Marvel Comics|         -|     good|  99.0|
|394|     Kraven II|  Male|    brown|Human|        Black| 191.0|Marvel Comics|         -|      bad|  99.0|
+---+--------------+------+---------+-----+-------------+------+-------------+----------+---------+------+
only showing top 5 rows



In [43]:
# total number of DC Comics
hero_sort = hero.sort(hero.Publisher=='DC Comics')
hero_sort.count()

734

In [44]:
# total number of Marvel Comics
hero_sort = hero.sort(hero.Publisher=='Marvel Comics')
hero_sort.count()

734

##### Grouping

In [46]:
# grouping using Publisher
hero_groupby = hero.groupby('Publisher')\
                .count()\
                .show()

+-----------------+-----+
|        Publisher|count|
+-----------------+-----+
|        Rebellion|    1|
|    HarperCollins|    6|
| J. R. R. Tolkien|    1|
|        Star Trek|    6|
|    Marvel Comics|  388|
|        Wildstorm|    3|
|             null|   15|
|       South Park|    1|
|    Sony Pictures|    2|
|      Titan Books|    1|
|      ABC Studios|    4|
|             SyFy|    5|
|     Image Comics|   14|
|Universal Studios|    1|
|   IDW Publishing|    4|
|     NBC - Heroes|   19|
|    Hanna-Barbera|    1|
|        DC Comics|  215|
|        Microsoft|    1|
|    J. K. Rowling|    1|
+-----------------+-----+
only showing top 20 rows



### SQL

In [60]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)

In [61]:
# creating table
hero.registerTempTable('superhero_table')


In [62]:
sqlContext.sql('select * from superhero_table').show(5)


+---+-----------+------+---------+-----------------+----------+------+-----------------+----------+---------+------+
|_c0|       name|Gender|Eye color|             Race|Hair color|Height|        Publisher|Skin color|Alignment|Weight|
+---+-----------+------+---------+-----------------+----------+------+-----------------+----------+---------+------+
|  0|     A-Bomb|  Male|   yellow|            Human|   No Hair| 203.0|    Marvel Comics|         -|     good| 441.0|
|  1| Abe Sapien|  Male|     blue|    Icthyo Sapien|   No Hair| 191.0|Dark Horse Comics|      blue|     good|  65.0|
|  2|   Abin Sur|  Male|     blue|          Ungaran|   No Hair| 185.0|        DC Comics|       red|     good|  90.0|
|  3|Abomination|  Male|    green|Human / Radiation|   No Hair| 203.0|    Marvel Comics|         -|      bad| 441.0|
|  4|    Abraxas|  Male|     blue|    Cosmic Entity|     Black| -99.0|    Marvel Comics|         -|      bad| -99.0|
+---+-----------+------+---------+-----------------+----------+-

In [63]:
#selecting single column
sqlContext.sql('select Race from superhero_table').show(5)

+-----------------+
|             Race|
+-----------------+
|            Human|
|    Icthyo Sapien|
|          Ungaran|
|Human / Radiation|
|    Cosmic Entity|
+-----------------+
only showing top 5 rows



In [64]:
#selecting multiple column
sqlContext.sql('select Race, Height from superhero_table').show(5)

+-----------------+------+
|             Race|Height|
+-----------------+------+
|            Human| 203.0|
|    Icthyo Sapien| 191.0|
|          Ungaran| 185.0|
|Human / Radiation| 203.0|
|    Cosmic Entity| -99.0|
+-----------------+------+
only showing top 5 rows



In [83]:
# total unique in Race column
sqlContext.sql("select distinct(Race) from superhero_table").count()


62

In [84]:
# maximum weight
sqlContext.sql("select max(weight) from superhero_table").show()

+-----------+
|max(weight)|
+-----------+
|       99.0|
+-----------+

