In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Reuse the SparkContext that is already running in this kernel, or start one
# if none exists. A bare SparkContext() raises "Cannot run multiple
# SparkContexts at once" when this cell is executed a second time, so
# getOrCreate() keeps the cell safe to re-run.
sc = SparkContext.getOrCreate()

# SQLContext is the entry point used below for createDataFrame() and read.csv().
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession.builder.getOrCreate() — consider migrating.
sqlContext = SQLContext(sc)


### Making a DataFrame from an RDD

#### Method 1

In [16]:
from pyspark.sql import Row

# Sample records: one (name, age) tuple per person.
l = [
    ('Ankit', 25),
    ('Jalfaizy', 22),
    ('saurabh', 20),
    ('Bala', 26),
]

# Distribute the list across the cluster as an RDD.
rdd = sc.parallelize(l)

# Turn every tuple into a Row with named fields; the age is coerced to int.
people = rdd.map(lambda record: Row(name=record[0], age=int(record[1])))

# Hand the RDD of Rows to sqlContext.createDataFrame, then print the result.
schemaPeople = sqlContext.createDataFrame(people)
schemaPeople.show()


+--------+---+
|    name|age|
+--------+---+
|   Ankit| 25|
|Jalfaizy| 22|
| saurabh| 20|
|    Bala| 26|
+--------+---+



#### Method 2

In [17]:
from pyspark.sql import Row

# Build an RDD whose elements are already Row objects, so that toDF() can be
# called on it directly without an intermediate map().
rdd = sc.parallelize([
    Row(name='Ankit', age=25),
    Row(name='Jalfaizy', age=22),
    Row(name='saurabh', age=20),
    Row(name='Bala', age=26),
])

# Convert the RDD created in this cell. Calling toDF() on `rdd` (rather than on
# `people` from the Method 1 cell) keeps this cell self-contained and makes the
# RDD above actually used.
schemaPeople = rdd.toDF()
schemaPeople.show()


+--------+---+
|    name|age|
+--------+---+
|   Ankit| 25|
|Jalfaizy| 22|
| saurabh| 20|
|    Bala| 26|
+--------+---+



#### Method 3

In [18]:
# Plain (name, age) tuples — no Row objects this time.
l = [
    ('Ankit', 25),
    ('Jalfaizy', 22),
    ('saurabh', 20),
    ('Bala', 26),
]

# Parallelize the tuples and name the two resulting columns "a" and "b".
rdd = sc.parallelize(l)
df = rdd.toDF(schema=["a", "b"])


In [19]:
# Print the DataFrame built from the tuple RDD (columns "a" and "b").
df.show()


+--------+---+
|       a|  b|
+--------+---+
|   Ankit| 25|
|Jalfaizy| 22|
| saurabh| 20|
|    Bala| 26|
+--------+---+



### Creating a DataFrame from a CSV file

In [21]:
# Load the commentary CSV into a DataFrame. header=True takes the column names
# from the first line of the file. No schema is supplied, and printSchema()
# below reports every column as string.
# NOTE(review): this is an absolute, machine-specific path — consider moving it
# to a configurable data directory.
train = sqlContext.read.csv(
    '/home/hasan/DATA SET/ind-ban-comment.csv',
    header=True,
)


In [23]:
# Preview the first 5 rows of the loaded CSV (long values are shown truncated).
train.show(5)


+-------+-----------------+------+-----------------+--------------------+------+---------+---+------+----------+--------+----+----+-------------------+
|Batsman|     Batsman_Name|Bowler|      Bowler_Name|          Commentary|Detail|Dismissed| Id|Isball|Isboundary|Iswicket|Over|Runs|          Timestamp|
+-------+-----------------+------+-----------------+--------------------+------+---------+---+------+----------+--------+----+----+-------------------+
|  28994|   Mohammed Shami| 63881|Mustafizur Rahman|OUT! Bowled! 5-fe...|     W|    28994|346|  True|      null|       1|49.6|   0|2019-07-02 13:18:47|
|   5132|Bhuvneshwar Kumar| 63881|Mustafizur Rahman|WIDE AND RUN OUT!...|  W+wd|     5132|344|  True|      null|       1|49.6|   1|2019-07-02 13:17:28|
|  28994|   Mohammed Shami| 63881|Mustafizur Rahman|Back of a length ...|  null|     null|343|  True|      null|    null|49.5|   1|2019-07-02 13:16:03|
|   5132|Bhuvneshwar Kumar| 63881|Mustafizur Rahman|Just 1 run off th...|  null|     nul

### Some Built-in Functions

In [24]:
# Print each column's name, type and nullability as a tree.
train.printSchema()


root
 |-- Batsman: string (nullable = true)
 |-- Batsman_Name: string (nullable = true)
 |-- Bowler: string (nullable = true)
 |-- Bowler_Name: string (nullable = true)
 |-- Commentary: string (nullable = true)
 |-- Detail: string (nullable = true)
 |-- Dismissed: string (nullable = true)
 |-- Id: string (nullable = true)
 |-- Isball: string (nullable = true)
 |-- Isboundary: string (nullable = true)
 |-- Iswicket: string (nullable = true)
 |-- Over: string (nullable = true)
 |-- Runs: string (nullable = true)
 |-- Timestamp: string (nullable = true)



In [25]:
# Return the first 5 rows as a list of Row objects, with values untruncated.
train.head(5)


[Row(Batsman='28994', Batsman_Name='Mohammed Shami', Bowler='63881', Bowler_Name='Mustafizur Rahman', Commentary='OUT! Bowled! 5-fer to finish a tremendous last over. His 4th 5-wicket haul in ODIs. Around off, Shami moves across to paddle but misses the ball hits his pads and goes onto hit the stumps. 2 wickets and just 3 runs from the final over. Top notch from Mustafizur. INDIA FINISH WITH 314/9 FROM THEIR 50 OVERS.', Detail='W', Dismissed='28994', Id='346', Isball='True', Isboundary=None, Iswicket='1', Over='49.6', Runs='0', Timestamp='2019-07-02 13:18:47'),
 Row(Batsman='5132', Batsman_Name='Bhuvneshwar Kumar', Bowler='63881', Bowler_Name='Mustafizur Rahman', Commentary="WIDE AND RUN OUT! Slower delivery outside off, it is on the wrong side of the tramline. Shami comes for a run. Bhuvi was slow though. Rahim throws it to Mustafizur, who hits the stumps at the bowler's end. The umpire takes it upstairs but Bhuvneshwar had started to walk back even before the replays rolled in.", Det

In [26]:
# Print only the first 2 rows, with long cell values truncated.
train.show(n=2, truncate=True)


+-------+-----------------+------+-----------------+--------------------+------+---------+---+------+----------+--------+----+----+-------------------+
|Batsman|     Batsman_Name|Bowler|      Bowler_Name|          Commentary|Detail|Dismissed| Id|Isball|Isboundary|Iswicket|Over|Runs|          Timestamp|
+-------+-----------------+------+-----------------+--------------------+------+---------+---+------+----------+--------+----+----+-------------------+
|  28994|   Mohammed Shami| 63881|Mustafizur Rahman|OUT! Bowled! 5-fe...|     W|    28994|346|  True|      null|       1|49.6|   0|2019-07-02 13:18:47|
|   5132|Bhuvneshwar Kumar| 63881|Mustafizur Rahman|WIDE AND RUN OUT!...|  W+wd|     5132|344|  True|      null|       1|49.6|   1|2019-07-02 13:17:28|
+-------+-----------------+------+-----------------+--------------------+------+---------+---+------+----------+--------+----+----+-------------------+
only showing top 2 rows



In [27]:
# Number of rows in train.
train.count()


605

In [28]:
# Number of columns in train.
len(train.columns)


14

In [29]:
# Summary statistics (count, mean, ...) for every column of train.
train.describe().show()


+-------+------------------+-----------------+------------------+-----------------+--------------------+------+------------------+-----------------+------+----------+--------+------------------+------------------+-------------------+
|summary|           Batsman|     Batsman_Name|            Bowler|      Bowler_Name|          Commentary|Detail|         Dismissed|               Id|Isball|Isboundary|Iswicket|              Over|              Runs|          Timestamp|
+-------+------------------+-----------------+------------------+-----------------+--------------------+------+------------------+-----------------+------+----------+--------+------------------+------------------+-------------------+
|  count|               605|              605|               605|              605|                 605|    40|                19|              605|   605|        67|      19|               605|               605|                605|
|   mean|31971.652892561982|             null| 35304.43636363636

In [30]:
# Summary statistics for the Runs column only.
# NOTE(review): Runs was read as a string column (see printSchema), yet a
# numeric mean/stddev is reported; min/max are presumably compared as strings —
# verify, or cast the column to a numeric type first.
train.describe('Runs').show()


+-------+------------------+
|summary|              Runs|
+-------+------------------+
|  count|               605|
|   mean|0.9917355371900827|
| stddev| 1.342725481259329|
|    min|                 0|
|    max|                 6|
+-------+------------------+



In [31]:
# Keep only the Batsman and Runs columns and preview the first 5 rows.
train.select('Batsman','Runs').show(5)


+-------+----+
|Batsman|Runs|
+-------+----+
|  28994|   0|
|   5132|   1|
|  28994|   1|
|   5132|   1|
|   3676|   0|
+-------+----+
only showing top 5 rows



In [32]:
# Count how many distinct values the Batsman column contains.
train.select('Batsman').distinct().count()


20