In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import date, timedelta, datetime
import time

### Initialize spark session

In [2]:
sc = SparkSession.builder.appName("Example1")\
     .config('spark.sql.shuffle.partitions', '50')\
     .config('spark.driver.maxResultSize', '5g')\
     .config('spark.sql.execution.arrow.enabled', 'true')\
     .getOrCreate()

In [3]:
dataframe = sc.read.csv('data/bbc-text.csv', header=True)

In [4]:
dataframe.show(5)

+-------------+--------------------+
|     category|                text|
+-------------+--------------------+
|         tech|tv future in the ...|
|     business|worldcom boss  le...|
|        sport|tigers wary of fa...|
|        sport|yeading face newc...|
|entertainment|ocean s twelve ra...|
+-------------+--------------------+
only showing top 5 rows



In [14]:
dataframe.columns

['category', 'text']

In [15]:
dataframe_drp = dataframe.dropDuplicates()
dataframe_drp.show(5)

+-------------+--------------------+
|     category|                text|
+-------------+--------------------+
|         tech|web radio takes s...|
|        sport|chelsea denied by...|
|        sport|keegan hails come...|
|     business|brewers  profits ...|
|entertainment|keanu reeves give...|
+-------------+--------------------+
only showing top 5 rows



In [17]:
dataframe.select('category').show(10)

+-------------+
|     category|
+-------------+
|         tech|
|     business|
|        sport|
|        sport|
|entertainment|
|     politics|
|     politics|
|        sport|
|        sport|
|entertainment|
+-------------+
only showing top 10 rows



In [18]:
dataframe.select('category', when(dataframe.category != 'entertainment', 1).otherwise(0)).show(10)

+-------------+------------------------------------------------------------+
|     category|CASE WHEN (NOT (category = entertainment)) THEN 1 ELSE 0 END|
+-------------+------------------------------------------------------------+
|         tech|                                                           1|
|     business|                                                           1|
|        sport|                                                           1|
|        sport|                                                           1|
|entertainment|                                                           0|
|     politics|                                                           1|
|     politics|                                                           1|
|        sport|                                                           1|
|        sport|                                                           1|
|entertainment|                                                           0|

In [19]:
dataframe[dataframe.category.isin('tech', 'business')].show(10)

+--------+--------------------+
|category|                text|
+--------+--------------------+
|    tech|tv future in the ...|
|business|worldcom boss  le...|
|business|virgin blue share...|
|business|crude oil prices ...|
|business|s korean credit c...|
|business|japanese banking ...|
|    tech|games maker fight...|
|    tech|halo 2 heralds tr...|
|    tech|mobile audio ente...|
+--------+--------------------+
only showing top 10 rows



In [21]:
dataframe.select('category', 'text', dataframe.text.like('% wars %')).show(10)

+-------------+--------------------+------------------+
|     category|                text|text LIKE % wars %|
+-------------+--------------------+------------------+
|         tech|tv future in the ...|             false|
|     business|worldcom boss  le...|             false|
|        sport|tigers wary of fa...|             false|
|        sport|yeading face newc...|             false|
|entertainment|ocean s twelve ra...|             false|
|     politics|howard hits back ...|             false|
|     politics|blair prepares to...|             false|
|        sport|henman hopes ende...|             false|
|        sport|wilkinson fit to ...|             false|
|entertainment|last star wars  n...|              true|
+-------------+--------------------+------------------+
only showing top 10 rows



In [22]:
dataframe.select('category', 'text', dataframe.text.startswith('tv')).show(5)

+-------------+--------------------+--------------------+
|     category|                text|startswith(text, tv)|
+-------------+--------------------+--------------------+
|         tech|tv future in the ...|                true|
|     business|worldcom boss  le...|               false|
|        sport|tigers wary of fa...|               false|
|        sport|yeading face newc...|               false|
|entertainment|ocean s twelve ra...|               false|
+-------------+--------------------+--------------------+
only showing top 5 rows



In [25]:
dataframe.select('category', 'text', dataframe.text.endswith('january.')).show(5)

+-------------+--------------------+------------------------+
|     category|                text|endswith(text, january.)|
+-------------+--------------------+------------------------+
|         tech|tv future in the ...|                   false|
|     business|worldcom boss  le...|                   false|
|        sport|tigers wary of fa...|                   false|
|        sport|yeading face newc...|                    true|
|entertainment|ocean s twelve ra...|                   false|
+-------------+--------------------+------------------------+
only showing top 5 rows



In [26]:
dataframe.select(dataframe.text.substr(1, 15)).show(5)

+----------------------+
|substring(text, 1, 15)|
+----------------------+
|       tv future in th|
|       worldcom boss  |
|       tigers wary of |
|       yeading face ne|
|       ocean s twelve |
+----------------------+
only showing top 5 rows



In [5]:
dataframe.dtypes

[('category', 'string'), ('text', 'string')]

In [6]:
dataframe.head()

Row(category='tech', text='tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are a

In [7]:
dataframe.first()

Row(category='tech', text='tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are a

In [8]:
dataframe.take(2)

[Row(category='tech', text='tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are 

In [9]:
dataframe.describe().show()

+-------+--------+--------------------+
|summary|category|                text|
+-------+--------+--------------------+
|  count|    2225|                2225|
|   mean|    null|                null|
| stddev|    null|                null|
|    min|business|$1m payoff for fo...|
|    max|    tech|zambia confident ...|
+-------+--------+--------------------+



In [10]:
dataframe.count()

2225

In [11]:
dataframe.distinct().count()

2126

In [14]:
dataframe.explain()

== Physical Plan ==
FileScan csv [category#16,text#17] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/chamath/Documents/PyProjects/Exercises/PySpark exercises/bb..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<category:string,text:string>




In [16]:
dataframe = dataframe.withColumn('new_column', lit('A new column'))
dataframe.show(5)

+-------------+--------------------+------------+
|     category|                text|  new_column|
+-------------+--------------------+------------+
|         tech|tv future in the ...|A new column|
|     business|worldcom boss  le...|A new column|
|        sport|tigers wary of fa...|A new column|
|        sport|yeading face newc...|A new column|
|entertainment|ocean s twelve ra...|A new column|
+-------------+--------------------+------------+
only showing top 5 rows



In [17]:
dataframe = dataframe.withColumnRenamed('new_column', 'New_column')
dataframe.show(5)

+-------------+--------------------+------------+
|     category|                text|  New_column|
+-------------+--------------------+------------+
|         tech|tv future in the ...|A new column|
|     business|worldcom boss  le...|A new column|
|        sport|tigers wary of fa...|A new column|
|        sport|yeading face newc...|A new column|
|entertainment|ocean s twelve ra...|A new column|
+-------------+--------------------+------------+
only showing top 5 rows



In [21]:
#dataframe = dataframe.drop('New_column').show(5)

In [22]:
dataframe = sc.read.csv('bbc-text.csv', header=True)

In [23]:
dataframe.show(5)

+-------------+--------------------+
|     category|                text|
+-------------+--------------------+
|         tech|tv future in the ...|
|     business|worldcom boss  le...|
|        sport|tigers wary of fa...|
|        sport|yeading face newc...|
|entertainment|ocean s twelve ra...|
+-------------+--------------------+
only showing top 5 rows



In [24]:
dataframe.groupBy('category').count().show(5)

+-------------+-----+
|     category|count|
+-------------+-----+
|entertainment|  386|
|        sport|  511|
|     business|  510|
|     politics|  417|
|         tech|  401|
+-------------+-----+



In [27]:
dataframe.createOrReplaceTempView('bbc_text')

In [26]:
#dataframe.registerTempTable('df')



In [28]:
sc.sql('SELECT * FROM bbc_text').show(3)

+--------+--------------------+
|category|                text|
+--------+--------------------+
|    tech|tv future in the ...|
|business|worldcom boss  le...|
|   sport|tigers wary of fa...|
+--------+--------------------+
only showing top 3 rows



In [32]:
"""sc.sql("SELECT \
        CASE WHEN text LIKE %UK% THEN 'UK' \
        WHEN text LIKE %US% THEN 'US' \
        ELSE 'Other' \
        END Themes \
        FROM bbc_text").groupBy('Themes').count().show()"""

'sc.sql("SELECT         CASE WHEN text LIKE %UK% THEN \'UK\'         WHEN text LIKE %US% THEN \'US\'         ELSE \'Other\'         END Themes         FROM bbc_text").groupBy(\'Themes\').count().show()'

In [33]:
rdd_convert = dataframe.rdd

In [34]:
dataframe.toJSON().first()

'{"category":"tech","text":"tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are 

In [35]:
dataframe.toPandas()

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [None]:
# Write & Save File in .json format
#dataframe.select("author", "title") \.write \.save("Authors_Titles.json",format="json")

In [36]:
dataframe.select('category', 'text').write.save('bbc_text.parquet')

In [37]:
sc.stop()