In [1]:
text_data = sqlContext.read.format('text')\
    .options(header='true', inferSchema='true')\
    .load("../Spark-Example-Word-Count/20-news-same-line.txt")

In [2]:
text_data.show()

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

+--------------------+
|               value|
+--------------------+
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
|<doc id="20_newsg...|
+--------------------+
only showing top 20 rows



In [3]:
text_data.printSchema()

root
 |-- value: string (nullable = true)



In [4]:
text_data.first()

Row(value='<doc id="20_newsgroups/comp.graphics/37261" url="" title="20_newsgroups/comp.graphics/37261"> From: lipman@oasys.dt.navy.mil (Robert Lipman) Subject: CALL FOR PRESENTATIONS: Navy SciViz/VR Seminar Date: 19 Mar 93 20:10:23 GMT Distribution: usa Lines: 65   CALL FOR PRESENTATIONS  NAVY SCIENTIFIC VISUALIZATION AND VIRTUAL REALITY SEMINAR  Tuesday, June 22, 1993  Carderock Division, Naval Surface Warfare Center formerly the David Taylor Research Center) Bethesda, Maryland  SPONSOR: NESS (Navy Engineering Software System) is sponsoring a  one-day Navy Scientific Visualization and Virtual Reality Seminar.   The purpose of the seminar is to present and exchange information for Navy-related scientific visualization and virtual reality programs,  research, developments, and applications.  PRESENTATIONS: Presentations are solicited on all aspects of  Navy-related scientific visualization and virtual reality.  All  current work, works-in-progress, and proposed work by Navy  organizati

In [6]:
from pyspark.sql.functions import split, explode, count, desc

# Assuming your DataFrame is named 'text_data'
# Split the text_data column into words using spaces as the delimiter
words_df = text_data.select(split(text_data.value, " ").alias("words"))
# Explode the list of words into individual rows
exploded_words_df = words_df.select(explode(words_df.words).alias("word"))
exploded_words_df.show()

+--------------------+
|                word|
+--------------------+
|                <doc|
|id="20_newsgroups...|
|              url=""|
|title="20_newsgro...|
|               From:|
|lipman@oasys.dt.n...|
|             (Robert|
|             Lipman)|
|            Subject:|
|                CALL|
|                 FOR|
|      PRESENTATIONS:|
|                Navy|
|           SciViz/VR|
|             Seminar|
|               Date:|
|                  19|
|                 Mar|
|                  93|
|            20:10:23|
+--------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import split, explode, count, desc

# Assuming your DataFrame is named 'text_data'
# Split the text_data column into words using spaces as the delimiter
words_df = text_data.select(split(text_data.value, " ").alias("words"))

# Explode the list of words into individual rows
exploded_words_df = words_df.select(explode(words_df.words).alias("word"))

# Perform the Word Count
word_count_df = exploded_words_df.groupBy("word").agg(count("*").alias("count"))

# Show or save the results
word_count_df.show()




+--------------------+-----+
|                word|count|
+--------------------+-----+
|           Factsnet:|    3|
|               input|  364|
|                some|10015|
|               those| 4797|
|                 few| 2441|
|title="20_newsgro...|    1|
|                 UAB|    4|
|               still| 3206|
|                  By|  799|
|                 X\t|  343|
|                hope|  982|
|           standards|  208|
|            15:02:12|    1|
|           connected|  227|
|              travel|   84|
|            20:21:46|    1|
|   recently-released|    1|
|             from/to|    2|
|                 ...| 1237|
|title="20_newsgro...|    1|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [8]:
# Get the top 10 words with the highest counts
top_10_words = word_count_df.orderBy(desc("count")).limit(10)
top_10_words.show()



+----+------+
|word| count|
+----+------+
|    |909755|
| the|225468|
|  to|124251|
|  of|113153|
|   a| 98699|
| and| 93602|
|  is| 69306|
|   I| 69160|
|  in| 66165|
|that| 63147|
+----+------+



                                                                                

# Using withColumn() operation

In [18]:
# Assuming your DataFrame is named 'text_data'
# Split the text_data column into words using spaces as the delimiter
words_df = text_data.withColumn("words", split(text_data.value, " "))

# Explode the list of words into individual rows
exploded_words_df = words_df.select(explode(words_df.words).alias("word"))

# Perform the Word Count
word_count_df = exploded_words_df.groupBy("word").agg(count("*").alias("count"))

# Get the top 10 words with the highest counts
top_10_words = word_count_df.orderBy(desc("count")).limit(10)

# Show or save the top 10 results
top_10_words.show()




+----+------+
|word| count|
+----+------+
|    |909755|
| the|225468|
|  to|124251|
|  of|113153|
|   a| 98699|
| and| 93602|
|  is| 69306|
|   I| 69160|
|  in| 66165|
|that| 63147|
+----+------+



                                                                                