In [None]:
pip install pyspark

In [None]:
# Required imports
import sys
from operator import add
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark import SparkContext
from pyspark.streaming import StreamingContext


In [None]:
# Function to process streaming data and generate word counts
def process_stream(rdd):
    """Count and plot the most frequent words of one streaming micro-batch.

    This function is registered with ``DStream.foreachRDD``, so ``rdd`` is
    normally an RDD of raw text lines. An RDD has no DataFrame methods, so
    the lines are first wrapped in a one-column DataFrame, tokenized and
    stripped of stop words. An object that already exposes ``withColumn``
    (a DataFrame carrying a ``words_clean`` array column) is counted as-is.

    Args:
        rdd: The micro-batch to process. Empty batches are skipped.

    Returns:
        None. The word counts are printed and a bar chart of the ten most
        frequent words is shown.
    """
    if rdd.isEmpty():
        return

    if hasattr(rdd, 'withColumn'):
        # Already a DataFrame; expected to contain a 'words_clean' column.
        batch_clean = rdd
    else:
        session = SparkSession.builder.getOrCreate()
        # assumes every line is plain lyrics text -- TODO confirm the format
        # of the files dropped into the streaming directory
        batch_lines = session.createDataFrame(
            rdd.map(lambda line: (line,)), ['Lyrics']
        )
        batch_tokens = Tokenizer(
            inputCol='Lyrics', outputCol='words_token'
        ).transform(batch_lines)
        batch_clean = StopWordsRemover(
            inputCol='words_token', outputCol='words_clean'
        ).transform(batch_tokens)

    # One row per word occurrence, then count and order by frequency.
    word_counts = batch_clean.withColumn('word', F.explode(F.col('words_clean'))) \
                             .groupBy('word') \
                             .count() \
                             .sort('count', ascending=False)

    # Show the result
    word_counts.show()

    # Visualize word counts
    word_counts_pd = word_counts.limit(10).toPandas()  # Limit to top 10 words for visualization
    plt.figure(figsize=(10, 6))
    plt.bar(word_counts_pd['word'], word_counts_pd['count'])
    plt.xlabel('Words')
    plt.ylabel('Count')
    plt.title('Top 10 Words in Billboard Songs')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Initialize SparkContext and StreamingContext.
# The context is obtained through getOrCreate so that re-running this cell
# reuses the live SparkContext instead of failing with
# "Cannot run multiple SparkContexts at once".
sc = SparkSession.builder.appName("RealTimeWordCount").getOrCreate().sparkContext
ssc = StreamingContext(sc, 5)  # 5-second micro-batch interval

In [None]:
# Get (or create) the SparkSession used by the DataFrame cells below
spark = (
    SparkSession.builder
    .appName("PythonWordCount")
    .getOrCreate()
)

In [None]:
# Load the lyrics CSV: first row is the header, column types are inferred
data = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('billboard_lyrics_1964-2015.csv')
)
print('############ CSV extract:')
data.show()

In [None]:
# Split the Lyrics column into a 'words_token' array, keeping only Rank next to it
tokenizer = Tokenizer(inputCol="Lyrics", outputCol="words_token")
tokenized = (
    tokenizer
    .transform(data)
    .select('Rank', 'words_token')
)

print('############ Tokenized data extract:')
tokenized.show(truncate=False)

In [None]:
# Filter stop words out of 'words_token' into a new 'words_clean' column
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
data_clean = (
    remover
    .transform(tokenized)
    .select('Rank', 'words_clean')
)

print('############ Data Cleaning extract:')
data_clean.show(truncate=False)

In [None]:
# Final word count: one row per word occurrence, grouped and ordered by frequency
result = (
    data_clean
    .withColumn('word', F.explode('words_clean'))
    .groupBy('word')
    .count()
    .orderBy(F.desc('count'))
)

print('############ Final word count:')
result.show()

In [None]:
# Visualize word counts as a bar chart of the ten most frequent words.
# matplotlib.pyplot is already imported as plt in the imports cell, so it is
# not re-imported here.
result_pd = result.limit(10).toPandas()  # Limit to top 10 words for visualization

# Explicit Figure/Axes interface instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(result_pd['word'], result_pd['count'])
ax.set_xlabel('Words')
ax.set_ylabel('Count')
ax.set_title('Top 10 Words in Billboard Songs')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Build a DStream of text lines from files that appear in the chunk directory
data_dir = "./data_chunks"
stream = ssc.textFileStream(directory=data_dir)

In [None]:
# Register process_stream as the per-batch callback: it is invoked with the
# RDD of every micro-batch produced by `stream`. Nothing is processed until
# the streaming context is started.
stream.foreachRDD(process_stream)

In [None]:
# Start the streaming computation; from here on the callback registered on
# `stream` runs for each micro-batch.
ssc.start()

In [None]:
# Wait for streaming to finish.
# This call blocks the cell until the stream terminates or the kernel is
# interrupted. Either way the streaming context is stopped afterwards, while
# the SparkContext is kept alive so `spark` stays usable in later cells.
try:
    ssc.awaitTermination()
finally:
    ssc.stop(stopSparkContext=False)