In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
from pyspark.sql import SparkSession
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
import re
from tqdm.notebook import tqdm
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from IPython.display import clear_output

# Fetch the NLTK resources this notebook relies on: the "punkt_tab" tokenizer
# data and the "stopwords" corpus.
for nltk_resource in ("punkt_tab", "stopwords"):
    nltk.download(nltk_resource)

# Build (or reuse) a local Spark session that uses every available core
# ("local[*]") and gives the driver 4 GB of memory.
spark_builder = SparkSession.builder.master("local[*]")
spark_builder = spark_builder.appName("Structured-Streaming")
spark_builder = spark_builder.config("spark.driver.memory", "4g")
spark = spark_builder.getOrCreate()

# 1- Embeddings and Helper Functions

## Load GloVe Embeddings

In [None]:
# Word -> vector lookup built from the GloVe text file. Each line holds a
# token followed by its whitespace-separated float components.
embeddings_index = {}

with open("data/glove.6B.50d.txt", "r", encoding="utf8") as glove_file:
    embeddings_index.update(
        (fields[0], np.asarray(fields[1:], dtype="float32"))
        for fields in map(str.split, glove_file)
    )

## Normalization function for cleaning and tokenization

In [None]:
# Lazily built cache of the English stop words, shared by every call so the
# corpus is read only once instead of once per document.
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def normalize_text(text):
    """Clean, lowercase, and tokenize ``text``, dropping English stop words.

    Args:
        text: Raw document text. Anything that is not a ``str`` (for example
            ``None`` or a float NaN standing in for a missing value) is
            treated as an empty document.

    Returns:
        A list of lowercase tokens with stop words removed; an empty list
        when ``text`` is not a string.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if not isinstance(text, str):
        return []
    # Replace line breaks, URLs, and any character that is not alphanumeric
    # or whitespace with a space, then lowercase.
    cleaned = re.sub(r"[\r\n]+|https?://\S+|[^a-zA-Z0-9\s]", " ", text).lower()
    stop_words = _english_stop_words()
    return [token for token in word_tokenize(cleaned) if token not in stop_words]

## Sentence Embeddings

In [None]:
def sentence_embedding(text, embeddings_index, embedding_dim=50):
    """Embed ``text`` as the mean of its tokens' word vectors.

    Args:
        text: Raw text; it is tokenized with ``normalize_text``.
        embeddings_index: Mapping from token to embedding vector.
        embedding_dim: Length of the zero vector used for tokens missing from
            ``embeddings_index`` and for texts that yield no tokens.

    Returns:
        The element-wise mean of the token vectors, or a zero vector of
        length ``embedding_dim`` when there are no tokens.
    """
    # Out-of-vocabulary tokens contribute an all-zero vector to the average.
    oov_vector = np.zeros(embedding_dim)
    token_vectors = []
    for token in normalize_text(text):
        token_vectors.append(embeddings_index.get(token, oov_vector))

    if not token_vectors:
        return oov_vector
    return np.mean(token_vectors, axis=0)

# 2- Train Logistic Regression for Classification

## Read IMDB Sentiment Dataset

In [None]:
# Load the IMDB training data from Parquet into a pandas DataFrame.
# Later cells read its "text" and "label" columns.
imdb_dataset = pd.read_parquet("data/train-imdb.parquet")

# Preview the first rows; as the cell's last expression this is its output.
imdb_dataset.head()

In [None]:
# Register tqdm with pandas so that Series.progress_apply is available
tqdm.pandas(desc="Creating embeddings")


def _embed_review(review_text):
    """Embed a single review using the module-level GloVe lookup."""
    return sentence_embedding(review_text, embeddings_index)


# Compute one sentence embedding per row of the 'text' column, showing a
# progress bar, and store the vectors in a new 'embedding' column
imdb_dataset["embedding"] = imdb_dataset["text"].progress_apply(_embed_review)

imdb_dataset.head()

In [None]:
# Stack the per-row embedding vectors into a single 2D feature array
# (one row per review) and take the labels as the target vector.
X = np.stack(imdb_dataset["embedding"].values)
y = imdb_dataset["label"].values

# Hold out 20% of the rows for testing. stratify=y keeps the label
# proportions the same in both splits; random_state=42 makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit a logistic-regression classifier on the training embeddings.
# `model` is read later by the evaluation cell and by process_batch.
model = LogisticRegression(n_jobs=-1, max_iter=100).fit(X_train, y_train)

# Optionally you can save the model using joblib or pickle
# (joblib is not imported in the import cell shown above; add `import joblib` first)
# joblib.dump(model, 'data/logistic_regression_model.pkl')

In [None]:
# Predict labels for the held-out split
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

# Report overall accuracy followed by the per-class metrics
print(f"Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(test_report)

# 3- Spark Structured Streaming for Text Classification
## Plotting and Embedding Functions

In [None]:
# Per-batch prediction counts accumulated over the stream. process_batch
# appends to these lists and plot_sentiment reads them for the trend chart.
batch_pos_counts, batch_neg_counts, batch_ids = [], [], []


def plot_sentiment(pos_count, neg_count, batch_id):
    """Redraw the sentiment dashboard for the most recent batch.

    The left panel plots the positive/negative counts of every batch recorded
    so far in the module-level ``batch_ids``, ``batch_pos_counts`` and
    ``batch_neg_counts`` lists. The right panel is a pie chart of the current
    batch. The previous cell output is cleared before the new figure is shown.

    Args:
        pos_count: Number of positive predictions in the current batch.
        neg_count: Number of negative predictions in the current batch.
        batch_id: Identifier of the current batch, used in the pie title.
    """
    fig, (trend_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: counts per batch over the whole run so far
    trend_ax.plot(batch_ids, batch_pos_counts, "g-o", label="Positive")
    trend_ax.plot(batch_ids, batch_neg_counts, "r-o", label="Negative")
    trend_ax.set_xlabel("Batch")
    trend_ax.set_ylabel("Count")
    trend_ax.set_title("Sentiment Analysis Results by Batch")
    trend_ax.legend()
    trend_ax.grid(True)
    trend_ax.set_ylim(bottom=0)

    # Right panel: split of the current batch. A pie chart is undefined when
    # both counts are zero, so show a placeholder in that case.
    if pos_count + neg_count > 0:
        pie_ax.pie(
            [pos_count, neg_count],
            labels=["Positive", "Negative"],
            colors=["green", "red"],
            autopct="%1.1f%%",
            startangle=90,
        )
        pie_ax.axis("equal")  # Equal aspect ratio ensures the pie is circular
    else:
        pie_ax.text(0.5, 0.5, "No predictions", ha="center", va="center")
        pie_ax.axis("off")
    pie_ax.set_title(f"Batch {batch_id} Sentiment Distribution")

    fig.tight_layout()

    # Replace the previous batch's output with the new figure
    clear_output(wait=True)
    plt.show()
    # One figure is created per batch; close it so figures do not accumulate
    # for the lifetime of the stream.
    plt.close(fig)

def process_batch(batch_df, batch_id):
    """Classify one micro-batch of text and refresh the sentiment plots.

    Intended as the ``foreachBatch`` callback of the streaming query. The
    batch is collected to pandas, each text is embedded with
    ``sentence_embedding``, and the module-level ``model`` predicts a label
    per row. The counts of predictions equal to 1 (positive) and 0
    (negative) are appended to the module-level history lists and plotted.

    Args:
        batch_df: Spark DataFrame for the micro-batch; must have a ``text``
            column.
        batch_id: Identifier of the micro-batch supplied by Spark.
    """
    pdf = batch_df.toPandas()
    # The stream schema declares "text" as nullable; a null value would make
    # the text normalization raise and terminate the query, so drop such rows.
    pdf = pdf.dropna(subset=["text"])
    if pdf.empty:
        print(f"Batch {batch_id}: Empty batch")
        return

    # Vectorize and predict
    embeddings = [sentence_embedding(text, embeddings_index) for text in pdf["text"]]
    X_batch = np.stack(embeddings)
    y_pred = model.predict(X_batch)

    # Count positives and negatives as plain ints rather than NumPy scalars
    pos_count = int(np.sum(y_pred == 1))
    neg_count = int(np.sum(y_pred == 0))
    batch_pos_counts.append(pos_count)
    batch_neg_counts.append(neg_count)
    batch_ids.append(batch_id)

    # Create and display plots
    plot_sentiment(pos_count, neg_count, batch_id)

## Spark Structured Streaming

In [None]:
# Define the streaming source: Parquet files under data/yelp/ with a single
# nullable string column "text", read one file per trigger.
yelp_stream = (
    spark.readStream.format("parquet")
    .schema(StructType([StructField("text", StringType(), True)]))
    .option("maxFilesPerTrigger", 1)
    .load("data/yelp/")
)

# Start the streaming query; every micro-batch is handed to process_batch
query = (
    yelp_stream.writeStream
    .foreachBatch(process_batch)
    .outputMode("append")
    .start()
)

try:
    # With a timeout, awaitTermination returns whether the query terminated
    # within that time. It does not stop a query that is still running.
    if query.awaitTermination(timeout=600):
        print("Query terminated before the timeout")
    else:
        query.stop()
        print("Query terminated after timeout")

except Exception as e:
    print(f"Query terminated due to: {e}")

finally:
    # Also covers a kernel interrupt: never leave the stream running in the
    # background after this cell finishes.
    if query.isActive:
        query.stop()