In [2]:
# Cell 1: Setup
"""
Test notebook for data ingestion
Run this to verify your setup is working correctly
"""
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

# Cell 2: Test Spark Setup
from config.spark_config import create_spark_session
spark = create_spark_session("TestIngestion")
print(f"Spark version: {spark.version}")
print(f"Spark UI available at: http://localhost:4040")

# Cell 3: Test Data Loading
from src.spark.data_ingestion import DataIngestion

class DataIngestion:
    def __init__(self, spark):
        self.spark = spark


    def load_sentiment140_data(self, path):
        """
        Load the Sentiment140 dataset from the specified CSV file.
        Parameters:
        path (str): Path to the CSV file.
                Returns:
        DataFrame: Spark DataFrame containing the loaded data.
        """
        df = self.spark.read.csv(
            path,
            header=False,
            inferSchema=True,
            encoding='iso-8859-1',
            multiLine=True,
            escape='"'
        )
        # Further processing here
        return df

ingestion = DataIngestion(spark)

# Load sample data
data_path = "../data/raw/training.1600000.processed.noemoticon.csv"
df = ingestion.load_sentiment140_data(data_path)
print(f"Loaded {df.count()} records")

# Cell 4: Show Data Schema
df.printSchema()

# Cell 5: Sample Data
# df.select("text", "sentiment", "timestamp").show(10, truncate=False)
df.select(df["_c5"].alias("text"), df["_c0"].alias("sentiment"), df["_c2"].alias("timestamp")).show(10, truncate=False)

# Cell 6: Sentiment Distribution
df.groupBy("_c4").count().show()

# Cell 7: Time Distribution
# df.groupBy("year", "month").count().orderBy("year", "month").show()

INFO:config.spark_config:Creating Spark session with driver memory: 12g
INFO:config.spark_config:Spark session created successfully
INFO:config.spark_config:Spark version: 3.3.4
INFO:config.spark_config:Driver memory: 12g
INFO:config.spark_config:Executor memory: 8g


Spark version: 3.3.4
Spark UI available at: http://localhost:4040


                                                                                

Loaded 1600000 records
root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)

+-------------------------------------------------------------------------------------------------------------------+---------------+----------------------------+
|text                                                                                                               |sentiment      |timestamp                   |
+-------------------------------------------------------------------------------------------------------------------+---------------+----------------------------+
|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|_TheSpecialOne_|Mon Apr 06 22:19:45 PDT 2009|
|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Bla

[Stage 15:>                                                         (0 + 1) / 1]

+---------------+-----+
|            _c4|count|
+---------------+-----+
|     megan_rice|   15|
|     LeeseEllen|    2|
|      dEnNy_333|    1|
|        lisha_e|    2|
|        Daniiej|    3|
|       cmsebest|    1|
|         MeghTW|    1|
|   candicebunny|    1|
|stranger_danger|   14|
|  divingkid2001|    1|
| BIGBANGkrystal|    2|
|   annelisebaer|    1|
|    Lilli_Allen|    1|
| anthonypending|    1|
|        caaaami|    1|
|      ffmusicdj|    4|
|  prasannathani|   16|
|          Colsi|    1|
|       J_Moneyy|    7|
|        SoEdith|    5|
+---------------+-----+
only showing top 20 rows



                                                                                