In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.conf import SparkConf
import pyspark

def parseInput(line):
    """Turn one pipe-delimited text record into a ``Row``.

    The record is split on ``'|'`` and the first five pieces are mapped, in
    order, to ``user_id`` (int), ``age`` (int), ``gender``, ``occupation``
    and ``zip`` (the last three are kept as strings).

    Raises:
        ValueError: if the first or second piece is not an integer literal.
        IndexError: if the record has fewer than five pieces.
    """
    parts = line.split('|')
    record = {
        'user_id': int(parts[0]),
        'age': int(parts[1]),
        'gender': parts[2],
        'occupation': parts[3],
        'zip': parts[4],
    }
    return Row(**record)

# SOLUTION 1: Stop any existing Spark sessions first
# Best-effort cleanup so the session builder below starts from a clean state.
try:
    # `spark` is only defined if a session was already created in this kernel.
    if 'spark' in globals():
        spark.stop()

    # Also stop a SparkContext that is still registered as active
    # (e.g. one created directly, without a `spark` session variable).
    sc = pyspark.SparkContext._active_spark_context
    if sc is not None:
        sc.stop()
except Exception as exc:
    # Keep the cleanup best-effort, but report the failure instead of hiding it.
    # Catching `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate normally.
    print(f"Warning: could not stop existing Spark session/context: {exc}")

# SOLUTION 2: Start with local mode first (easier for development)
# Configure Spark for local mode with Cassandra connection settings.
# NOTE(review): only the Cassandra host/port options are set here; no connector
# package/jar is configured in this cell — confirm it is supplied elsewhere
# before reading or writing Cassandra tables.
spark = (
    SparkSession.builder
    .appName("CassandraIntegration")
    .master("local[*]")
    .config("spark.cassandra.connection.host", "192.168.56.101")
    .config("spark.cassandra.connection.port", "9042")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Sanity check: confirm the session started and show which master it uses.
# (These prints only inspect the local session; they do not query Cassandra.)
print(f"Spark Version: {spark.version}")
print(f"Spark Master: {spark.sparkContext.master}")

# Your data processing code here
# df = spark.read.text("your_data_file.txt")   # one string column named "value"
# parsed_df = df.rdd.map(lambda row: parseInput(row.value)).toDF()

Spark Version: 3.5.3
Spark Master: local[*]


In [None]:
# Shut down the Spark session held in `spark` once it is no longer needed.
spark.stop()

In [2]:
# Import SparkContext from the PySpark library
from pyspark import SparkContext

# Create the SparkContext for this example:
#   master  = "local"           -> run Spark in local mode on this machine
#   appName = "FlatMapExample"  -> name given to the Spark application
sc = SparkContext(master="local", appName="FlatMapExample")

# Distribute a small list of words as an RDD
words_rdd = sc.parallelize(
    ["Hello", "world", "how", "are", "you"]
)

# Define a function to split each word into its individual characters
def split_word(word):
    """Return the individual characters of ``word`` as a list, in order."""
    return [char for char in word]

try:
    # Apply flatMap: split_word returns a list of characters per word, and
    # flatMap flattens those lists into a single RDD of characters.
    characters_rdd = words_rdd.flatMap(split_word)

    # Collect the result into a list on the driver for printing
    result = characters_rdd.collect()

    # Print the result
    print("Original Words:", words_rdd.collect())
    print("Characters:", result)
finally:
    # Always stop the SparkContext, even if one of the jobs above fails.
    # Otherwise the context stays active and creating a new SparkContext
    # (e.g. by re-running the cell that builds `sc`) fails.
    sc.stop()

Original Words: ['Hello', 'world', 'how', 'are', 'you']
Characters: ['H', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd', 'h', 'o', 'w', 'a', 'r', 'e', 'y', 'o', 'u']


In [1]:
import os
import sys

# Point both PySpark interpreter settings at the Python executable running
# this kernel, so the driver and workers use the same interpreter.
# NOTE(review): this presumably has to run before any SparkContext is created
# to take effect — confirm the intended cell order.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

print("Python path set to:", sys.executable)

Python path set to: c:\Users\Hathim\anaconda3\envs\MSCDM\python.exe
