In [70]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Initialize Spark session
spark = SparkSession.builder \
    .appName("TrainPredictionModel") \
    .getOrCreate()

# Load training data
train_path = "Train.csv"
train_data = spark.read.csv(train_path, header=True, inferSchema=True)

# Display first few rows of training data to understand its structure
train_data.show(5)
train_data.printSchema()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|      Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|              Tier 1|Supermarket Type1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|              Tier 3|Superma

In [71]:
# Check for null values in the dataset
train_data.select([col(c).isNull().alias(c) for c in train_data.columns]).show()

# Drop rows with null values (or fill them as needed)
train_data = train_data.na.drop()
train_data.show(5)

+---------------+-----------+----------------+---------------+---------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+---------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|          false|      false|           false|          false|    false|   false|            false|                    false|      false|               false|      false|            false|
|          false|      false|           false|          false|    false|   false|            false|                    false|      false|               false|      false|            false|
|          false|      false|           false|         

In [72]:
# Convert categorical columns to string type
categorical_columns = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

for col_name in categorical_columns:
    train_data = train_data.withColumn(col_name, train_data[col_name].cast('string'))
    

# Convert numerical columns to double type
numerical_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']
for col_name in numerical_columns:
    train_data = train_data.withColumn(col_name, train_data[col_name].cast('double'))

In [73]:
# Apply string indexer and one hot encoder to categorical columns
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index", handleInvalid='skip') for col in categorical_columns]
encoders = [OneHotEncoder(inputCol=col + "_index", outputCol=col + "_encoded") for col in categorical_columns]

In [74]:
# create feature vector for all columns
feature_columns =  ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']\

# [col + "_encoded" for col in categorical_columns] +

In [75]:


from pyspark.ml.regression import LinearRegression

train_data = train_data.withColumn(
    "label",
    when(col("Item_Outlet_Sales") < 500, 0)
    .when((col("Item_Outlet_Sales") >= 500) & (col("Item_Outlet_Sales") < 1000), 1)
    .otherwise(2)
)

# Assemble features
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Define Logistic Regression model
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Create pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Tạo pipeline
assembler = VectorAssembler(inputCols=["Item_Weight", "Item_Visibility", "Item_MRP", "Outlet_Establishment_Year"], outputCol="features")
lr = LinearRegression(featuresCol="features", labelCol="Item_Outlet_Sales")
pipeline = Pipeline(stages=[assembler, lr])

# Split data into training and testing
train_set, test_set = train_data.randomSplit([0.8, 0.2], seed=42)

# Train the model
try:
    model = pipeline.fit(train_set)
    print("Model training successful!")

except Exception as e:
    print(f"Error during model training: {e}")

# Evaluate the model on the test set
try:
    predictions = model.transform(test_set)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy"
    )
    accuracy = evaluator.evaluate(predictions)
    print(f"Test Accuracy: {accuracy}")
except Exception as e:
    print(f"Error during model evaluation: {e}")
    
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="Item_Outlet_Sales", predictionCol="prediction", metricName="f1"
)
f1_score = evaluator_f1.evaluate(predictions)
print(f"F1-Score = {f1_score}")



25/01/14 01:15:34 WARN Instrumentation: [0838afcf] regParam is zero, which might cause numerical instability and overfitting.


Model training successful!
Test Accuracy: 0.0
F1-Score = 0.0


In [76]:
print(type(model))


<class 'pyspark.ml.pipeline.PipelineModel'>


In [77]:
# Save Model
# import os
# os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk-1.8'
# os.environ['HADOOP_HOME'] = r'D:\Spark\spark-3.5.3-bin-hadoop3'

model_path = 'model'

try:
    model.write().overwrite().save(model_path)
    print(f"Model saved successfully to {model_path}")
except Exception as e:
    print(f"Error saving model: {e}")

Model saved successfully to model


In [78]:
# from pyspark.sql import SparkSession
# from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.regression import LinearRegression
# from pyspark.ml import Pipeline

# # Initialize Spark Session
# spark = SparkSession.builder \
#     .appName("TrainingModel") \
#     .getOrCreate()

# # Load Dataset
# dataset_path = "Train.csv"  # Path to the dataset
# df = spark.read.csv(dataset_path, header=True, inferSchema=True)

# # Define Feature Columns and Label
# feature_columns = ["Item_Weight", "Item_Visibility", "Item_MRP", "Outlet_Establishment_Year"]
# assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# # Define Linear Regression Model
# lr = LinearRegression(featuresCol="features", labelCol="Item_Outlet_Sales")

# # Create Pipeline
# pipeline = Pipeline(stages=[assembler, lr])

# # Train Model
# model = pipeline.fit(df)

# # Save Model
# model_path = "model"
# model.write().overwrite().save(model_path)

# print("Model training completed and saved.")
