In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, year, month, dayofmonth
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Initialize Spark session with adjusted configurations
spark = SparkSession.builder \
    .appName("RetailDataAnalysis") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.task.cpus", "1") \
    .config("spark.sql.shuffle.partitions", "50") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

# Load dataset
file_path = "retail_data.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Convert InvoiceDate to datetime and create year, month, day columns
df = df.withColumn("InvoiceDate", to_timestamp(col("InvoiceDate"), "dd/MM/yyyy HH:mm"))
df = df.withColumn("Year", year(col("InvoiceDate")))
df = df.withColumn("Month", month(col("InvoiceDate")))
df = df.withColumn("Day", dayofmonth(col("InvoiceDate")))

# Drop rows with missing CustomerID and convert CustomerID to string
df = df.dropna(subset=['CustomerID'])
df = df.withColumn("CustomerID", df["CustomerID"].cast("string"))

# Verify data types
df.printSchema()

# Encode the target variable (Country)
indexer = StringIndexer(inputCol="Country", outputCol="CountryIndex")
df = indexer.fit(df).transform(df)

# Ensure feature columns are numeric
df = df.withColumn("Quantity", col("Quantity").cast("double"))
df = df.withColumn("UnitPrice", col("UnitPrice").cast("double"))
df = df.withColumn("TotalAmount", col("TotalAmount").cast("double"))

# Select features and target variable
feature_cols = ["Quantity", "UnitPrice", "TotalAmount"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Split data into training and test sets
train, test = df.randomSplit([0.7, 0.3], seed=42)

# Feature engineering steps
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# Define the classifier
rf = RandomForestClassifier(featuresCol="scaledFeatures", labelCol="CountryIndex", numTrees=10)

# Define the pipeline
pipeline = Pipeline(stages=[scaler, rf])

# Train the model
model = pipeline.fit(train)

# Make predictions
predictions = model.transform(test)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="CountryIndex", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

# Confusion matrix and classification report
predictions.groupBy("CountryIndex", "prediction").count().show()
print(f"Test Accuracy: {accuracy:.6f}")

# Stop Spark session
spark.stop()


                                                                                

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)



                                                                                



Exception in thread "block-manager-storage-async-thread-pool-76" java.lang.OutOfMemoryError: unable to create native thread: possibly out of memory or process/resource limits reached
	at java.base/java.lang.Thread.start0(Native Method)
	at java.base/java.lang.Thread.start(Thread.java:798)
	at java.base/java.util.concurrent.ThreadPoolExecutor.addWorker(ThreadPoolExecutor.java:937)
	at java.base/java.util.concurrent.ThreadPoolExecutor.processWorkerExit(ThreadPoolExecutor.java:1005)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
java.lang.IllegalStateException: problem in scala.concurrent internal callback
	at scala.concurrent.Future$InternalCallbackExecutor$.reportFailure(Future.scala:877)
	at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:72)
	at scala.concurrent.impl.Promis



java.lang.IllegalStateException: problem in scala.concurrent internal callback
	at scala.concurrent.Future$InternalCallbackExecutor$.reportFailure(Future.scala:877)
	at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:72)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:288)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:288)
	at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:288)
	at scala.concurrent.Promise.tryFailure(Promise.scala:112)
	at scala.concurrent.Promise.tryFailure$(Promise.scala:112)
	at scala.concurrent.impl.Promise$DefaultPromise.tryFailure(Promise.scala:187)
	at org.apache.spark.rpc.netty.NettyRpcEnv.org$apache$spark$rpc$netty$NettyRpcEnv$$onFailure$1(NettyRpcEnv.scala:214)
	at org.apache.spark.rpc.netty.NettyRpcEnv.onSuccess$1(NettyRpcEnv.scala:223)
	at org.apache.spark.rpc.netty.NettyRpcEnv.$anonfun$askAbortable$5(NettyRpcEnv.scala:239)
	at

+------------+----------+------+
|CountryIndex|prediction| count|
+------------+----------+------+
|         2.0|       0.0|  2529|
|        15.0|       0.0|   126|
|        22.0|       0.0|    78|
|        34.0|       0.0|    13|
|        23.0|       0.0|    54|
|        31.0|       0.0|    13|
|         0.0|       0.0|108414|
|        28.0|       0.0|    14|
|        17.0|       0.0|   124|
|        36.0|       0.0|     3|
|        18.0|       0.0|   111|
|        16.0|       0.0|   125|
|        14.0|       0.0|   169|
|        12.0|       0.0|   218|
|         4.0|       0.0|   761|
|        19.0|       0.0|    99|
|        26.0|       0.0|    42|
|        25.0|       0.0|    54|
|        32.0|       0.0|    11|
|        33.0|       0.0|    11|
+------------+----------+------+
only showing top 20 rows

Test Accuracy: 0.888712


24/08/01 05:24:19 ERROR BlockManagerMasterEndpoint: Fail to know the executor driver is alive or not.
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint$$anonfun$handleBlockRemovalFailure$1.applyOrElse(BlockManag

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings

# Load dataset
file_path = "retail_data.csv"
df = pd.read_csv(file_path)

# Preprocess the data
# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M')

# Drop rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Convert CustomerID to string
df['CustomerID'] = df['CustomerID'].astype(str)

# Encode the target variable (Country)
le = LabelEncoder()
df['Country'] = le.fit_transform(df['Country'])

# Select features and target variable
X = df[['Quantity', 'UnitPrice', 'TotalAmount']]
y = df['Country']

# Sample the data for initial testing (optional, remove for full dataset)
X, _, y, _ = train_test_split(X, y, test_size=0.9, random_state=42)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define feature engineering steps including preprocessing
feature_engineering_steps = [
    ('scaling', StandardScaler()),                   # Step 1: Standard scaling
    ('feature_selection', SelectKBest(f_classif))    # Step 2: Feature selection (SelectKBest)
    # Omitting PolynomialFeatures and PCA for now
]

# Classifiers to evaluate
classifiers = [
    ('svm', SVC()),                                 # SVM classifier
    ('random_forest', RandomForestClassifier())     # Random Forest classifier
]

# List to store results
results = []

# Iterate over classifiers
for clf_name, clf_method in classifiers:
    # Define pipeline with warning suppression
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")  # Ignore warnings
        pipeline = Pipeline([
            ('feature_engineering', Pipeline(feature_engineering_steps)),  # Feature engineering steps
            ('classification', clf_method)  # Classifier
        ])

        # Perform grid search with cross-validation
        grid_search = GridSearchCV(pipeline, param_grid={}, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)

        # Best parameters and best score
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        # Predict on test set with best model
        y_pred = grid_search.predict(X_test)

        # Calculate accuracy
        test_accuracy = accuracy_score(y_test, y_pred)

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # Classification report for more detailed evaluation
        clf_report = classification_report(y_test, y_pred, labels=le.transform(le.classes_), target_names=le.classes_)

        # Print confusion matrix and classification report
        print(f"Classifier: {clf_name}")
        print(f"Train Accuracy (CV): {best_score:.6f}")
        print(f"Test Accuracy: {test_accuracy:.6f}")
        print("Confusion Matrix:")
        print(cm)
        print("Classification Report:")
        print(clf_report)
        print("=" * 50)

        # Store results if needed
        results.append({
            'Classifier': clf_name,
            'Train Accuracy (CV)': best_score,
            'Test Accuracy': test_accuracy,
            'Confusion Matrix': cm,
            'Classification Report': clf_report
        })


Classifier: svm
Train Accuracy (CV): 0.888401
Test Accuracy: 0.888243
Confusion Matrix:
[[    0     0     0 ...     0    32     0]
 [    0     0     0 ...     0    12     0]
 [    0     0     0 ...     0    51     0]
 ...
 [    0     0     0 ...     0     1     0]
 [    0     0     0 ...     0 10841     0]
 [    0     0     0 ...     0     8     0]]
Classification Report:
                      precision    recall  f1-score   support

           Australia       0.00      0.00      0.00        32
             Austria       0.00      0.00      0.00        12
             Bahrain       0.00      0.00      0.00         0
             Belgium       0.00      0.00      0.00        51
              Brazil       0.00      0.00      0.00         1
              Canada       0.00      0.00      0.00         2
     Channel Islands       0.00      0.00      0.00        19
              Cyprus       0.00      0.00      0.00         9
      Czech Republic       0.00      0.00      0.00         2
    

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import warnings

# Load dataset
file_path = "retail_data.csv"
df = pd.read_csv(file_path)

# Preprocess the data
# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M')

# Drop rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Convert CustomerID to string
df['CustomerID'] = df['CustomerID'].astype(str)

# Encode the target variable (Country)
le = LabelEncoder()
df['Country'] = le.fit_transform(df['Country'])

# Select features and target variable
X = df[['Quantity', 'UnitPrice', 'TotalAmount']]
y = df['Country']

# Sample the data for initial testing (optional, remove for full dataset)
X, _, y, _ = train_test_split(X, y, test_size=0.9, random_state=42)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Custom transformer to select a classifier
class ClassifierTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y=None):
        self.classifier.fit(X, y)
        return self

    def transform(self, X):
        return self.classifier.predict_proba(X)

# Define classifiers to use for feature engineering
feature_classifiers = {
    'logistic': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())
    ]),
    'gradient_boosting': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', GradientBoostingClassifier())
    ])
}

# Create feature engineering pipeline with FeatureUnion
feature_union = FeatureUnion([
    (name, ClassifierTransformer(clf)) for name, clf in feature_classifiers.items()
])

# Final classifier
final_classifier = LogisticRegression()

# Final pipeline combining feature union and final classifier
pipeline = Pipeline([
    ('features', feature_union),    # Feature union
    ('classifier', final_classifier)   # Classifier
])

# Fit pipeline with warning suppression
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")  # Ignore warnings
    pipeline.fit(X_train, y_train)

# Predict and evaluate on the test data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)

# Adjust the target names to match the predicted labels
unique_labels = le.inverse_transform(sorted(set(y_test).union(set(y_pred))))
clf_report = classification_report(y_test, y_pred, labels=sorted(set(y_test).union(set(y_pred))), target_names=unique_labels)

print("Confusion Matrix:\n", cm)
print("Classification Report:\n", clf_report)


Test Accuracy: 0.8833
Confusion Matrix:
 [[    1     0     0 ...     0    26     0]
 [    0     0     0 ...     0    11     0]
 [    0     0     0 ...     0    50     0]
 ...
 [    0     0     0 ...     0     1     0]
 [   10     0     0 ...     0 10754     0]
 [    0     0     0 ...     0     8     0]]
Classification Report:
                       precision    recall  f1-score   support

           Australia       0.05      0.03      0.04        32
             Austria       0.00      0.00      0.00        12
             Belgium       0.00      0.00      0.00        51
              Brazil       0.00      0.00      0.00         1
              Canada       0.00      0.00      0.00         2
     Channel Islands       0.00      0.00      0.00        19
              Cyprus       0.00      0.00      0.00         9
      Czech Republic       0.00      0.00      0.00         2
             Denmark       0.00      0.00      0.00        13
                EIRE       0.23      0.02      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Load dataset
file_path = "retail_data.csv"
df = pd.read_csv(file_path)

# Preprocess the data
# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M')

# Drop rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Convert CustomerID to string
df['CustomerID'] = df['CustomerID'].astype(str)

# Encode the target variable (Country)
le = LabelEncoder()
df['Country'] = le.fit_transform(df['Country'])

# Select features and target variable
X = df[['Quantity', 'UnitPrice', 'TotalAmount']]
y = df['Country']

# Sample the data for initial testing (optional, remove for full dataset)
X, _, y, _ = train_test_split(X, y, test_size=0.9, random_state=42)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Custom transformer to select a classifier
class ClassifierTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y=None):
        self.classifier.fit(X, y)
        return self

    def transform(self, X):
        return self.classifier.predict_proba(X)

# Define classifiers to use for feature engineering
feature_classifiers = {
    'logistic': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())
    ]),
    'gradient_boosting': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', GradientBoostingClassifier())
    ])
}

# Create feature engineering pipeline with FeatureUnion
feature_union = FeatureUnion([
    (name, Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', ClassifierTransformer(clf))
    ])) for name, clf in feature_classifiers.items()
])

# Final classifier (SVM)
final_classifier = SVC(probability=True)

# Combine original features with the outputs from classifiers
class FeatureConcatenator(BaseEstimator, TransformerMixin):
    def __init__(self, feature_union):
        self.feature_union = feature_union
    
    def fit(self, X, y=None):
        self.feature_union.fit(X, y)
        return self
    
    def transform(self, X):
        # Get the outputs from the classifiers
        classifier_features = self.feature_union.transform(X)
        # Concatenate original features with classifier outputs
        combined_features = np.hstack((X, classifier_features))
        return combined_features

# Create pipeline including original features and classifier outputs
pipeline = Pipeline([
    ('features', FeatureConcatenator(feature_union)),  # Combine features
    ('scaler', StandardScaler()),  # Standardize features
    ('classifier', final_classifier)   # Classifier
])

# Fit pipeline
pipeline.fit(X_train, y_train)

# Predict and evaluate on the test data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
unique_labels = le.inverse_transform(sorted(set(y_test).union(set(y_pred))))
clf_report = classification_report(y_test, y_pred, labels=sorted(set(y_test).union(set(y_pred))), target_names=unique_labels)

print("Confusion Matrix:\n", cm)
print("Classification Report:\n", clf_report)

# Print best classifier name based on the highest accuracy score
best_classifier_name = max(feature_classifiers, key=lambda k: accuracy_score(y_test, feature_classifiers[k].predict(X_test)))
print(f"Best Classifier: {best_classifier_name}")


Test Accuracy: 0.8868
Confusion Matrix:
 [[    0     0     0 ...     0    31     0]
 [    0     0     0 ...     0    12     0]
 [    0     0     0 ...     0    50     0]
 ...
 [    0     0     0 ...     0     1     0]
 [    2     0     0 ...     0 10805     0]
 [    0     0     0 ...     0     8     0]]
Classification Report:
                       precision    recall  f1-score   support

           Australia       0.00      0.00      0.00        32
             Austria       0.00      0.00      0.00        12
             Belgium       0.00      0.00      0.00        51
              Brazil       0.00      0.00      0.00         1
              Canada       0.00      0.00      0.00         2
     Channel Islands       0.00      0.00      0.00        19
              Cyprus       0.00      0.00      0.00         9
      Czech Republic       0.00      0.00      0.00         2
             Denmark       0.00      0.00      0.00        13
                EIRE       0.26      0.02      0.0

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Load dataset
file_path = "retail_data.csv"
df = pd.read_csv(file_path)

# Preprocess the data
# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d/%m/%Y %H:%M')

# Drop rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Convert CustomerID to string
df['CustomerID'] = df['CustomerID'].astype(str)

# Encode the target variable (Country)
le = LabelEncoder()
df['Country'] = le.fit_transform(df['Country'])

# Select features and target variable
X = df[['Quantity', 'UnitPrice', 'TotalAmount']]
y = df['Country']

# Sample the data for initial testing (optional, remove for full dataset)
X, _, y, _ = train_test_split(X, y, test_size=0.9, random_state=42)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Custom transformer to select a classifier
class ClassifierTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y=None):
        self.classifier.fit(X, y)
        return self

    def transform(self, X):
        return self.classifier.predict_proba(X)

# Define classifiers to use for feature engineering
feature_classifiers = {
    'logistic': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())
    ]),
    'gradient_boosting': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', GradientBoostingClassifier())
    ])
}

# Create feature engineering pipeline with FeatureUnion
feature_union = FeatureUnion([
    (name, Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', ClassifierTransformer(clf))
    ])) for name, clf in feature_classifiers.items()
])

# Final classifier (SVM)
final_classifier = SVC(probability=True)

# Combine original features with the outputs from classifiers
class FeatureConcatenator(BaseEstimator, TransformerMixin):
    def __init__(self, feature_union):
        self.feature_union = feature_union
    
    def fit(self, X, y=None):
        self.feature_union.fit(X, y)
        return self
    
    def transform(self, X):
        # Get the outputs from the classifiers
        classifier_features = self.feature_union.transform(X)
        # Concatenate original features with classifier outputs
        combined_features = np.hstack((X, classifier_features))
        return combined_features

# Create pipeline including original features and classifier outputs
pipeline = Pipeline([
    ('features', FeatureConcatenator(feature_union)),  # Combine features
    ('scaler', StandardScaler()),  # Standardize features
    ('classifier', final_classifier)   # Classifier
])

# Fit pipeline
pipeline.fit(X_train, y_train)

# Transform X_test to get the combined features
X_test_combined = pipeline.named_steps['features'].transform(X_test)

# Number of features after combining
num_combined_features = X_test_combined.shape[1]
print(f"Number of combined features: {num_combined_features}")

# Predict and evaluate on the test data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
unique_labels = le.inverse_transform(sorted(set(y_test).union(set(y_pred))))
clf_report = classification_report(y_test, y_pred, labels=sorted(set(y_test).union(set(y_pred))), target_names=unique_labels)

print("Confusion Matrix:\n", cm)
print("Classification Report:\n", clf_report)

# Print best classifier name based on the highest accuracy score
best_classifier_name = max(feature_classifiers, key=lambda k: accuracy_score(y_test, feature_classifiers[k].predict(X_test)))
print(f"Best Classifier: {best_classifier_name}")


Number of combined features: 114
Test Accuracy: 0.8864
Confusion Matrix:
 [[    0     0     0 ...     0    31     0]
 [    0     0     0 ...     0    12     0]
 [    0     0     0 ...     0    50     0]
 ...
 [    0     0     0 ...     0     1     0]
 [    2     0     0 ...     0 10799     0]
 [    0     0     0 ...     0     8     0]]
Classification Report:
                       precision    recall  f1-score   support

           Australia       0.00      0.00      0.00        32
             Austria       0.00      0.00      0.00        12
             Belgium       0.00      0.00      0.00        51
              Brazil       0.00      0.00      0.00         1
              Canada       0.00      0.00      0.00         2
     Channel Islands       0.00      0.00      0.00        19
              Cyprus       0.00      0.00      0.00         9
      Czech Republic       0.00      0.00      0.00         2
             Denmark       0.00      0.00      0.00        13
                E

In [9]:
import pandas as pd

# Load dataset
file_path = "retail_data.csv"
df = pd.read_csv(file_path)

# Determine the size of the dataset
num_samples = df.shape[0]
num_features = df.shape[1]

print(f"Number of samples (instances): {num_samples}")
print(f"Number of features: {num_features}")


Number of samples (instances): 406829
Number of features: 9


In [10]:
# Assuming you have already fitted the pipeline and predicted on test data

# Transform X_test to get the combined features
X_test_combined = pipeline.named_steps['features'].transform(X_test)

# Original features are the first n columns (where n is the number of original features)
num_original_features = X_test.shape[1]
original_features = X_test_combined[:, :num_original_features]

# Additional features are the remaining columns
additional_features = X_test_combined[:, num_original_features:]

print(f"Original Features shape: {original_features.shape}")
print(f"Additional Features shape: {additional_features.shape}")


Original Features shape: (12205, 3)
Additional Features shape: (12205, 111)
