In [1]:
# Import TfidfVectorizer for text feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Import PCA for dimensionality reduction
from sklearn.decomposition import PCA

# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Import SelectKBest and mutual_info_classif for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# Import train_test_split for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# Import RandomUnderSampler for undersampling imbalanced data
from imblearn.under_sampling import RandomUnderSampler

# Import tqdm for progress bars
from tqdm.notebook import tqdm

# Import deepcopy for copying objects
from copy import deepcopy

# Import zip_longest for iterating over multiple iterables of different lengths
from itertools import zip_longest

# Import matplotlib.pyplot and seaborn for data visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import seaborn as sb

# Apply seaborn's default theme globally, so every matplotlib figure drawn
# after this cell picks it up.
sb.set_theme()

import warnings

# Silence every RuntimeWarning for the rest of the session.
# NOTE(review): presumably aimed at NumPy overflow / divide warnings raised by
# the membership functions further down (np.exp and division by `s`) - confirm.
# The filter is global, so it also hides genuine numerical problems elsewhere.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
class Preprocessing:
    """Turn raw SMS-collection text into numeric feature tables.

    The raw text is parsed into (message, label) pairs, vectorised with
    TF-IDF, and then reduced in two independent ways: a PCA projection and a
    mutual-information based feature selection.
    """

    # Label token that starts a raw line -> label name used by this class.
    _RAW_LABELS = {"ham": "legitimate", "spam": "spam"}

    def __init__(self, data) -> None:
        """Store the raw dataset text, one ``<label><separator><message>`` per line."""
        self.data = data

    def _process_data(self):
        """Parse the raw text into parallel arrays of messages and labels.

        Every non-blank line must start with ``ham`` or ``spam`` followed by a
        single whitespace separator (a tab in the original dataset) and the
        message text. Blank lines are ignored. Lines that do not match are
        skipped with a warning instead of being emitted with a ``None``
        message, which would crash the vectoriser and be encoded as spam.

        Returns:
            tuple: ``(records, labels)`` as two 1-D string arrays of equal
            length; labels are ``"legitimate"`` or ``"spam"``.
        """
        records = []
        labels = []

        # No `[:-1]` here: the last record must survive even when the text
        # does not end with a newline. Blank lines are filtered explicitly.
        for line_no, line in enumerate(self.data.split("\n"), start=1):
            if not line.strip():
                continue

            for token, label in self._RAW_LABELS.items():
                separator = line[len(token) : len(token) + 1]
                if line.startswith(token) and separator.isspace():
                    records.append(line[len(token) + 1 :])
                    labels.append(label)
                    break
            else:
                warnings.warn(
                    f"Skipping malformed record on line {line_no}: {line[:40]!r}",
                    stacklevel=2,
                )

        # An explicit dtype keeps the arrays string-typed even when empty.
        return np.array(records, dtype=str), np.array(labels, dtype=str)

    def tfidf_vectorizer(self, records):
        """Vectorise the messages with TF-IDF.

        Text is lower-cased, only purely alphabetic tokens are kept, and rows
        are not normalised (``norm=None``).

        Returns:
            tuple: ``(dense matrix, feature names)``.
        """
        vectorizer = TfidfVectorizer(
            lowercase=True, token_pattern=r"\b[A-Za-z]+\b", norm=None
        )

        records_transformed = vectorizer.fit_transform(records)

        return records_transformed.toarray(), vectorizer.get_feature_names_out()

    def feature_extraction(self, X, n_components=5):
        """Project ``X`` onto ``n_components`` principal components (no whitening)."""
        reduction_pca = PCA(n_components=n_components, whiten=False)
        return reduction_pca.fit_transform(X)

    def feature_selection(self, df_records, labels, n_components=5):
        """Keep the ``n_components`` features scoring highest on mutual information.

        Returns:
            tuple: ``(selected feature matrix, names of the selected features)``.
        """
        feature_selection_model = SelectKBest(mutual_info_classif, k=n_components)
        selected_record_features = feature_selection_model.fit_transform(
            df_records, labels
        )

        return selected_record_features, feature_selection_model.get_feature_names_out()

    def random_under_sampling(self, df_records, labels):
        """Resample the data with a ``RandomUnderSampler`` (fixed seed 18).

        Returns:
            tuple: ``(X_resampled, y_resampled)``.
        """
        sampler = RandomUnderSampler(random_state=18)
        X_resampled, y_resampled = sampler.fit_resample(df_records, labels)
        return X_resampled, y_resampled

    def fit_transform(self, n_components=5):
        """Run the full pipeline on the stored raw text.

        Args:
            n_components: Number of PCA components and number of selected
                features to keep. Defaults to 5.

        Returns:
            tuple: ``(selected, reduced, labels)`` where ``selected`` is a
            DataFrame of the mutual-information selected TF-IDF features,
            ``reduced`` is a DataFrame of PCA components with columns
            ``1..n_components``, and ``labels`` is an integer array with
            0 = legitimate and 1 = spam.
        """
        records, labels = self._process_data()
        records_vectorized, feature_names = self.tfidf_vectorizer(records)

        # Binary label encoding: 0 = legitimate, 1 = spam.
        labels = (labels == "spam").astype(int)

        # Dimensionality reduction works on the dense TF-IDF matrix.
        records_dim_reduced = self.feature_extraction(
            records_vectorized, n_components=n_components
        )

        # Feature selection gets a DataFrame so the selected names survive.
        records_vectorized = pd.DataFrame(records_vectorized, columns=feature_names)
        records_selection, feature_name_selection = self.feature_selection(
            records_vectorized, labels=labels, n_components=n_components
        )

        # Column labels follow the actual number of components, not a fixed 5.
        component_labels = list(range(1, records_dim_reduced.shape[1] + 1))

        return (
            pd.DataFrame(records_selection, columns=feature_name_selection),
            pd.DataFrame(records_dim_reduced, columns=component_labels),
            labels,
        )

In [None]:
# Load the raw dataset text. The encoding is given explicitly so decoding does
# not depend on the platform's locale default (e.g. cp1252 on Windows).
with open("SMSSpamCollection", encoding="utf-8") as f:
    sms_data_str = f.read()

In [None]:
# Wrap the raw text read from the file in the preprocessing pipeline.
preprocessing = Preprocessing(sms_data_str)

# Run the pipeline. It returns, in order: the table of selected TF-IDF
# features, the table of PCA components, and the encoded labels.
records_selection, records_dim_reduced, labels = preprocessing.fit_transform()

# Hold out 20% of the selected-feature table for testing (fixed seed 24; no
# `stratify` argument is passed). `x_train` and `y_train` are read as
# module-level globals by `Chromosome.initialize` and
# `Chromosome.calculate_fitness`, so this cell must run before those are used.
x_train, x_test, y_train, y_test = train_test_split(
    records_selection, labels, test_size=0.2, random_state=24
)

In [None]:
class Chromosome:
    """One fuzzy if-then rule: ``<var> is [not] <term> and ... then <class>``.

    Each chosen variable carries a linguistic term, a negation flag and a
    membership function with two parameters (``m`` and ``s``).
    """

    # Linguistic terms a variable can be assigned in a rule.
    LINGUISTIC_TERMS = ["Normal", "Minor", "Moderate", "Serious", "Severe"]
    # Membership-function names; these are the names `_compute_gR` dispatches on.
    FUNCTIONS = ["iso-tri", "rect-trap", "gaussian", "sigmoid"]

    def __init__(self) -> None:
        self.variables = None
        self.terms_per_variable = None
        self.is_not = None
        self.cls = None
        self.linguistic_model = {}
        self.fitness = None

    def __repr__(self) -> str:
        # Define a string representation of the chromosome
        s = ""
        tmp = []
        for variable in self.variables:
            tmp.append(
                f"{variable} is {'not ' if self.is_not[variable] else ''}{self.terms_per_variable[variable]}"
            )
        s += " and ".join(tmp)
        s += f" then {self.cls}\n"
        s += f"Fitness: {self.fitness}\n"
        s += "---------------------------------------\n"
        for key, value in self.linguistic_model.items():
            s += f"Variable: {key[0]} - Term: {key[1]} - Function: {value[0]} - Param m: {value[1]} - Param s: {value[2]}\n"
        return s

    def initialize(self):
        # Initialize the chromosome with random values
        self.variables = np.random.choice(
            x_train.columns.to_numpy(),
            size=(np.random.randint(1, 5)),
            replace=False,
        )

        self.terms_per_variable = {
            variable: np.random.choice(Chromosome.LINGUISTIC_TERMS)
            for variable in self.variables
        }

        self.is_not = {
            variable: np.random.choice([True, False]) for variable in self.variables
        }

        self.cls = np.random.choice([0, 1])

        for var in self.variables:
            func = np.random.choice(Chromosome.FUNCTIONS)
            param_m = np.random.normal(-100, 100)
            param_s = np.random.normal(-100, 100)
            self.linguistic_model[(var, self.terms_per_variable[var])] = (
                func,
                param_m,
                param_s,
            )

    def calculate_fitness(self):
        f_class = 0
        f_neg_class = 0

        # Iterate over each row in the training data and compute its gR value
        for idx, row in x_train.reset_index().iterrows():
            gR = self._compute_gR(row)

            # Update the f_class and f_neg_class values based on the class of the row
            if y_train[idx] == self.cls:
                f_class += gR
            else:
                f_neg_class += gR

        # Compute the fitness of the chromosome
        if (f_class == 0) and (f_neg_class == 0):
            self.fitness = 0
        else:
            self.fitness = (f_class - f_neg_class) / (f_class + f_neg_class)

    def _compute_gR(self, x):
        # Compute the gR value for a given row of data
        gR = []

        for var in self.variables:
            # Get the function and parameters for the variable from the linguistic model
            fms = self.linguistic_model[(var, self.terms_per_variable[var])]
            function = fms[0]
            m, s = fms[1], fms[2]

            # Compute the value of the function for the variable
            if function == "iso-tri":
                val = np.max([0, np.min([(x[var] - m + s) / s, (m - x[var] + s) / s])])
            elif function == "rect-trap":
                val = np.max([0, np.min([(x[var] - m + s) / s, 1])])
            elif function == "gaussian":
                val = np.exp(-0.5 * (((x[var] - m) / s) ** 2))
            elif function == "sigmoid":
                val = 1 / (1 + np.exp(-((x[var] - m) / s)))

            # Negate the value if necessary
            gR.append(1 - val if self.is_not[var] else val)

        # Return the minimum gR value
        return np.min(gR)