# ECE_7500
## Assignment 1
## Jaideep Siva Senthil



In [116]:
#importing packages
import pandas as pd
import regex as re
import string

# Dataset importing

In [117]:
#importing dataset

url = 'https://raw.githubusercontent.com/jaideep-siva/Assignment_1/main/spam_ham_dataset.csv'
df = pd.read_csv(url)


# Data analysis

In [118]:
# Dimensions of the data set
df.shape

(5171, 4)

In [119]:
# sample values from data set
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


> It appears that data frame consists of 4 columns with 2 label coluns, one text column and one column with count.

In [120]:
# understanding the date split
df['label'].value_counts()

label
ham     3672
spam    1499
Name: count, dtype: int64

> There appears to be a class imbalance with hame emails having twice as more data points as spam emails

In [121]:
#checking for nonexistent values
df.isna().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

# Data preprocessing

In [122]:
class data_preprocessing:

  def __init__(self,dataframe):
    self.data = df


  def lower_text(self,text):
    """
        Lowercases all text to make it easier to process.
        :param text: The input text to be converted to lowercase.
        :return: Lowercased text.
        """
    text = text.lower()
    return text

  def remove_subject_re(self,text):
    """
        Removes "Subject:" and "Re:" from emails since they seem to have no inherent value and appear in all emails.
        :param text: The input text, typically an email subject line.
        :return: Text with "Subject:" and "Re:" removed.
        """
    text = re.sub(r'^\s*subject:\s*(re\s*:)?\s*', '', text)
    return text

  def remove_numbers(self,text):
    """
        Removes numbers, as dates and random number values may not contribute meaningful information.
        :param text: The input text possibly containing numbers.
        :return: Text with numbers removed.
        """
    text = re.sub(r'[0-9]+', '', text)
    return text

  def remove_punctuations(self,text):
    """
        Removes punctuations to prevent random characters from becoming tokens, which can make the text corpus too large.
        :param text: The input text that may contain punctuation.
        :return: Text with punctuation removed.
        """
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [123]:
# lowercasing all text in df
df['text'] = df['text'].apply(lambda x: data_preprocessing(df).lower_text(x))

df.head()


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"subject: photoshop , windows , office . cheap ...",1
4,2030,ham,subject: re : indian springs\r\nthis deal is t...,0


In [124]:
# removing Subject: and re: from text
df['text'] = df['text'].apply(lambda x: data_preprocessing(df).remove_subject_re(x))

df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,enron methanol ; meter # : 988291\r\nthis is a...,0
1,2349,ham,"hpl nom for january 9 , 2001\r\n( see attached...",0
2,3624,ham,"neon retreat\r\nho ho ho , we ' re around to t...",0
3,4685,spam,"photoshop , windows , office . cheap . main tr...",1
4,2030,ham,indian springs\r\nthis deal is to book the tec...,0


In [125]:
# applying remove numbers 
df['text'] = df['text'].apply(lambda x: data_preprocessing(df).remove_numbers(x))
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,enron methanol ; meter # : \r\nthis is a follo...,0
1,2349,ham,"hpl nom for january , \r\n( see attached file...",0
2,3624,ham,"neon retreat\r\nho ho ho , we ' re around to t...",0
3,4685,spam,"photoshop , windows , office . cheap . main tr...",1
4,2030,ham,indian springs\r\nthis deal is to book the tec...,0


In [126]:
# applying remove punctuations 
df['text'] = df['text'].apply(lambda x: data_preprocessing(df).remove_punctuations(x))
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,enron methanol meter \r\nthis is a follow u...,0
1,2349,ham,hpl nom for january \r\n see attached file ...,0
2,3624,ham,neon retreat\r\nho ho ho we re around to tha...,0
3,4685,spam,photoshop windows office cheap main trendi...,1
4,2030,ham,indian springs\r\nthis deal is to book the tec...,0


# Feature Extraction

## Class and functions for feature extraction

In [127]:
class feature_extraction:
    """
    Initializes the FeatureExtraction class with a DataFrame containing 'text' and 'label' columns.
    :param data: A DataFrame with 'text' and 'label' columns.
    """
    def __init__(self, data):
        self.data = data
        self.frequency_dict = {} # ceating an empty dict to store the feature vectirs

    def build_frequency_dict(self):
        """
        Builds a frequency dictionary that counts the occurrences of words associated with 'spam' and 'ham' labels in the data.
        """
        for index, row in self.data.iterrows():
            words = row['text'].split()     # split the email into words
            label = row['label']    # get the label of the email 'spam' or 'ham'
            for word in words:
                # Update the frequency count for each word and its associated label.
                if (word, label) in self.frequency_dict:
                    self.frequency_dict[(word, label)] += 1
                else:
                    self.frequency_dict[(word, label)] = 1

    def extract_features(self, text):
        """
        Extracts features from a given text, including bias, spam frequency, and ham frequency.
        :param text: The input text to extract features from.
        :return: A tuple containing bias, spam frequency, and ham frequency.
        """
        words = text.split()    #split the word into texts
        bias = 1    #initializing a default bias of 1
        spam_frequency = 0  #initalizing spam word frequency 
        ham_frequency = 0   #intializing ham word frequency
        for word in words:
            # Retrieve word frequencies from the pre-built frequency dictionary.
            spam_frequency += self.frequency_dict.get((word, 'spam'), 0)
            ham_frequency += self.frequency_dict.get((word, 'ham'), 0)
        return bias, spam_frequency, ham_frequency


## Getting features

In [102]:
feature_extractor.build_frequency_dict()
df['features'] = df['text'].apply(feature_extractor.extract_features)


In [103]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,features
0,605,ham,enron methanol meter \r\nthis is a follow u...,0,"(1, 46320, 136187)"
1,2349,ham,hpl nom for january \r\n see attached file ...,0,"(1, 2815, 16376)"
2,3624,ham,neon retreat\r\nho ho ho we re around to tha...,0,"(1, 635297, 1466796)"
3,4685,spam,photoshop windows office cheap main trendi...,1,"(1, 638, 341)"
4,2030,ham,indian springs\r\nthis deal is to book the tec...,0,"(1, 64303, 165774)"


# Function for train and test split

In [104]:
# importing random function to shuffle the data set
import random

In [105]:
def train_test_split(corpus):
"""
    Splits a corpus into training and test sets.

    :param corpus: The input corpus 
    :return: A tuple containing the training set and the test set.
    """
    
  # intializing split in this case I have chosen to split 70% for training and 30% for testing 
  split = 0.7
  
  #Shuffle the corpus to ensure randomization
  corpus = corpus.sample(frac=1)

  # Finding the index at which the corupus needs to be splot
  split_index = int(split* len(corpus))

  # Split the corpus into training and test sets
  train_set = corpus[:split_index]
  test_set = corpus[split_index:]

  return train_set, test_set


## Checking train and test split

In [106]:
corpus = df
train_set, test_set = train_test_split(corpus)

## Checking train/test split

In [107]:
train_set.shape

(3619, 5)

In [108]:
test_set.shape

(1552, 5)

# Logreg class

In [109]:
import numpy as np


# Extract the training set features 
X = train_set['features'].values

# Reshape the features into a 2D array with a single column 
X = X.reshape(-1, 1)

# Extract labels from the training set labels 
y = train_set['label_num']

# coverting the labels into type numpy
y = np.array(y)

# Extract  testing set features 
x_test = test_set['features'].values

x_test = x_test.reshape(-1,1)

y_test = test_set['label_num']

y_test = np.array(y_test)




In [110]:

class logreg:
  def __init__(self, learning_rate=0.01, epochs=1000):
      """
        Initializes the LogisticRegression class.
        :param learning_rate: Learning rate for gradient descent (default: 0.01).
        :param epochs: Number of training epochs (default: 1000).
        """
      self.learning_rate = learning_rate
      self.epochs = epochs
      self.theta = np.random.rand(3)
      self.costs = []

  def sigmoid(self, Z):
     """
        Calculates the sigmoid activation function.
        :param Z: Input value.
        :return: Sigmoid of Z.
    """
      y_pred = 1 / (1 + np.exp(-Z))
      y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15) # clipping to avoid log(0) which causes an error with the cost function 
      return y_pred


  def cost(self, y_pred, y,N):
        """
        Calculates the logistic regression cost (cross-entropy) function.

        :param y_pred: Predicted value from the sigmoid function
        :param y: True Labels
        :param N: sample size
        :return: Cross-entropy cost.
        """
      cost = -(1 / N) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
      return cost

  def gradient(self, X, y, y_pred,N):
        """
        Calculates the gradient of the cost function.

        :param X: Input features.
        :param y: label
        :param y_pred: Predicted value
        :param N: sample size
        :return: Gradient of the cost.
        """
      gradient = (1 / N) * np.dot(X.T, (y_pred - y))
      return gradient

  def update(self, theta, gradient):
        """
        Updates the model parameters using gradient descent.

        :param theta: calculated vector that determines function
        :param gradient: Gradient desent of the cost function
        :return: Updated theta
        """
      theta -= self.learning_rate * gradient
      return theta

  def fit(self, X, y):
       """
        Fits the logistic regression model to the training data.

        :param X: Input features.
        :param y: Actual labels.
        :return: Trained theta
        """
      N =  len(X) # Number of samples in the training data.
      self.theta = np.random.rand(3)  # assingning 3 random values for theta


      for _ in range(self.epochs):
          total_cost = 0
          for j in range(N):
              arr = (X[j])
              arr_int = np.array([list(x) for x in arr], dtype=int)
              X_j = arr_int.reshape(3,)
              y_j = y[j]
              Z = np.dot(X_j, self.theta)
              y_pred = self.sigmoid(Z)
              cost = self.cost(y_pred, y_j,N)
              gradient = self.gradient(X_j, y_j, y_pred,N)
              self.theta = self.update(self.theta, gradient) 
      return self.theta


  def accuracy(y_test,predicted_y):
        """
        Calculates the accuracy of the model.

        :param y_test: true labels.
        :param predicted_y: Predicted labels.
        :return: Accuracy.
        """
    correct_predictions = 0
    N = len(y_test)
    for i in range(N):
      if y_test[i] == predicted_y[i]:
        correct_predictions += 1
    accuracy = correct_predictions / N
      
    return accuracy

  def confusion_matrix(y_test,predicted_y):
        """
        Calculates the confusion matrix of the testing set

        :param y_test: True labels.
        :param predicted_y: Predicted labels.
        :return: Confusion matrix [TP, TN, FP, FN].
        """
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    N = len(y_test)
      # Calculate TP, TN, FP, and FN
    for i in range(N):
      if y_test[i] == 1 and predicted_y[i] == 1:
        TP += 1
      elif y_test[i] == 0 and predicted_y[i] == 0:
        TN += 1
      elif y_test[i] == 0 and predicted_y[i] == 1:
        FP += 1
      elif y_test[i] == 1 and predicted_y[i] == 0:
        FN += 1

      # Print the confusion matrix
    return [TP,TN,FP,FN]



  def evaluate(x_test,y_test,theta) :
      """
        Evaluates the logistic regression model on the test data.

        :param x_test: Test features.
        :param y_test: True  labels.
        :param theta: From fitting the training model
        :return: Accuracy and confusion matrix.
      """
    predictions = []
    T = len(x_test)
    theta = theta
    for j in range(T):
      arr = (x_test[j])
      arr_int = np.array([list(x) for x in arr], dtype=int)
      X_j = arr_int.reshape(3,)
      y_j = y_test[j]
      Z = np.dot(X_j,theta)
      y_pred = 1 / (1 + np.exp(-Z))
      predictions.append(y_pred)

    predictions =np.array(predictions)
    predictions = np.greater_equal(predictions, 0.5).astype(int)

    acc = accuracy(y_test,predictions)
    c_m = confusion_matrix(y_test,predictions)

    return acc,c_m






In [111]:
# Create an instance of the 'LogisticRegression' class assigned learning rate and epochs
model = logreg(learning_rate =0.01, epochs = 100)

# storing the theta vector from fitting the model to training dataset
theta = model.fit(X,y)

# evaluating the model to on the testing data set and storing the values in metrics
metrics =[]
metrics = evaluate(x_test,y_test,theta)

  y_pred = 1 / (1 + np.exp(-Z))
  y_pred = 1 / (1 + np.exp(-Z))


## Testing Metrics

In [112]:
# printing out the accuracy of the model
acc = metrics[0]
print(f'The accuracy is {(round(acc,2)*100)}%')

#printing the confusion matrix of the model
c_m = metrics[1]
print(f'True Positve: {c_m[0]}')
print(f'True Negative: {c_m[1]}')
print(f'False Positve: {c_m[2]}')
print(f'False negative: {c_m[3]}')



The accuracy is 87.0%
True Positve: 277
True Negative: 1080
False Positve: 10
False negative: 185


# SKlearn packages based classification

In [113]:
# importing packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [114]:
# Load the dataset
url = 'https://raw.githubusercontent.com/jaideep-siva/Assignment_1/main/spam_ham_dataset.csv'
data = pd.read_csv(url)



# Define features and labels 
X = data['text']
y = data['label_num']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [115]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_vectorized = vectorizer.transform(X_test)

In [89]:
# Initialize and train the Logistic Regression model
clf = LogisticRegression(random_state=0)
clf.fit(X_train_vectorized, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [130]:
y_pred = clf.predict(X_test_vectorized)

# Calculate accuracy and print a classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {(round(accuracy,2)*100)}")
print(report)

Accuracy: 98.0
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       742
           1       0.97      0.97      0.97       293

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035

