In [52]:
import pandas as pd
import numpy as np

from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')
english_stop_words = stopwords.words("english")
# print(english_stop_words)

import string
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import itertools

data = pd.read_csv("enronSpamSubset.csv")
train_data, test_data = train_test_split(data)
# data

# print(data.iloc[10]["Body"])
# print(data.iloc[10]["Label"])

# wordpunct_tokenize(data.iloc[10]["Body"])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\const\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
class CLASIFICATORUL_INTELIGENT():
    
    def __init__(self, data, no_classes=2):
        self.data = data
        self.no_classes = no_classes 
        self.prob_map = {i: {} for i in range(no_classes)}
        self.spam_words = []
        self.ham_words = []
    
    
    def prepare_data(self):
        for row_index, row in tqdm(self.data.iterrows()):
            body = row["Body"]
            label = row["Label"]
            words = self.clean_body(body)
            # Daca e spam, il punem la spam, daca nu, la nespam
            if label == 1:
                self.spam_words.append(words)
            else: 
                self.ham_words.append(words)
        self.print_stats()
        self.spam_words = list(itertools.chain.from_iterable(self.spam_words))
        self.ham_words = list(itertools.chain.from_iterable(self.ham_words))
    
    def predict(self, body):
        words = self.clean_body(body)
        ham_log_prob = self._handle_predict(words, 0)
        spam_log_prob = self._handle_predict(words, 1)
        return np.argmax([ham_log_prob, spam_log_prob])
        
    def _handle_predict(self, words, class_index):
        log_probs = []
        for word in words:
            if word in self.prob_map[class_index]:
                proba = self.prob_map[class_index][word]
            else:
                proba = 1e-8
            log_probs.append(np.log(proba))
        return np.sum(log_probs)
    
    def fit(self):
        smoothing_factor = len(np.unique(self.spam_words + self.ham_words))
        self._handle_fit(self.spam_words, 1, smoothing_factor)
        self._handle_fit(self.ham_words, 0, smoothing_factor)
        
    def _handle_fit(self, words, class_index, smoothing_factor):
        from collections import Counter 
        words_counter = Counter(words)
        for word in words_counter:
            self.prob_map[class_index][word] = (words_counter[word] + 1) / (len(words) + smoothing_factor)
    
    def print_stats(self):
        self._stat_helper(self.spam_words, "<SPAM>")
        self._stat_helper(self.ham_words, "<HAM>")
    
    def _stat_helper(self, mat, class_name):
        lens = list(map(lambda x: len(x), mat))
        print(f"FACEM STAT PENTRU: {class_name}")
        print(f"MEDIA DE CUVINTE ESTE: {np.mean(lens)}")
        print("--------------------------------------------------")
        
    # Avem cuvinte 
    def clean_body(self, body):
        words = wordpunct_tokenize(body)
        words = self.make_lower(words)
        words = self.remove_stop_words(words)
        words = self.remove_punctuation(words)
        words = self.replace_numbers(words)
        return words
        
    def make_lower(self, words):
        return [word.lower() for word in words]
    
    def remove_stop_words(self, words):
        return [word for word in words if word not in english_stop_words]
    
    def remove_punctuation(self, words):
        return [word for word in words if word not in string.punctuation]
    
    def replace_numbers(self, words):
        return ["<SUPER_BET>" if word.isdigit() else word for word in words]

In [61]:
clasificatooorul = CLASIFICATORUL_INTELIGENT(data, 2)
clasificatooorul.prepare_data()
clasificatooorul.fit()

10000it [00:13, 738.83it/s]


FACEM STAT PENTRU: <SPAM>
MEDIA DE CUVINTE ESTE: 139.0178
--------------------------------------------------
FACEM STAT PENTRU: <HAM>
MEDIA DE CUVINTE ESTE: 181.0638
--------------------------------------------------


In [40]:
clasificatooorul.spam_words

[['subject',
  'stock',
  'promo',
  'mover',
  'cwtd',
  'urgent',
  'investor',
  'trading',
  'alert',
  'weekly',
  'stock',
  'pick',
  'china',
  'world',
  'trade',
  'corp',
  'ticker',
  'cwtd',
  'breaking',
  'news',
  'china',
  'world',
  'trade',
  'corp',
  'enters',
  'agreement',
  'acquire',
  'majority',
  'stake',
  'ceo',
  'clubs',
  'china',
  'limited',
  'ceo',
  'clubs',
  'tianhe',
  'guangzhou',
  'china',
  'market',
  'wire',
  'apr',
  '<SUPER_BET>',
  '<SUPER_BET>',
  'china',
  'world',
  'trade',
  'corp',
  'otc',
  'bb',
  'cwtd',
  'ob',
  'news',
  'announced',
  'today',
  'entered',
  'agreement',
  'acquired',
  'majority',
  'stake',
  'ceo',
  'clubs',
  'china',
  'limited',
  'ceo',
  'clubs',
  'hong',
  'kong',
  'corporation',
  'authorized',
  'chapter',
  'operate',
  'ceo',
  'clubs',
  'trademarks',
  'greater',
  'china',
  'region',
  'including',
  'prc',
  'hong',
  'kong',
  'taiwan',
  'china',
  'world',
  'trade',
  'corp',
  

In [59]:
clasificatooorul.prob_map[1]

{'subject': 0.00716722800485905,
 'stock': 0.0013025455794133253,
 'promo': 4.7764307669269604e-05,
 'mover': 1.0327417874436672e-05,
 'cwtd': 5.938265277801086e-05,
 'urgent': 0.000176857031099728,
 'investor': 0.00029433140942144515,
 'trading': 0.0003033679000615772,
 'alert': 0.0002091302119573426,
 'weekly': 7.100099788675212e-05,
 'pick': 0.00013554735960198133,
 'china': 0.000415678569446076,
 'world': 0.0008662121742183758,
 'trade': 0.0005331529477677931,
 'corp': 0.00018718444897416467,
 'ticker': 2.581854468609168e-05,
 'breaking': 0.00014071106853919964,
 'news': 0.001123106693844988,
 'enters': 3.872781702913752e-05,
 'agreement': 0.0002478580289864801,
 'acquire': 0.0002000937213172105,
 'majority': 7.100099788675212e-05,
 'stake': 6.0673580012315446e-05,
 'ceo': 0.00017169332216250967,
 'clubs': 2.4527617451787095e-05,
 'limited': 0.0005292801660648794,
 'tianhe': 5.163708937218336e-06,
 'guangzhou': 1.1618345108741256e-05,
 'market': 0.001021123442334926,
 'wire': 8.520

In [71]:
clasificatooorul.predict("Hello")

1

In [77]:
def evalueaza_model(model, test_data):
    from sklearn.metrics import classification_report
    predicted_labels = []
    true_labels = []
    for row_index, row in test_data.iterrows():
        body = row["Body"]
        label = row["Label"]
        true_labels.append(label)
        predicted_labels.append(model.predict(body))
    print(classification_report(true_labels, predicted_labels))
    
evalueaza_model(clasificatooorul, test_data)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1212
           1       1.00      0.99      0.99      1288

    accuracy                           0.99      2500
   macro avg       0.99      0.99      0.99      2500
weighted avg       0.99      0.99      0.99      2500

