In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
sns.set_theme()
import string
import random

## Per tal que la classe que fem vagi bé, determinem que haurà de tenir dues columnes:
- Label (on hi han les variables categoriques de classificació).
- Text on hi haurà el text a classificar.

In [2]:
file_path = '../Dades/SMSSpamCollection'

In [3]:
df = pd.read_csv(file_path, header=None, sep='\t', names=['Label', 'Text'])

In [4]:
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Fem una classe on li passem un dataframe com el que tenim d'exemple i ens realitzi tots els calculs del naive bayes.

### Mètodes principals:
2. Fit
1. Evaluate

- **Soft classification:** probabilitats per cada classe. Tornar dataframe amb la probabilitat de que sigui spam i una altre columna de que sigui ham. (una columna nova per cada variabler categorica que tinguem).
- **Hard classification:** la funció ens retornaria la que té superior probabilitat. dataframe on hi ha la predicció.
<hr/>
- Després podem generalitzar de si passem les etiquetes son ham, spam, content trist, etc.
- Haurem de fer un sentiment analysis classificats amb 6 classes. Hauría de ser un parametre de init i si vols tenen aquest nom. O, que el dataframe ja estigui *tractat* i que ho generi automaticament la classe.

In [5]:
def print_first_n_elements(dictionary, n):
    count = 0
    msg = ""
    for key, value in dictionary.items():
        msg += f"{key}: {value} \n"
        count += 1
        if count == n:
            break
    return msg

class NaiveBayes():

    def __init__(self, train):
        # *1 -> Per facilitat de lectura, no es recomenable fer el "fit" al "init"
        df = self.add_tokenize_col(train)
        self.init_dictionary(df)
        self.fill_dictionary(df)
        self.set_n_words_for_label(df)
        self.set_prior(df)

    # Representation of the object when using the print() function.
    def __repr__(self):
        return '+++ head of test data: \n%s' %(print_first_n_elements(self.dictionary, 3))

    # print the n, first registers of the dictionary
    def head(self, n=5):
        print('+++ first %d elements of test data: \n%s' %(n, print_first_n_elements(self.dictionary, n)))

    # Add the column tokenize to the dataframe
    def add_tokenize_col(self, df):
        df['tokens'] = df.Text.apply(self.tokenize)
        return df

    # Given a message (a group of words), this function returns an array containing each word separately.
    # It processes the message by replacing punctuation with spaces, converting to lowercase, and then splitting into individual words.
    def tokenize(self, message):
        for p in string.punctuation:
            message = message.replace(p, ' ')
        return message.lower().split()

    # Create a dictionary where each unique word is a key, and the corresponding value is another dictionary.
    # The inner dictionary contains all possible labels from the 'Label' column in the DataFrame,
    # and each label is initialized with a counter set to 0.
    def init_dictionary(self, df):
        self.dictionary = { word: {label: 0 for label in df.Label.unique()} for word in list(set(df['tokens'].sum())) }
        # Això és poc eficient, com ho ha fet ella al nbc.py està millor
    
    # Check all 'tokens' columns in every row. Every time a word appears, increase the counter for that word
    # in the current 'label' being processed.
    def fill_dictionary(self, df):
        _ = df.apply(self.count_words, axis = 1)
        del _

    def count_words(self, message):
        for word in message.tokens:
            self.dictionary[word][message.Label] +=1

    # We aim to count the occurrences of each word or token for a specific label ('ham' or 'spam', for example).
    # Iterate through all key-value pairs in the dictionary. The key is 'word' and the value is 'counts'
    # The value or 'counts' is another dictionary representing counts for each label. Ex: 'love' : {'ham': 0, 'spam': 25}.
    # Access all the words and retrieve their counts in the inner dictionary with the specified label to create a list.
    # Finally, apply the sum() function to calculate the total occurrences of a word with the specified label.
    # So, we do it for all the possible labels
    def set_n_words_for_label(self, df):
        self.n_words_for_label = { f'n_{label}': sum(counts[label] for word, counts in self.dictionary.items()) for label in df.Label.unique() }

    # Set all the priors (how many times appears each label in the dataset)
    def set_prior(self, df):
        self.prior = df.Label.value_counts(normalize = True)

    # Given new data, overrides all the train data given on the __init__ method.
    def fit(self, new_train_data):
        self.__init__(new_train_data)

    # Show histogram of frequency of words
    def hist_word_count_plt(self, col='', n=100):
        if col == '':
            col = next(iter(next(iter(self.dictionary.items()))[1].items()))[0]
        try:
            if int(n) <= 0:
                n = 100
        except ValueError:
            n = 100

        random_keys = random.sample(self.dictionary.keys(), n)
        random_records = {key: self.dictionary[key] for key in random_keys}

        df = pd.DataFrame(random_records).T
        plt.figure(figsize=(12, 26))
        sns.barplot(x=df[col], y=df.index, color='red')
        plt.title(f'Frequency of words that are {col}')
        plt.xlabel('Frequency')
        plt.ylabel('Word')
        plt.xticks(fontsize=6)
        plt.show()

    def x_word_likelihood(self, label, word, alpha):
        if word in self.dictionary:
            return (self.dictionary[word][label] + alpha) /(self.n_words_for_label[f'n_{label}'] +(len(self.dictionary) *alpha))
        else:
            return 1

    def classify(self, tokens, alpha):
        posts = { key: value for key, value in self.prior.items() }

        for word in tokens:
            for key, value in posts.items():
                posts[key] *= self.x_word_likelihood(key, word, alpha)

        # Rename the dictionary key with post_ on the beginning
        return { f'post_{key}' : value for key, value in posts.items() }

    def expandir_diccionario(self, row):
        return pd.Series(row['predicted'])

    def obtener_clave_valor_mas_alto(self, dictionary):
        all_zeros = all(value == 0 for value in dictionary.values())
        if all_zeros:
            return '??'

        max_key = max(dictionary, key=dictionary.get)
        return max_key.split('_')[1]

    # *2-> No fer-ho per un dataframe sino per una row i cridar-lo desde fora
    def soft_eval(self, test, alpha=1):
        df = self.add_tokenize_col(test)
        df['predicted'] = df.tokens.apply(lambda x: self.classify(x, alpha))
        df_expandido = pd.concat([df, df.apply(self.expandir_diccionario, axis=1)], axis=1)
        df_expandido = df_expandido.drop('predicted', axis=1)
        return df_expandido

    def hard_eval(self, test, alpha=1):
        df = self.add_tokenize_col(test)
        df['predicted_aux'] = df.tokens.apply(lambda x: self.classify(x, alpha))
        df['predicted'] = df['predicted_aux'].apply(self.obtener_clave_valor_mas_alto)
        df = df.drop('predicted_aux', axis=1)
        return df

    def percentages_eval(self, test, alpha=1):
        df = self.add_tokenize_col(test)
        df = self.hard_eval(df, alpha)
        return pd.concat((df.groupby('Label').predicted.value_counts(), df.groupby('Label').predicted.value_counts(normalize = True)), axis = 1)

### *1 i *2 comentaris que he pres a classe. 

In [6]:
df = pd.read_csv(file_path, header=None, sep='\t', names=['Label', 'Text'])
df.head(2)

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [7]:
X_train, X_test = train_test_split(df, test_size = 0.2, random_state = 2873)

In [8]:
classifier = NaiveBayes(X_train)

In [9]:
classifier.percentages_eval(X_test, alpha=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Label,predicted,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,ham,946,0.994742
ham,spam,4,0.004206
ham,??,1,0.001052
spam,spam,153,0.932927
spam,ham,11,0.067073


## TODO:
- el **percentages_eval()** funciona correctament. S'ha de provar que funcioni bé els altres dos. S'ha de comprovar amb l'altre notebook, que donin el mateix resultat.
- s'ha de ficar comentaris a la part de **eval()**
- s'ha de pasar a module i provar de fer import i que funcionin les gràfiques i tot bé.
- buscar com fer documentació per el metode perque quan poses el classifier.""" ja et surti les possibles opcions que hi ha (de metodes) i quins parametres s'han de passar.
- fer comprovació dels parametres passats, perquè no peti la classe.