In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import fasttext
import re
import string
import math
import unidecode
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from scipy.spatial import distance
import numpy as np
import pandas as pd
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def preprocess_text(text, minimum_length=1, stopword_removal=True, stopwords_domain=[], lower_case=True,
                    punctuation_removal=True):
    """
    preprocess text by removing stopwords, punctuations, and converting to lowercase, and also filter based on a min length
    for stopwords use nltk.corpus.stopwords.words('english')
    for punctuations use string.punctuation

    Parameters
    ----------
    text: str
        text to be preprocessed
    minimum_length: int
        minimum length of the token
    stopword_removal: bool
        whether to remove stopwords
    stopwords_domain: list
        list of stopwords to be removed base on domain
    lower_case: bool
        whether to convert to lowercase
    punctuation_removal: bool
        whether to remove punctuations
    """
    if lower_case:
        text = text.lower()
    text = unidecode.unidecode(text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<br\s*/?>', '', text)
    text.strip()
    if punctuation_removal:
        translator = str.maketrans('', '', string.punctuation)
        text = text.translate(translator)
    stop_words = set(stopwords.words('english'))
    stop_words.update(stopwords_domain)
    if stopword_removal:
        new_text = ''
        lemmatizer = WordNetLemmatizer()
        tokens = pos_tag(word_tokenize(text))
        for token in tokens:
            word, tag = token
            if word not in stopwords_domain and len(word) > minimum_length:
                wntag = tag[0].lower()
                wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
                if not wntag:
                    lemma = word
                else:
                    lemma = lemmatizer.lemmatize(word, wntag)

                new_text = new_text + lemma + ' '
        text = new_text
    return text.strip()


class FastText:
    """
    A class used to train a FastText model and generate embeddings for text data.

    Attributes
    ----------
    method : str
        The training method for the FastText model.
    model : fasttext.FastText._FastText
        The trained FastText model.
    """

    def __init__(self, method='skipgram'):
        """
        Initializes the FastText with a preprocessor and a training method.

        Parameters
        ----------
        method : str, optional
            The training method for the FastText model.
        """
        self.method = method
        self.model = None

    def train(self, texts, text_file_path='data/FastText_data.txt', should_load_data=False):
        """
        Trains the FastText model with the given texts.

        Parameters
        ----------
        texts : list of str
            The texts to train the FastText model.
        """
        if should_load_data:
            all_text = ''
            for text in tqdm(texts):
                all_text += text + '\n'
            with open(text_file_path, 'w', encoding='utf-8') as file:
                file.write(all_text)
                file.close()

        self.model = fasttext.train_unsupervised(text_file_path, model=self.method)
        print("Model trained successfully")

    def get_query_embedding(self, query):
        """
        Generates an embedding for the given query.

        Parameters
        ----------
        query : str
            The query to generate an embedding for.
        tf_idf_vectorizer : sklearn.feature_extraction.text.TfidfVectorizer
            The TfidfVectorizer to transform the query.
        do_preprocess : bool, optional
            Whether to preprocess the query.

        Returns
        -------
        np.ndarray
            The embedding for the query.
        """
        preprocessed_query = preprocess_text(query)
        return self.model.get_sentence_vector(preprocessed_query)

    def analogy(self, word1, word2, word3):
        """
        Perform an analogy task: word1 is to word2 as word3 is to __.

        Args:
            word1 (str): The first word in the analogy.
            word2 (str): The second word in the analogy.
            word3 (str): The third word in the analogy.

        Returns:
            str: The word that completes the analogy.
        """
        # Obtain word embeddings for the words in the analogy
        embedding1 = self.model[word1]
        embedding2 = self.model[word2]
        embedding3 = self.model[word3]

        # Perform vector arithmetic
        v = embedding3 + embedding2 - embedding1

        # Create a dictionary mapping each word in the vocabulary to its corresponding vector
        words = list(self.model.words.copy())

        # Exclude the input words from the possible results
        words = list(set(words).difference([word1, word2, word3]))

        # Find the word whose vector is closest to the result vector
        c_score = math.inf
        chosen_vector = None
        for word in words:
            score = distance.cosine(v, self.model[word])
            if score < c_score:
                c_score = score
                chosen_vector = word
        return chosen_vector

    def save_model(self, path='data/FastText_model.bin'):
        """
        Saves the FastText model to a file.

        Parameters
        ----------
        path : str, optional
            The path to save the FastText model.
        """
        self.model.save_model(path)

    def load_model(self, path="data/FastText_model.bin"):
        """
        Loads the FastText model from a file.

        Parameters
        ----------
        path : str, optional
            The path to load the FastText model.
        """
        self.model = fasttext.load_model(path)

    def prepare(self, dataset, mode, save=False, path='/Users/divar/University/term-8/information-retrieval/imdb-mir-system/Logic/data/FastText_model.bin'):
        """
        Prepares the FastText model.

        Parameters
        ----------
        dataset : list of str
            The dataset to train the FastText model.
        mode : str
            The mode to prepare the FastText model.
        """
        if mode == 'train':
            self.train(dataset)
        if mode == 'load':
            self.load_model(path)
        if save:
            self.save_model(path)

class FastTextDataLoader:
    """
    This class is designed to load and pre-process data for training a FastText model.

    It takes the file path to a data source containing movie information (synopses, summaries, reviews, titles, genres) as input.
    The class provides methods to read the data into a pandas DataFrame, pre-process the text data, and create training data (features and labels)
    """

    def __init__(self, preprocess, file_path='data/IMDB_crawled.json'):
        """
        Initializes the FastTextDataLoader class with the file path to the data source.

        Parameters
        ----------
        file_path: str
            The path to the file containing movie information.
        """
        self.preprocess = preprocess
        self.file_path = file_path
        self.le = None
        self.mapping = None

    def read_data_to_df(self, should_ignore_empty_genres=True):
        """
        Reads data from the specified file path and creates a pandas DataFrame containing movie information.

        You can use an IndexReader class to access the data based on document IDs.
        It extracts synopses, summaries, reviews, titles, and genres for each movie.
        The extracted data is then stored in a pandas DataFrame with appropriate column names.

        Returns
        ----------
            pd.DataFrame: A pandas DataFrame containing movie information (synopses, summaries, reviews, titles, genres).
        """
        with open(self.file_path, 'r') as f:
            documents = json.loads(f.read())
            f.close()
        data = []
        for doc in tqdm(documents):
            title = doc.get('title', '')
            if title is None:
                title = ''
            synopsis = doc.get('synopsis', [])
            if synopsis is None:
                synopsis = []
            summaries = doc.get('summaries', [])
            if summaries is None:
                summaries = []
            reviews = doc.get('reviews', [])
            if reviews is None:
                reviews = []
            genres = doc.get('genres', [])
            if genres is None:
                continue
            # Check for empty records
            if should_ignore_empty_genres and len(genres) == 0:
                print(f'doc_id={doc["id"]} has None genre!')
                continue
            if title == '' and len(synopsis) == len(summaries) == len(reviews) == 0:
                print(f'doc_id={doc["id"]} is None!')
                continue
            # Preprocess and add to df data
            genres = genres[0]
            data.append({
                'title': self.preprocess(title),
                'synopsis': self.preprocess(' '.join(synopsis)),
                'summaries': self.preprocess(' '.join(summaries)),
                'reviews': self.preprocess(' '.join(x[0] for x in ([['', '']] if reviews is None or len(reviews) == 0 else reviews))),
                'genres': self.preprocess(genres),
            })
        return pd.DataFrame(data)

    def create_train_data(self):
        """
        Reads data using the read_data_to_df function, pre-processes the text data, and creates training data (features and labels).

        Returns:
            tuple: A tuple containing two NumPy arrays: X (preprocessed text data) and y (encoded genre labels).
        """
        df = self.read_data_to_df()
        self.le = LabelEncoder()
        df['genres'] = self.le.fit_transform(df['genres'])
        self.mapping = dict(zip(range(len(self.le.classes_)), self.le.classes_))
        df['text'] = df['synopsis'] + ' ' + df['summaries'] + ' ' + df['reviews'] + ' ' + df['title']
        x = np.array(df['text'])
        y = np.array(df['genres'])
        return x, y

class ReviewLoader:
    def __init__(
            self,
            file_path: str = '/Users/divar/University/term-8/information-retrieval/imdb-mir-system/Logic/data/classification.pkl',
            comments_path: str = '/Users/divar/University/term-8/information-retrieval/imdb-mir-system/Logic/data/comments_training.csv'
    ):
        self.file_path = file_path
        self.comments_path = comments_path
        self.df = None
        self.fasttext_model = FastText()
        self.fasttext_model.prepare(dataset=[], mode='load', save=False)
        self.review_tokens = []
        self.sentiments = []
        self.embeddings = []

    def save_data(self):
        """
        Load the data from the csv file and preprocess the text. Then save the normalized tokens and the sentiment labels.
        Also, load the fasttext model.
        """
        self.df = pd.read_csv(self.comments_path)
        self.df['review'] = self.df['review'].apply(preprocess_text)
        self.df['review_embedding'] = self.df['review'].apply(self.fasttext_model.get_query_embedding)
        mymap = {'positive': 1, 'negative': 0}
        self.df['sentiment'] = self.df['sentiment'].apply(lambda s: mymap.get(s) if s in mymap else s)
        self.df['review_embedding'] = self.df['review_embedding'].apply(list)
        self.df.to_pickle(self.file_path)

    def load_data(self):
        """
        Load the data from the csv file and preprocess the text. Then save the normalized tokens and the sentiment labels.
        Also, load the fasttext model.
        """
        self.df = pd.read_pickle(self.file_path)

    def get_embeddings(self):
        """
        Get the embeddings for the reviews using the fasttext model.
        """
        pass

    def split_data(self, test_data_ratio=0.2):
        """
        Split the data into training and testing data.

        Parameters
        ----------
        test_data_ratio: float
            The ratio of the test data
        Returns
        -------
        np.ndarray, np.ndarray, np.ndarray, np.ndarray
            Return the training and testing data for the embeddings and the sentiments.
            in the order of x_train, x_test, y_train, y_test
        """
        threshold = int(test_data_ratio * self.df.shape[0])
        train_df, test_df = train_test_split(self.df, test_size=threshold, random_state=42)
        return (
            train_df['review_embedding'].values, test_df['review_embedding'].values,
            train_df['sentiment'].values, test_df['sentiment'].values
        )

In [2]:
import pickle

import numpy as np
from tqdm import tqdm


class BasicClassifier:
    def __init__(self, model_path):
        self.model = None
        self.path = model_path

    def fit(self, x, y):
        raise NotImplementedError()

    def predict(self, x):
        raise NotImplementedError()

    def prediction_report(self, x, y):
        raise NotImplementedError()

    def save(self):
        with open(self.path, 'wb') as f:
            pickle.dump(self.model, f)

    def load(self):
        with open(self.path, 'rb') as f:
            self.model = pickle.load(f)

    def get_percent_of_positive_reviews(self, sentences):
        """
        Get the percentage of positive reviews in the given sentences
        Parameters
        ----------
        sentences: list
            The list of sentences to get the percentage of positive reviews
        Returns
        -------
        float
            The percentage of positive reviews
        """
        pass


# Load Review Data

In [3]:
loader = ReviewLoader()
loader.load_data()
x_train, x_test, y_train, y_test = loader.split_data()



# KNN classifier

In [4]:
import numpy as np
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import scipy.spatial
from collections import Counter
from sklearn.decomposition import PCA

class KnnClassifierData:
    def __init__(self):
        self.k = None
        self.pca = None
        self.X_train = None
        self.y_train = None


class KnnClassifier(BasicClassifier):
    def __init__(self, n_neighbors):
        super().__init__(
            model_path='/Users/divar/University/term-8/information-retrieval/imdb-mir-system/'
                       'Logic/data/classification/knn.pkl'
        )
        self.model = KnnClassifierData()
        self.model.k = n_neighbors
        self.model.pca = PCA(n_components=20)
        self.model.X_train = None
        self.model.y_train = None

    def fit(self, x, y):
        """
        Fit the model using X as training data and y as target values
        use the Euclidean distance to find the k nearest neighbors
        Warning: Maybe you need to reduce the size of X to avoid memory errors

        Parameters
        ----------
        x: np.ndarray
            An m * n matrix - m is count of docs and n is embedding size
        y: np.ndarray
            The real class label for each doc
        Returns
        -------
        self
            Returns self as a classifier
        """
        self.model.pca.fit(np.array(list(x)))
        self.model.X_train = self.model.pca.transform(np.array(list(x)))
        self.model.y_train = y

    def predict(self, x):
        """
        Parameters
        ----------
        x: np.ndarray
            An k * n matrix - k is count of docs and n is embedding size
        Returns
        -------
        np.ndarray
            Return the predicted class for each doc
            with the highest probability (argmax)
        """
        x_reduced = self.model.pca.transform(np.array(list(x)))
        predictions = []
        for i in tqdm(range(len(x_reduced))):
            d = []
            votes = []
            for j in range(len(self.model.X_train)):
                dist = scipy.spatial.distance.euclidean(self.model.X_train[j], x_reduced[i])
                d.append([dist, j])
            d.sort()
            d = d[0:self.model.k]
            for d, j in d:
                votes.append(y_train[j])
            ans = Counter(votes).most_common(1)[0][0]
            predictions.append(ans)
        return predictions

    def prediction_report(self, x, y):
        """
        Parameters
        ----------
        x: np.ndarray
            An k * n matrix - k is count of docs and n is embedding size
        y: np.ndarray
            The real class label for each doc
        Returns
        -------
        str
            Return the classification report
        """
        y_pred = self.predict(x)
        return classification_report(y, y_pred)



In [237]:
classifier = KnnClassifier(n_neighbors=3)
classifier.fit(x_train, y_train)
classifier.save()

In [238]:
result = classifier.prediction_report(x_test[:100], y_test[:100])

  0%|          | 0/100 [00:00<?, ?it/s]

In [239]:
print(result)

              precision    recall  f1-score   support

           0       0.69      0.79      0.74        48
           1       0.78      0.67      0.72        52

    accuracy                           0.73       100
   macro avg       0.73      0.73      0.73       100
weighted avg       0.74      0.73      0.73       100



# Deep NN

In [340]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm


class ReviewDataSet(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.FloatTensor(embeddings)
        self.labels = torch.LongTensor(labels)

        if len(self.embeddings) != len(self.labels):
            raise Exception("Embeddings and Labels must have the same length")

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return self.embeddings[i], self.labels[i]


class MLPModel(nn.Module):
    def __init__(self, in_features=100, num_classes=2):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(in_features, 2048),
            nn.ReLU(),
            nn.Linear(2048, 32),
            nn.ReLU(),
            nn.Linear(32, out_features=num_classes)
        )

    def forward(self, xb):
        return self.network(xb)


class DeepModelClassifier(BasicClassifier):
    def __init__(self, in_features, num_classes, batch_size, num_epochs=50):
        """
        Initialize the model with the given in_features and num_classes
        Parameters
        ----------
        in_features: int
            The number of input features
        num_classes: int
            The number of classes
        batch_size: int
            The batch size of dataloader
        """
        super().__init__(
            model_path='/Users/divar/University/term-8/information-retrieval/imdb-mir-system/'
                       'Logic/data/classification/deep.pkl'
        )
        self.test_loader = None
        self.in_features = in_features
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.model = MLPModel(in_features=in_features, num_classes=num_classes)
        self.best_model = self.model.state_dict()
        self.best_test_accuracy = 0
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        self.device = 'mps' if torch.backends.mps.is_available else 'cpu'
        self.device = 'cuda' if torch.cuda.is_available() else self.device
        self.model.to(self.device)
        print(f"Using device: {self.device}")

    def fit(self, x, y):
        """
        Fit the model on the given train_loader and test_loader for num_epochs epochs.
        You have to call set_test_dataloader before calling the fit function.
        Parameters
        ----------
        x: np.ndarray
            The training embeddings
        y: np.ndarray
            The training labels
        Returns
        -------
        self
        """
        train_dataset = ReviewDataSet(x, y)
        self.train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        for epoch in range(self.num_epochs):
            self.model.train()
            correct = 0
            total = 0
            total_loss = 0
            with tqdm(enumerate(self.train_loader), total=len(self.train_loader)) as pbar:
                for i, (embed, label) in pbar:
                    embed = embed.to(self.device)
                    label = label.to(self.device)
                    self.optimizer.zero_grad()
                    output = self.model(embed)
                    loss = self.criterion(output, label)
                    total_loss += loss.item()
                    loss.backward()
                    self.optimizer.step()
                    pred = nn.functional.softmax(output, dim=0).argmax(dim=1)
                    total += len(label)
                    correct += torch.sum(pred == label).item()
            print(
                f'[Train]: epoch={epoch}, avg_acc={100 * (correct / total):.2f}, avg_loss={100 * (total_loss / total):.2f}')
            f1, pred_label, true_label, eval_loss = self._eval_epoch(self.test_loader, self.model)
            test_accuracy = (np.sum(np.array(true_label) == np.array(pred_label)).item()) / len(true_label)
            if test_accuracy > self.best_test_accuracy:
                self.best_test_accuracy = test_accuracy
                self.best_model = self.model.state_dict()
            print(f'[Test]: avg_acc={100 * (test_accuracy):.2f}, avg_loss={100 * (eval_loss / total):.2f}')
        return self

    def predict(self, x):
        """
        Predict the labels on the given test_loader
        Parameters
        ----------
        x: np.ndarray
            The test embeddings
        Returns
        -------
        predicted_labels: list
            The predicted labels
        """
        x = torch.tensor(x).to(self.device)
        output = self.model(x)
        pred = nn.functional.softmax(output, dim=0).argmax(dim=1).cpu().numpy()
        return pred

    def _eval_epoch(self, dataloader: torch.utils.data.DataLoader, model):
        """
        Evaluate the model on the given dataloader. used for validation and test
        Parameters
        ----------
        dataloader: torch.utils.data.DataLoader
        Returns
        -------
        eval_loss: float
            The loss on the given dataloader
        predicted_labels: list
            The predicted labels
        true_labels: list
            The true labels
        f1_score_macro: float
            The f1 score on the given dataloader
        """
        print('Eval Model ...')
        eval_loss = 0
        total = 0
        true_labels = []
        pred_labels = []
        with tqdm(enumerate(self.test_loader), total=len(self.test_loader)) as pbar:
            for i, (embed, label) in pbar:
                embed = embed.to(self.device)
                label = label.to(self.device)
                output = self.model(embed)
                loss = self.criterion(output, label)
                total += len(label)
                eval_loss += loss.item()
                pred_labels.append(nn.functional.softmax(output, dim=0).argmax(dim=1))
                true_labels.append(label)
        pred_labels = list(torch.cat(pred_labels).cpu())
        true_labels = list(torch.cat(true_labels).cpu())
        eval_loss /= total
        f1 = f1_score(true_labels, pred_labels, average='macro')
        return eval_loss, pred_labels, true_labels, f1

    def set_test_dataloader(self, X_test, y_test):
        """
        Set the test dataloader. This is used to evaluate the model on the test set while training
        Parameters
        ----------
        X_test: np.ndarray
            The test embeddings
        y_test: np.ndarray
            The test labels
        Returns
        -------
        self
            Returns self
        """
        test_dataset = ReviewDataSet(X_test, y_test)
        self.test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
        return self

    def prediction_report(self, x, y):
        """
        Get the classification report on the given test set
        Parameters
        ----------
        x: np.ndarray
            The test embeddings
        y: np.ndarray
            The test labels
        Returns
        -------
        str
            The classification report
        """
        self.model.load_state_dict(self.best_model)
        y_pred = self.predict(x)
        return classification_report(y, y_pred, zero_division=0)



In [341]:
classifier = DeepModelClassifier(in_features=100, num_classes=2, batch_size=100)
classifier.set_test_dataloader(np.array(list(x_test)), y_test)
classifier.fit(np.array(list(x_train)), y_train)
classifier.save()

Using device: mps


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=0, avg_acc=78.89, avg_loss=0.47
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=83.89, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=1, avg_acc=83.73, avg_loss=0.36
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=83.90, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=2, avg_acc=84.17, avg_loss=0.34
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=85.01, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=3, avg_acc=84.55, avg_loss=0.34
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=84.63, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=4, avg_acc=84.69, avg_loss=0.34
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=84.38, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=5, avg_acc=85.14, avg_loss=0.33
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=85.07, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=6, avg_acc=85.41, avg_loss=0.33
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=85.48, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=7, avg_acc=85.63, avg_loss=0.32
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=85.90, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=8, avg_acc=86.08, avg_loss=0.32
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.57, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=9, avg_acc=86.33, avg_loss=0.32
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.31, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=10, avg_acc=86.41, avg_loss=0.32
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.76, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=11, avg_acc=86.44, avg_loss=0.32
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=85.20, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=12, avg_acc=86.55, avg_loss=0.32
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.61, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=13, avg_acc=86.56, avg_loss=0.32
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.43, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=14, avg_acc=86.65, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.61, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=15, avg_acc=86.68, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.93, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=16, avg_acc=86.75, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.74, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=17, avg_acc=86.71, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.54, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=18, avg_acc=86.92, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.67, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=19, avg_acc=86.81, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.37, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=20, avg_acc=86.89, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.77, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=21, avg_acc=86.87, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.22, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=22, avg_acc=86.75, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.62, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=23, avg_acc=86.80, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.15, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=24, avg_acc=86.98, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.70, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=25, avg_acc=87.00, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.60, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=26, avg_acc=86.91, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.72, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=27, avg_acc=87.01, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.86, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=28, avg_acc=86.98, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.47, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=29, avg_acc=86.96, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.61, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=30, avg_acc=86.87, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.88, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=31, avg_acc=86.97, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.79, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=32, avg_acc=87.08, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.46, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=33, avg_acc=87.01, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.80, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=34, avg_acc=87.16, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.73, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=35, avg_acc=87.16, avg_loss=0.31
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.84, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=36, avg_acc=87.00, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.75, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=37, avg_acc=87.21, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.65, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=38, avg_acc=87.19, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.49, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=39, avg_acc=87.27, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.92, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=40, avg_acc=87.36, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.59, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=41, avg_acc=87.17, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.60, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=42, avg_acc=87.25, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.55, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=43, avg_acc=87.41, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.41, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=44, avg_acc=87.30, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.77, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=45, avg_acc=87.56, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=87.05, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=46, avg_acc=87.29, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.31, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=47, avg_acc=87.30, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.82, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=48, avg_acc=87.35, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=86.63, avg_loss=0.00


  0%|          | 0/400 [00:00<?, ?it/s]

[Train]: epoch=49, avg_acc=87.37, avg_loss=0.30
Eval Model ...


  0%|          | 0/100 [00:00<?, ?it/s]

[Test]: avg_acc=87.00, avg_loss=0.00


In [343]:
result = classifier.prediction_report(np.array(list(x_test)), y_test)
print(result)

              precision    recall  f1-score   support

           0       0.87      0.88      0.88      4961
           1       0.88      0.87      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



# Naive Bayes

In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

class NaiveBayes(BasicClassifier):
    def __init__(self, count_vectorizer, alpha=1):
        super().__init__(
            model_path='/Users/divar/University/term-8/information-retrieval/imdb-mir-system/'
                       'Logic/data/classification/naivebayes.pkl'
        )
        self.cv = count_vectorizer
        self.num_classes = None
        self.classes = None
        self.number_of_features = None
        self.number_of_samples = None
        self.prior = None
        self.feature_probabilities = None
        self.log_probs = None
        self.alpha = alpha

    def fit(self, x, y):
        """
        Fit the features and the labels
        Calculate prior and feature probabilities

        Parameters
        ----------
        x: np.ndarray
            An m * n matrix - m is count of docs and n is embedding size

        y: np.ndarray
            The real class label for each doc

        Returns
        -------
        self
            Returns self as a classifier
        """
        self.classes = np.unique(y)
        self.num_classes = len(self.classes)
        self.number_of_samples, self.number_of_features = x.shape
        self.prior = []
        for i in range(self.num_classes):
            self.prior.append(sum(y == i) / self.number_of_samples)
        self.feature_probabilities = np.zeros((self.num_classes, self.number_of_features))
        for i in range(self.num_classes):
            x_class = x[np.argwhere(y == i)]
            num_words = np.sum(x_class, axis=0)
            self.feature_probabilities[i, :] = (num_words + self.alpha) / (np.sum(x_class) + self.number_of_features)
        return self

    def predict(self, x):
        """
        Parameters
        ----------
        x: np.ndarray
            An k * n matrix - k is count of docs and n is embedding size
        Returns
        -------
        np.ndarray
            Return the predicted class for each doc
            with the highest probability (argmax)
        """
        n, _ = x.shape
        log_predict_probs = np.zeros((n, self.num_classes))
        for i in range(self.num_classes):
            log_predict_probs[:, i] += np.log(self.prior[i])
            class_feature_prob = np.log(self.feature_probabilities[i, :].reshape(self.number_of_features, 1))
            log_predict_probs[:, i] += (x @ class_feature_prob).squeeze()
        return np.argmax(log_predict_probs, axis=1)

    def prediction_report(self, x, y):
        """
        Parameters
        ----------
        x: np.ndarray
            An k * n matrix - k is count of docs and n is embedding size
        y: np.ndarray
            The real class label for each doc
        Returns
        -------
        str
            Return the classification report
        """
        y_pred = self.predict(x)
        return classification_report(y, y_pred)

    def get_percent_of_positive_reviews(self, sentences):
        """
        You have to override this method because we are using a different embedding method in this class.
        """
        x = self.cv.transform(sentences).toarray()
        pred = self.predict(x)
        return sum(pred) / len(pred)


In [9]:
reviews = loader.df['review'].values[:20000]
sentiments = loader.df['sentiment'].values[:20000]

In [10]:
cv = CountVectorizer()
cv.fit(reviews)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(reviews, sentiments, test_size=0.2, random_state=42)
x_train = cv.transform(x_train).toarray()
x_test = cv.transform(x_test).toarray()

In [12]:
classifier = NaiveBayes(cv)
classifier.fit(x_train, y_train)

<__main__.NaiveBayes at 0x2bcbb72b0>

In [14]:
result = classifier.prediction_report(x_test, y_test)
print(result)

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      2048
           1       0.86      0.82      0.84      1952

    accuracy                           0.85      4000
   macro avg       0.85      0.85      0.85      4000
weighted avg       0.85      0.85      0.85      4000



In [15]:
classifier.save()