In [3]:
import numpy as np
import os
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [4]:
import json

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder


class FastTextDataLoader:
    """
    Loads crawled movie records from a JSON file and turns them into training data.

    The JSON file must contain a list of documents. Each document is read with
    ``doc.get`` for the keys ``title``, ``synopsis``, ``summaries``, ``reviews``
    and ``genres``; ``id`` is only used in diagnostic messages.

    Attributes
    ----------
    preprocess : callable
        Function applied to every text field (and to the genre label) before it
        is stored. It receives a ``str`` and its result is stored as-is.
    file_path : str
        Path of the JSON file to read.
    le : sklearn.preprocessing.LabelEncoder or None
        Encoder fitted by ``create_train_data``; ``None`` until then.
    mapping : dict or None
        ``{encoded_label: genre}`` built by ``create_train_data``; ``None`` until then.
    """

    def __init__(self, preprocess, file_path='data/IMDB_crawled.json'):
        """
        Stores the preprocessing callable and the path of the data source.

        Parameters
        ----------
        preprocess: callable
            Function applied to each text field of every document.
        file_path: str
            The path to the file containing movie information.
        """
        self.preprocess = preprocess
        self.file_path = file_path
        self.le = None
        self.mapping = None

    def read_data_to_df(self, should_ignore_empty_genres=True):
        """
        Reads the JSON file and builds a DataFrame with one row per usable movie.

        A missing or ``None`` field is treated as empty. A document is skipped when
        it has no title, synopsis, summaries and reviews at all, or - if
        ``should_ignore_empty_genres`` is true - when it has no genre.
        Only the first genre of a document is kept as its label; a document
        without genres that is not skipped gets the empty string as label.

        Parameters
        ----------
        should_ignore_empty_genres: bool
            Whether documents whose genre list is missing/empty are dropped.

        Returns
        ----------
            pd.DataFrame: columns ``title``, ``synopsis``, ``summaries``, ``reviews``
            and ``genres``, every value passed through ``self.preprocess``.
        """
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(self.file_path, 'r', encoding='utf-8') as f:
            documents = json.load(f)

        data = []
        for doc in tqdm(documents):
            doc_id = doc.get('id')
            # `or` folds both a missing key and an explicit null into the empty value.
            title = doc.get('title') or ''
            synopsis = doc.get('synopsis') or []
            summaries = doc.get('summaries') or []
            reviews = doc.get('reviews') or []
            genres = doc.get('genres') or []

            # Check for empty records
            if should_ignore_empty_genres and len(genres) == 0:
                print(f'doc_id={doc_id} has None genre!')
                continue
            if title == '' and len(synopsis) == len(summaries) == len(reviews) == 0:
                print(f'doc_id={doc_id} is None!')
                continue

            # Only the first genre is used as the label. When empty genres are
            # allowed there is no first element, so fall back to an empty label
            # instead of raising IndexError.
            genre = genres[0] if len(genres) > 0 else ''
            # The first element of each review entry is taken as the review text
            # (presumably entries are [text, score] pairs - verify against the crawler).
            review_text = ' '.join(review[0] for review in reviews if review)

            # Preprocess and add to df data
            data.append({
                'title': self.preprocess(title),
                'synopsis': self.preprocess(' '.join(synopsis)),
                'summaries': self.preprocess(' '.join(summaries)),
                'reviews': self.preprocess(review_text),
                'genres': self.preprocess(genre),
            })
        return pd.DataFrame(data)

    def create_train_data(self):
        """
        Builds the feature/label arrays from the data returned by ``read_data_to_df``.

        The genre column is label-encoded (the fitted encoder is kept in ``self.le``
        and the ``{code: genre}`` table in ``self.mapping``). The text of a document
        is the concatenation of its synopsis, summaries, reviews and title.

        Returns:
            tuple: A tuple containing two NumPy arrays: X (preprocessed text data) and y (encoded genre labels).
        """
        df = self.read_data_to_df()
        self.le = LabelEncoder()
        df['genres'] = self.le.fit_transform(df['genres'])
        self.mapping = dict(enumerate(self.le.classes_))
        df['text'] = df['synopsis'] + ' ' + df['summaries'] + ' ' + df['reviews'] + ' ' + df['title']
        x = np.array(df['text'])
        y = np.array(df['genres'])
        return x, y


In [5]:
import fasttext
import re
import string
import math

import unidecode
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from scipy.spatial import distance

def preprocess_text(text, minimum_length=1, stopword_removal=True, stopwords_domain=None, lower_case=True,
                    punctuation_removal=True):
    """
    Normalizes a piece of text for FastText training / querying.

    Steps, in order: optional lower-casing, transliteration to ASCII with
    ``unidecode``, replacement of HTML ``<br>`` tags by a space, optional removal
    of ``string.punctuation`` characters and whitespace collapsing. When
    ``stopword_removal`` is true the text is additionally tokenized and POS-tagged
    with NLTK; stopwords and too-short tokens are dropped and the remaining
    tokens are lemmatized with WordNet.

    Note that the length filter and the lemmatization are only applied when
    ``stopword_removal`` is true.

    Parameters
    ----------
    text: str
        text to be preprocessed
    minimum_length: int
        tokens are kept only when they are strictly longer than this value
    stopword_removal: bool
        whether to remove stopwords (NLTK English list plus ``stopwords_domain``)
    stopwords_domain: list or None
        additional, domain specific stopwords to remove
    lower_case: bool
        whether to convert to lowercase
    punctuation_removal: bool
        whether to remove punctuations

    Returns
    -------
    str
        The cleaned text with tokens separated by single spaces.
    """
    if lower_case:
        text = text.lower()
    text = unidecode.unidecode(text)
    # Replace line-break tags by a space (not by nothing) so that the words on
    # both sides of "<br/>" do not fuse into a single token. This has to happen
    # before punctuation removal, which would strip '<', '/' and '>' on its own.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    if punctuation_removal:
        text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse whitespace (including newlines) last, after the removals above
    # may have left gaps behind.
    text = re.sub(r'\s+', ' ', text).strip()
    if not stopword_removal:
        return text

    stop_words = set(stopwords.words('english'))
    stop_words.update(stopwords_domain or ())

    # nltk.pos_tag returns Penn Treebank tags by default; adjectives start with
    # 'J' there, while WordNet calls them 'a'. The other classes share their
    # first letter with the WordNet code.
    tag_to_wordnet = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
    lemmatizer = WordNetLemmatizer()
    kept = []
    for word, tag in pos_tag(word_tokenize(text)):
        if word in stop_words or word.lower() in stop_words:
            continue
        if len(word) <= minimum_length:
            continue
        wordnet_pos = tag_to_wordnet.get(tag[:1].lower())
        # Tokens whose tag has no WordNet counterpart are kept unchanged.
        kept.append(lemmatizer.lemmatize(word, wordnet_pos) if wordnet_pos else word)
    return ' '.join(kept)


class FastText:
    """
    A class used to train a FastText model and generate embeddings for text data.

    Attributes
    ----------
    method : str
        The training method for the FastText model.
    model : fasttext.FastText._FastText
        The trained FastText model; ``None`` until ``train`` or ``load_model`` ran.
    """

    def __init__(self, method='skipgram'):
        """
        Initializes the wrapper with a training method; no model is loaded yet.

        Parameters
        ----------
        method : str, optional
            The training method for the FastText model.
        """
        self.method = method
        self.model = None

    def train(self, texts, text_file_path='data/FastText_data.txt', should_load_data=False):
        """
        Trains the FastText model on the corpus stored in ``text_file_path``.

        Parameters
        ----------
        texts : list of str
            The texts to train the FastText model. Only used when
            ``should_load_data`` is true.
        text_file_path : str, optional
            File holding the training corpus, one text per line.
        should_load_data : bool, optional
            When true, ``texts`` is written to ``text_file_path`` (overwriting it)
            before training. When false the file is used as it is on disk.
        """
        if should_load_data:
            # Stream the texts to disk instead of first concatenating them into
            # one huge string; the with-block closes the file.
            with open(text_file_path, 'w', encoding='utf-8') as file:
                for text in tqdm(texts):
                    file.write(text + '\n')

        self.model = fasttext.train_unsupervised(text_file_path, model=self.method)
        print("Model trained successfully")

    def get_query_embedding(self, query):
        """
        Generates an embedding for the given query.

        The query is passed through ``preprocess_text`` (default options) before
        the sentence vector is requested from the model.

        Parameters
        ----------
        query : str
            The query to generate an embedding for.

        Returns
        -------
        np.ndarray
            The embedding for the query.
        """
        preprocessed_query = preprocess_text(query)
        return self.model.get_sentence_vector(preprocessed_query)

    def analogy(self, word1, word2, word3):
        """
        Perform an analogy task: word1 is to word2 as word3 is to __.

        Args:
            word1 (str): The first word in the analogy.
            word2 (str): The second word in the analogy.
            word3 (str): The third word in the analogy.

        Returns:
            str: The vocabulary word (other than the three inputs) whose vector has
            the smallest cosine distance to ``word3 + word2 - word1``; ``None``
            when no candidate word is available.
        """
        # Perform vector arithmetic
        target = self.model[word3] + self.model[word2] - self.model[word1]

        # Exclude the input words from the possible results
        excluded = {word1, word2, word3}

        # Walk the vocabulary in its own order so that ties are resolved
        # deterministically (first word wins).
        best_word = None
        best_distance = math.inf
        for word in self.model.words:
            if word in excluded:
                continue
            candidate_distance = distance.cosine(target, self.model[word])
            if candidate_distance < best_distance:
                best_distance = candidate_distance
                best_word = word
        return best_word

    def save_model(self, path='data/FastText_model.bin'):
        """
        Saves the FastText model to a file.

        Parameters
        ----------
        path : str, optional
            The path to save the FastText model.
        """
        self.model.save_model(path)

    def load_model(self, path="data/FastText_model.bin"):
        """
        Loads the FastText model from a file.

        Parameters
        ----------
        path : str, optional
            The path to load the FastText model.
        """
        self.model = fasttext.load_model(path)

    def prepare(self, dataset, mode, save=False, path='/Users/divar/University/term-8/information-retrieval/imdb-mir-system/Logic/data/FastText_model.bin'):
        """
        Prepares the FastText model by training or loading it, optionally saving it.

        Parameters
        ----------
        dataset : list of str or None
            The dataset to train the FastText model. When ``None`` in ``'train'``
            mode, the corpus file already on disk is used.
        mode : str
            ``'train'`` to train a new model, ``'load'`` to load one from ``path``.
            Any other value leaves the current model untouched.
        save : bool, optional
            Whether to write the model to ``path`` afterwards.
        path : str, optional
            Model file used for loading and saving.
            NOTE(review): this default is an absolute, machine-specific path,
            unlike the relative defaults of ``save_model`` / ``load_model``; it is
            kept unchanged because existing callers rely on it.
        """
        if mode == 'train':
            # Write the given dataset to the corpus file so that the model is
            # really trained on it; without this the dataset was ignored and a
            # possibly stale file was used.
            self.train(dataset, should_load_data=dataset is not None)
        elif mode == 'load':
            self.load_model(path)
        if save:
            self.save_model(path)


In [7]:
# Load the previously trained FastText model from prepare()'s default path.
ft_model = FastText()
ft_model.prepare(dataset=None, mode='load', save=False)

# Read and preprocess the crawled movies: X holds one text per movie,
# y the label-encoded first genre of that movie.
dataloader = FastTextDataLoader(preprocess=preprocess_text)
X, y = dataloader.create_train_data()

# Human readable genre name for every row of y.
document_labels = [label for label in dataloader.le.inverse_transform(y)]

  7%|██████████▍                                                                                                                                        | 705/9950 [01:44<10:21, 14.87it/s]

doc_id=tt6731210 has None genre!


 13%|██████████████████▊                                                                                                                               | 1279/9950 [02:44<11:33, 12.51it/s]

doc_id=tt26908364 has None genre!


 16%|██████████████████████▋                                                                                                                           | 1543/9950 [03:14<12:28, 11.23it/s]

doc_id=tt17497130 has None genre!


 16%|███████████████████████▏                                                                                                                          | 1583/9950 [03:19<14:41,  9.49it/s]

doc_id=tt15799564 has None genre!


 19%|███████████████████████████▉                                                                                                                      | 1904/9950 [03:54<12:34, 10.66it/s]

doc_id=tt14299894 has None genre!


 21%|███████████████████████████████▎                                                                                                                  | 2132/9950 [04:18<10:15, 12.70it/s]

doc_id=tt0251123 has None genre!


 32%|██████████████████████████████████████████████▊                                                                                                   | 3192/9950 [05:50<04:13, 26.68it/s]

doc_id=tt23765492 has None genre!


 32%|███████████████████████████████████████████████                                                                                                   | 3206/9950 [05:50<03:31, 31.85it/s]

doc_id=tt4432124 has None genre!


 35%|███████████████████████████████████████████████████                                                                                               | 3482/9950 [06:17<11:46,  9.16it/s]

doc_id=tt31637517 has None genre!


 39%|████████████████████████████████████████████████████████▌                                                                                         | 3856/9950 [06:50<07:04, 14.34it/s]

doc_id=tt12963502 has None genre!


 39%|█████████████████████████████████████████████████████████▏                                                                                        | 3901/9950 [06:54<08:10, 12.33it/s]

doc_id=tt29768342 has None genre!


 47%|████████████████████████████████████████████████████████████████████▊                                                                             | 4689/9950 [08:11<06:10, 14.20it/s]

doc_id=tt29867105 has None genre!


 48%|█████████████████████████████████████████████████████████████████████▊                                                                            | 4758/9950 [08:17<06:49, 12.67it/s]

doc_id=tt27140032 has None genre!


 59%|██████████████████████████████████████████████████████████████████████████████████████▊                                                           | 5916/9950 [09:52<06:16, 10.72it/s]

doc_id=tt31107449 has None genre!


 60%|██████████████████████████████████████████████████████████████████████████████████████▉                                                           | 5929/9950 [09:53<04:43, 14.18it/s]

doc_id=tt14962296 has None genre!


 61%|████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 6037/9950 [10:04<03:37, 17.97it/s]

doc_id=tt7985982 has None genre!


 73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 7221/9950 [11:42<01:01, 44.31it/s]

doc_id=tt14549284 has None genre!


 76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 7528/9950 [12:08<02:46, 14.53it/s]

doc_id=tt1546032 has None genre!


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 7984/9950 [12:45<01:28, 22.33it/s]

doc_id=tt31181287 has None genre!


 81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 8016/9950 [12:48<02:01, 15.90it/s]

doc_id=tt14223750 has None genre!


 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                          | 8176/9950 [13:00<00:22, 79.96it/s]

doc_id=tt24950660 has None genre!
doc_id=tt31183803 has None genre!
doc_id=tt31123081 has None genre!
doc_id=tt21418340 has None genre!
doc_id=tt20215356 has None genre!


 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 8573/9950 [13:27<01:41, 13.59it/s]

doc_id=tt27863908 has None genre!


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                 | 8733/9950 [13:40<00:42, 28.74it/s]

doc_id=tt15073568 has None genre!
doc_id=tt31450459 has None genre!
doc_id=tt23783950 has None genre!


 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 8846/9950 [13:49<00:42, 25.78it/s]

doc_id=tt11481690 has None genre!


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 9348/9950 [14:22<00:23, 25.79it/s]

doc_id=tt0199066 has None genre!


 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 9615/9950 [14:43<00:10, 30.47it/s]

doc_id=tt28529522 has None genre!


 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 9837/9950 [14:55<00:03, 33.61it/s]

doc_id=tt30982784 has None genre!


 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 9862/9950 [14:57<00:05, 15.34it/s]

doc_id=tt0319762 has None genre!


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9950/9950 [15:04<00:00, 11.00it/s]


In [8]:
# One FastText sentence vector per document text, stacked into a single array.
# NOTE(review): get_query_embedding runs preprocess_text again on texts that the
# data loader already preprocessed - confirm this double pass is intended.
embeddings = np.array([ft_model.get_query_embedding(doc_text) for doc_text in tqdm(X)])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9916/9916 [14:53<00:00, 11.10it/s]


In [10]:
# Persist the clustering inputs: document embeddings, encoded labels and the
# matching genre names.
# NOTE(review): base_path is an absolute, machine-specific directory that must
# already exist; consider deriving it from a configurable data directory.
base_path = '/Users/divar/University/term-8/information-retrieval/imdb-mir-system/Logic/data/clustering'
np.save(f'{base_path}/embeddings.npy', embeddings)
np.save(f'{base_path}/labels.npy', y)
# The with-block closes the file on exit, so no explicit close() is needed.
with open(f'{base_path}/document_labels.json', 'w') as f:
    json.dump(document_labels, f)