In [4]:
import os
import re
import gensim.models.keyed_vectors as KeyedVectors


def clean_text(text):
    """
    Cleans text by removing HTML tags, non-alphanumeric characters,
    extra whitespace, and converting to lowercase.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """

    cleaned_text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    cleaned_text = re.sub(r'[^\w\s]', ' ', cleaned_text)  # Replace non-alphanumeric with spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra whitespace
    cleaned_text = cleaned_text.lower()  # Convert to lowercase
    return cleaned_text

def split_into_words(text):
    """
    Splits text into a list of words.

    Args:
        text (str): The text to split.

    Returns:
        List[str]: The list of words.
    """

    return text.split()

def split_into_entences(text):
    """
    Splits text into a list of sentences.

    Args:
        text (str): The text to split.

    Returns:
        List[str]: The list of sentences.
    """

    return re.split(r'[!?.]\s+', text)  # Split on sentence endings

def get_word_vector(word, word2vec_model):
    """
    Looks up the word vector in the provided Word2Vec model.

    Args:
        word (str): The word to get the vector for.
        word2vec_model (gensim.models.keyed_vectors.KeyedVectors): The Word2Vec model.

    Returns:
        np.ndarray: The word vector (if found), or None (if not found).
    """

    try:
        return word2vec_model[word]
    except KeyError:
        # Handle out-of-vocabulary (OOV) words (e.g., assign default vector or skip)
        return None  # You can also return a zero vector here (np.zeros(word2vec_model.vector_size))

def create_review_vectors(text, word2vec_model):
    """
    Creates a list of word vectors for the cleaned text, handling OOV words.

    Args:
        text (str): The cleaned text review.
        word2vec_model (gensim.models.keyed_vectors.KeyedVectors): The Word2Vec model.

    Returns:
        List[np.ndarray]: The list of word vectors for the review.
    """

    cleaned_text = clean_text(text)
    words = cleaned_text.split()
    review_vectors = [get_word_vector(word, word2vec_model) for word in words]
    return review_vectors

# Define paths to positive and negative folders
pos_train_folder = "aclImdb/train/pos"
neg_train_folder = "aclImdb/train/neg"
pos_test_folder = "aclImdb/test/pos"
neg_test_folder = "aclImdb/test/neg"

# Loop through each folder (positive and negative)
for folder in [pos_train_folder, neg_train_folder, pos_test_folder, neg_test_folder]:
    folder_path = os.path.join(os.getcwd(), folder)  # Get full folder path

    # Loop through each text file in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Check for .txt files
            file_path = os.path.join(folder_path, filename)

            # Read the text file
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Clean the text
            cleaned_text = clean_text(text)

            # Split into words or sentences (choose one)
            # Uncomment the desired option and comment out the other

            # Split into words
            words = split_into_words(cleaned_text)

            # Split into sentences
            # sentences = split_into_sentences(cleaned_text)
            # print(f"Sentences for {filename}: {sentences}")
            # Create review vectors (list of word vectors)
            review_vectors = create_review_vectors(words, word2vec_model)

            # Now you have the review_vectors list for each review, which you can use for sentiment analysis
            # (e.g., by averaging the word vectors, using them as input to a machine learning model, etc.)

print("Processing completed!")


ModuleNotFoundError: No module named 'gensim.models.keyed_vectors'

In [2]:
!pip install gensim

Collecting FuzzyTM>=0.4.0 (from gensim)
  Obtaining dependency information for FuzzyTM>=0.4.0 from https://files.pythonhosted.org/packages/2d/30/074bac7a25866a2807c1005c7852c0139ac22ba837871fc01f16df29b9dc/FuzzyTM-2.0.9-py3-none-any.whl.metadata
  Using cached FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/f0/fe/b899a3d9a18c9a44a35155c79a4c152cb85990ea38ce6ab7ed73e5caa1b9/pyFUME-0.3.1-py3-none-any.whl.metadata
  Using cached pyFUME-0.3.1-py3-none-any.whl.metadata (9.7 kB)
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for simpful from https://files.pythonhosted.org/packages/9d/0e/aebc2fb0b0f481994179b2ee2b8e6bbf0894d971594688c018375e7076ea/simpful-2.12.0-py3-none-any.whl.metadata
  Using cached simpful-2.12.0-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  U