Text Processing \n
Creating Datasets in PyTorch

In [20]:
from flask import Flask, render_template, request, redirect, url_for
import nltk
from nltk.corpus import stopwords
import random

##Data Preprocessing Techniques

In [21]:
import nltk
from nltk.corpus import stopwords

def preprocess_text(file_path):
    """
    Read a text file, convert to lowercase, and remove stop words.

    Args:
        file_path (str): The path to the text file.

    Returns:
        list: A list of processed sentences with stop words removed.
    """
    # Load stop words
    stop_words = set(stopwords.words('english'))

    # Read the file
    with open(file_path, 'r') as file:
        lines = file.readlines()

    processed_lines = []
    for line in lines:
        # Convert to lowercase
        line = line.lower()
        # Remove stop words
        words = line.split()
        filtered_words = [word for word in words if word not in stop_words]
        processed_lines.append(' '.join(filtered_words))

    return processed_lines

In [22]:
import random
from nltk.corpus import wordnet
import nltk

def synonym_replacement(sentence, n):
    """
    Replace n words in the sentence with their synonyms.

    Args:
        sentence (str): The input sentence to augment.
        n (int): The number of words to replace with synonyms.

    Returns:
        str: The augmented sentence with synonyms replaced.
    """
    words = sentence.split()
    num_replacements = min(n, len(words))  # Ensure we don't replace more words than available
    replaced_indices = set()  # Track replaced indices to avoid duplicates

    for _ in range(num_replacements):
        while True:
            word_to_replace = random.choice(words)
            index = words.index(word_to_replace)  # Get the index of the word
            
            if index not in replaced_indices:  # Ensure we don't replace the same word
                synonyms = wordnet.synsets(word_to_replace)
                if synonyms:
                    # Randomly select a synonym from the available options
                    synonym = random.choice(synonyms[0].lemmas()).name()
                    words[index] = synonym  # Replace the word
                    replaced_indices.add(index)  # Mark this index as replaced
                break  # Exit the while loop to select another word

    return ' '.join(words)

def random_word_insertion(sentence, n):
    """
    Insert n random words into the sentence.

    Args:
        sentence (str): The input sentence to augment.
        n (int): The number of random words to insert.

    Returns:
        str: The augmented sentence with random words inserted.
    """
    words = sentence.split()
    for _ in range(n):
        # Randomly select a word to insert
        random_word = random.choice(words)  # You can customize this to select from a broader vocabulary
        insert_position = random.randint(0, len(words))  # Random position to insert the word
        words.insert(insert_position, random_word)  # Insert the word

    return ' '.join(words)

# Example usage
file_path = 'sample.txt'  # Ensure 'sample.txt' exists in the working directory

# Preprocess the text
processed_text = preprocess_text(file_path)

for idx, sentence in enumerate(processed_text):
    print(f"Preprecessed Line {idx + 1}: {sentence}")

# Augment the processed text
augmented_text = []
for sentence in processed_text:
    # Perform synonym replacement
    augmented_sentence = synonym_replacement(sentence, n=2)  # Replace 2 words with synonyms
    # Perform random word insertion
    augmented_sentence = random_word_insertion(augmented_sentence, n=2)  # Insert 2 random words
    augmented_text.append(augmented_sentence)

# Print the augmented text
for idx, sentence in enumerate(augmented_text):
    print(f"Augmented Line {idx + 1}: {sentence}")

Preprecessed Line 1: be, be: question.
Preprecessed Line 2: world's stage, men women merely players.
Preprecessed Line 3: lady doth protest much, methinks.
Preprecessed Line 4: rose name would smell sweet.
Preprecessed Line 5: parting sweet sorrow.
Preprecessed Line 6: all: thine self true.
Preprecessed Line 7: course true love never run smooth.
Preprecessed Line 8: cowards die many times deaths.
Preprecessed Line 9: born great, achieve greatness, greatness thrust upon them.
Preprecessed Line 10: better part valour, discretion.
Augmented Line 1: be, be: question. be: question.
Augmented Line 2: world's stage, men adult_female players. only stage, players.
Augmented Line 3: much, lady protest doth protest much, methinks.
Augmented Line 4: rosebush name would sweet. sweet. smell sweet.
Augmented Line 5: parting Sweet Sweet sorrow. sorrow.
Augmented Line 6: ego all: thine ego true. true.
Augmented Line 7: tally course true love true never tally smooth.
Augmented Line 8: cowards die times 