In [3]:
# importing required libraries

import spacy
import os

In [None]:
# installing required pipelines
# The leading `!` runs this line as a shell command; it downloads the
# "en_core_web_sm" pipeline that is loaded with spacy.load() further down.
# NOTE(review): `python` is whatever the shell resolves on PATH — confirm it is
# the same environment the notebook kernel runs in.
!python -m spacy download en_core_web_sm

In [5]:
# loading the English CPU pipeline from spaCy; `nlp` is a module-level object
# shared by the preprocessing functions defined below
nlp = spacy.load("en_core_web_sm")

In [6]:
# preprocessing
def pre_pro(text):
    """Lowercase `text` and drop stop words, punctuation and whitespace tokens.

    The lowercased text is tokenized with the module-level spaCy pipeline
    `nlp`. The text of every token that survives the three filters is kept,
    and the kept pieces are returned joined by single spaces.
    """
    kept = []

    # run the pipeline on the lowercased input and walk its tokens
    for token in nlp(text.lower()):
        # a token is discarded as soon as any one of the filters matches
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept.append(token.text)

    # back to a plain string
    return ' '.join(kept)

In [7]:
# automating reading files, preprocessing and saving them
def process_files(inp, op):
    """Preprocess every `.txt` file found in `inp` and save the result in `op`.

    Each regular file in `inp` whose name ends with '.txt' is read as UTF-8,
    passed through `pre_pro`, and written (UTF-8) to a file with the same
    name inside `op`. All other directory entries are ignored.

    Parameters
    ----------
    inp : str
        Path of the folder holding the input text files.
    op : str
        Path of the folder receiving the processed files; created if it
        does not exist yet.

    Returns
    -------
    None
    """
    # creating output folder
    os.makedirs(op, exist_ok=True)

    for file_name in os.listdir(inp):
        if not file_name.endswith('.txt'):
            continue

        # getting path of input file
        ip_path = os.path.join(inp, file_name)

        # os.listdir() also returns sub-directories; one named like "x.txt"
        # would make open() raise, so only regular files are processed
        if not os.path.isfile(ip_path):
            continue

        # getting path of output file
        op_path = os.path.join(op, file_name)

        # reading the file with utf8 encoding
        with open(ip_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # passing the text to the preprocessing function
        processed = pre_pro(text)

        # writing the processed text into new file
        with open(op_path, 'w', encoding='utf-8') as file:
            file.write(processed)

In [None]:
# folders the batch run reads from and writes to
input_folder, output_folder = "text", "preprocessed_text2"

process_files(inp=input_folder, op=output_folder)

In [14]:
# function to print text after each pre-processing step

def print_preprocessed_files(input_folder, target_files):
    """Print every intermediate preprocessing stage for the selected files.

    For each name in `target_files` that exists under `input_folder`, the
    file is read as UTF-8 and its text is printed after each stage:
    original, lowercased, tokenized, stop words removed, punctuation removed,
    blank tokens removed, and the final joined sentence. A name that does not
    exist gets a one-line notice instead.
    """
    for file_name in target_files:
        path = os.path.join(input_folder, file_name)

        # guard clause: report the missing file and move on to the next name
        if not os.path.exists(path):
            print(f"File {file_name} does not exist in the specified folder.")
            continue

        with open(path, 'r', encoding='utf-8') as handle:
            raw = handle.read()

        print(f"\n--- File: {file_name} ---")
        print("\nOriginal Text:\n", raw)

        lowered = raw.lower()
        print("\nAfter Lowercasing:\n", lowered)

        # token texts produced by the module-level spaCy pipeline
        words = [tok.text for tok in nlp(lowered)]
        print("\nAfter Tokenization:\n", ' '.join(words))

        # stop-word flag is looked up per token string in the pipeline vocabulary
        without_stop = [word for word in words if not nlp.vocab[word].is_stop]
        print("\nAfter Removing Stopwords:\n", ' '.join(without_stop))

        # punctuation flag is looked up the same way
        without_punct = [word for word in without_stop if not nlp.vocab[word].is_punct]
        print("\nAfter Removing Punctuations:\n", ' '.join(without_punct))

        # keep only strings that still have content once stripped
        non_blank = [word for word in without_punct if word.strip()]
        print("\nAfter Removing Blank Spaces:\n", ' '.join(non_blank))

        final_sent = ' '.join(non_blank)
        print("\nFinal Processed Sentence:\n", final_sent)

input_folder = "text"  # Update this to the correct path
# sample files whose step-by-step preprocessing is printed
target_files = [f"file{number}.txt" for number in (1, 2, 4, 5, 7)]

print_preprocessed_files(input_folder=input_folder, target_files=target_files)



--- File: file1.txt ---

Original Text:
 Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.

After Lowercasing:
 loving these vintage springs on my vintage strat. they have a good tension and great stability. if you are floating your bridge and want the most out of your springs than these are the way to go.

After Tokenization:
 loving these vintage springs on my vintage strat . they have a good tension and great stability . if you are floating your bridge and want the most out of your springs than these are the way to go .

After Removing Stopwords:
 loving vintage springs vintage strat . good tension great stability . floating bridge want springs way .

After Removing Punctuations:
 loving vintage springs vintage strat good tension great stability floating bridge want springs way

After Removing Blank Spaces:
 loving vintage springs vintage