In [10]:
# Importing required libraries

import spacy
import os

In [11]:
# Installing required pipelines
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.2/12.8 MB 6.6 MB/s eta 0:00:02
     - -------------------------------------- 0.3/12.8 MB 7.0 MB/s eta 0:00:02
     -- ------------------------------------- 0.8/12.8 MB 6.7 MB/s eta 0:00:02
     --- ------------------------------------ 1.0/12.8 MB 7.0 MB/s eta 0:00:02
     ---- ----------------------------------- 1.5/12.8 MB 7.3 MB/s eta 0:00:02
     ----- ---------------------------------- 1.9/12.8 MB 7.4 MB/s eta 0:00:02
     ------ --------------------------------- 2.2/12.8 MB 6.9 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 8.4 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 7.7 MB/s eta 0:00:02
     ------------ -----------------------


[notice] A new release of pip is available: 23.0.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
# Loading the English CPU pipeline ("en_core_web_sm") from spaCy.
# The model must already be installed (see the download command in the cell above);
# the resulting `nlp` object is what pre_pro() calls to tokenize text.
nlp = spacy.load("en_core_web_sm")

In [13]:
# Preprocessing
def pre_pro(text):
    """Return a cleaned, space-separated version of `text`.

    The text is lowercased, run through the module-level spaCy pipeline `nlp`,
    and every token flagged as a stop word, punctuation or whitespace is dropped.

    Args:
        text: The raw string to preprocess.

    Returns:
        A single string made of the surviving token texts joined by one space.
    """
    # Lowercase first, then let spaCy tokenize the lowercased text
    parsed = nlp(text.lower())

    kept = []
    for token in parsed:
        # Skip stop words, punctuation and blank/whitespace tokens
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept.append(token.text)

    # Glue the remaining tokens back into one string
    return ' '.join(kept)

In [14]:
# Automating reading the files, preprocessing them and saving the results
def process_files(inp, op):
    """Preprocess every `.txt` file found directly in `inp` and save it under `op`.

    Each matching file is read as UTF-8, passed through `pre_pro`, and written
    (UTF-8) to a file with the same name inside `op`. Entries in `inp` whose
    names do not end in `.txt` are ignored.

    Args:
        inp: Path of the folder holding the raw text files.
        op: Path of the folder that receives the preprocessed files; it is
            created if it does not exist yet.
    """
    # Make sure the destination folder exists (no error if it already does)
    os.makedirs(op, exist_ok=True)

    for entry in os.listdir(inp):
        # Only plain-text files are processed
        if not entry.endswith('.txt'):
            continue

        # Same file name on both sides, different parent folder
        source_path = os.path.join(inp, entry)
        target_path = os.path.join(op, entry)

        # Read the raw content
        with open(source_path, 'r', encoding='utf-8') as reader:
            raw = reader.read()

        # Clean it with the preprocessing function defined above
        cleaned = pre_pro(raw)

        # Persist the cleaned content
        with open(target_path, 'w', encoding='utf-8') as writer:
            writer.write(cleaned)

In [15]:
# Input and output folder paths.
# NOTE(review): these are absolute paths on a local F: drive, so this cell only
# runs as-is on a machine with the same directory layout — adjust before reuse.
input_folder = "F:\\IIITD\\Sem 8\\IR\\CSE508_Winter2024_A1_2020513\\text"
output_folder = "F:\\IIITD\\Sem 8\\IR\\CSE508_Winter2024_A1_2020513\\preprocessed_text"

# Preprocess every .txt file in input_folder and write the results into output_folder
process_files(input_folder, output_folder)

In [22]:
# displaying files 1 to n before and after preprocessing
def display(input_folder, output_folder, n):
    """Print the original and preprocessed text of `file1.txt` .. `file{n}.txt`.

    For each index from 1 to `n`, the file `file{i}.txt` is read (UTF-8) from
    both folders and the two contents are printed one after the other, each
    followed by a dashed separator line.

    Args:
        input_folder: Folder containing the original text files.
        output_folder: Folder containing the preprocessed text files.
        n: Number of files to show, starting from `file1.txt`.
    """
    for idx in range(1, n + 1):
        # Both folders use the same file name for a given index
        name = f'file{idx}.txt'
        raw_path = os.path.join(input_folder, name)
        clean_path = os.path.join(output_folder, name)

        # Read both versions before printing anything for this index
        with open(raw_path, 'r', encoding='utf-8') as raw_file:
            original = raw_file.read()
        with open(clean_path, 'r', encoding='utf-8') as clean_file:
            cleaned = clean_file.read()

        report = (
            f"Text {idx}",
            original,
            "------------------------",
            f"Preprocessed_Text {idx}",
            cleaned,
            "------------------------\n",  # trailing blank line between files
        )
        for line in report:
            print(line)


In [23]:
# Input and output folder paths (the same values assigned in the cell that runs process_files).
# NOTE(review): absolute paths on a local F: drive — adjust before running elsewhere.
input_folder = "F:\\IIITD\\Sem 8\\IR\\CSE508_Winter2024_A1_2020513\\text"
output_folder = "F:\\IIITD\\Sem 8\\IR\\CSE508_Winter2024_A1_2020513\\preprocessed_text"

# Display file1.txt through file5.txt before and after preprocessing
display(input_folder, output_folder,5)

Text 1
Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
------------------------
Preprocessed_Text 1
loving vintage springs vintage strat good tension great stability floating bridge want springs way
------------------------

Text 2
Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.
------------------------
Preprocessed_Text 2
works great guitar bench mat rugged abuse care care makes organization workspace easier screws wo roll color good
------------------------

Text 3
We use these for everything from our acoustic bass down to our ukuleles. I know there is a smaller model available for ukes, violins, etc.; we haven't yet ordered those, but these will work on smaller instruments i