In [1]:
import os
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer

In [2]:
def stem(stemmer, sentence):
    '''
        Split the sentence on whitespace, run every token through stemmer.stem and
        return the stemmed tokens joined by single spaces.
    '''
    return ' '.join(map(stemmer.stem, sentence.split()))

def clean(text):
    '''
        Drop every character that is not a lowercase letter (a-z) or whitespace and return the rest.
        Uppercase letters are dropped as well, so lowercase the text beforehand to keep them.
    '''
    disallowed = re.compile(r'[^a-z\s]')
    return disallowed.sub('', text)

In [3]:
# List every entry (files and sub-folders alike) of the current working directory
files = os.listdir(os.curdir)
files

['enron_labels.npy',
 'new_data',
 '~$esis proposal.docx',
 'enron edited.py',
 'clean_data.ipynb',
 '~WRL1209.tmp',
 'enron-spam filter',
 '~WRL0369.tmp',
 'enron_features_matrix.npy',
 'enron-spamfilter.py',
 '.ipynb_checkpoints',
 'dict_enron.npy']

In [12]:
# Create the stemmer that the cleaning loop passes to stem() for every file.
# PorterStemmer is used here; another stemmer exposing .stem(word), such as the
# imported LancasterStemmer, can be substituted.
stemmer = PorterStemmer()

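# Read all the files and clean them
* Cleaning involves stemming and special-character removal.
* The files are assumed to be inside a folder whose name is stored in `FOLDER_NAME`.
* Each `.txt` file is overwritten in place with its cleaned text, so keep a copy of the raw data.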

In [13]:
FOLDER_NAME = 'new_data'
folders = ['ham', 'spam', 'social', 'promotion']

# For each category: read every .txt file, clean and stem it, then write it back.
# WARNING: files are overwritten in place, so the raw text is lost and re-running
# this cell stems text that has already been stemmed.
for folder in folders:
    path = f'{FOLDER_NAME}/{folder}'
    files = os.listdir(path)
    print('#Files: ', len(files))

    for file in tqdm(files):
        if not file.endswith('.txt'):
            continue  # Leave anything that is not a text file untouched

        file_path = f'{path}/{file}'
        try:
            # UTF-8 also covers plain ASCII files
            with open(file_path, encoding='utf-8') as f:
                text = f.read()
        except UnicodeDecodeError:
            # Fall back for files that are not valid UTF-8. ISO-8859-1 maps every
            # byte to a character, so decoding itself cannot fail here.
            with open(file_path, encoding='ISO-8859-1') as f:
                text = f.read()

        # The write-back only runs after a successful read. Any other error
        # (missing file, permissions, ...) propagates instead of overwriting
        # this file with text left over from a previous iteration.
        text = clean(text.lower())
        text = stem(stemmer, text)

        # Explicit encoding so the result does not depend on the platform default
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)

  0%|          | 23/16545 [00:00<01:12, 228.39it/s]

#Files:  16545


100%|██████████| 16545/16545 [01:38<00:00, 168.50it/s]
  0%|          | 26/17157 [00:00<01:07, 255.19it/s]

#Files:  17157


100%|██████████| 17157/17157 [01:21<00:00, 210.64it/s]
  2%|▏         | 25/1341 [00:00<00:05, 246.46it/s]

#Files:  1341


100%|██████████| 1341/1341 [00:05<00:00, 230.02it/s]
  1%|          | 16/1673 [00:00<00:11, 140.02it/s]

#Files:  1673


100%|██████████| 1673/1673 [00:09<00:00, 174.64it/s]
