## **Data Pre-Processing**

In [1]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string

In [27]:
# Core stopwords, kept as a set; entries are listed alphabetically.
stopwords = {
    'a', 'and', 'as', 'be', 'but', 'for', 'have', 'i', 'in', 'is',
    'it', 'not', 'of', 'on', 'that', 'the', 'this', 'to', 'with', 'you',
}

# Other unused stopwords - can be added into stopwords as seen fit.
# Listed alphabetically and grouped by initial letter so duplicates are easy to spot.
other_stopwords = {
    # a
    'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an', 'any', 'are', 'aren', "aren't", 'at',
    # b
    'because', 'been', 'before', 'being', 'below', 'between', 'both',
    # c
    'can', 'couldn', "couldn't",
    # d
    'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', 'doing', 'don', "don't", 'down', 'during',
    # e - f
    'each', 'few', 'from', 'further',
    # h
    'hadn', "hadn't", 'has', 'hasn', "hasn't", 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers',
    'herself', 'him', 'his', 'how',
    # i - l
    'if', 'into', 'isn', "isn't", "it's", 'its', 'itself', 'just', 'll',
    # m
    'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself',
    # n
    'needn', "needn't", 'no', 'nor', 'now',
    # o
    'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    # r - s
    're', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', 'so', 'some', 'such',
    # t
    't', 'than', "that'll", 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they',
    'those', 'through', 'too',
    # u - v
    'under', 'until', 'up', 'very',
    # w
    'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where', 'which', 'while',
    'who', 'whom', 'why', 'will', 'won', "won't", 'wouldn',
    # y
    'y', "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves',
}


# Define dictionary for contraction replacements (keys are matched against lowercased text).
# "can't" and "won't" are listed explicitly because the generic "n't" -> " not"
# rule would otherwise turn them into "ca not" and "wo not".
contractions_dict = {
    "what's": "what is",
    "can't": "can not",
    "won't": "will not",
    "n't": " not",
    "i'm": "i am",
    "'re": " are",
    "'ve": " have",
    "'d": " would",
    "'ll": " will"
}

# Define regexps
# contractions_re is built from the dictionary keys so that every match has a
# replacement in contractions_dict. Previously the pattern matched only the single
# characters ' - . ! which are not keys, so no contraction was ever expanded.
# Keys are tried longest-first, and the trailing \b stops a suffix such as "'d"
# from matching the start of a quoted word like "'do".
contractions_re = re.compile(
    "(?:"
    + "|".join(re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True))
    + r")\b"
)
# Anything that is not an ASCII letter, digit or whitespace character.
symbols_re = re.compile(r"[^A-Za-z0-9\s]")
# Runs of one or more whitespace characters.
spaces_re = re.compile(r"\s+")

# Instantiate stemmer (NLTK SnowballStemmer constructed with 'english').
# NOTE(review): no active code in the visible cells calls `stemmer`; the only
# reference is the commented-out stemming line inside preprocess().
stemmer = SnowballStemmer('english')

def preprocess(text):
    """Normalise one raw document into lowercase, single-spaced text.

    Steps, in order:
      1. lowercase the input;
      2. replace every ``contractions_re`` match with its entry in
         ``contractions_dict`` (a match with no entry is left unchanged);
      3. replace every ``symbols_re`` match with a space;
      4. collapse whitespace runs to one space and strip both ends.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned ``str``.
    """
    def _lookup(match):
        # Fall back to the matched text itself when there is no replacement.
        found = match.group(0)
        return contractions_dict.get(found, found)

    lowered = text.lower()
    substituted = contractions_re.sub(_lookup, lowered)
    without_symbols = symbols_re.sub(" ", substituted)

    # Stopword removal and stemming are currently disabled.
    # TODO: Decision about adding stemming? Results in suffixes and prefixes being removed (e.g. device becomes devic)
    # without_symbols = " ".join([stemmer.stem(word) for word in without_symbols.split() if word not in stopwords])

    return spaces_re.sub(" ", without_symbols).strip()

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    An empty or whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

In [40]:
# Load the 20 Newsgroups data with subset='all', removing headers, footers and quotes.
# TODO: Do we need quotes? To keep them, use the alternative call:
# dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers'))
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
samples, labels = dataset.data, dataset.target

In [41]:
# Run every raw sample through preprocess(), keeping the original order.
preprocessed_samples = list(map(preprocess, samples))

In [43]:
# Tokenize and vectorize data
# The custom tokenizer replaces TfidfVectorizer's default token_pattern. Passing
# token_pattern=None makes that explicit and avoids scikit-learn's
# "token_pattern will not be used" UserWarning.
vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
vectors = vectorizer.fit_transform(preprocessed_samples)

In [45]:
# Show one document (index 3) at each stage of the pipeline.
print("Original sample:\n", samples[3])
print("\nPreprocessed sample:\n", preprocessed_samples[3])
# `tokenized_samples` is not defined in any cell of this notebook, so a fresh
# kernel would raise NameError here; tokenize the preprocessed text directly.
print("\nTokenized sample:\n", tokenize(preprocessed_samples[3]))

Original sample:
 
Think!

It's the SCSI card doing the DMA transfers NOT the disks...

The SCSI card can do DMA transfers containing data from any of the SCSI devices
it is attached when it wants to.

An important feature of SCSI is the ability to detach a device. This frees the
SCSI bus for other devices. This is typically used in a multi-tasking OS to
start transfers on several devices. While each device is seeking the data the
bus is free for other commands and data transfers. When the devices are
ready to transfer the data they can aquire the bus and send the data.

On an IDE bus when you start a transfer the bus is busy until the disk has seeked
the data and transfered it. This is typically a 10-20ms second lock out for other
processes wanting the bus irrespective of transfer time.


Preprocessed sample:
 think it s the scsi card doing the dma transfers not the disks the scsi card can do dma transfers containing data from any of the scsi devices it is attached when it wants to an