In [86]:
from sklearn.datasets import fetch_20newsgroups

In [87]:
# Load the training split of the 20 Newsgroups corpus. The `remove` tuple asks
# scikit-learn to strip headers, footers and quoted replies from each post;
# shuffling is enabled with a fixed random_state.
data_train = fetch_20newsgroups(
    subset="train",
    shuffle=True,
    random_state=42,
    remove=("headers", "footers", "quotes"),
)

In [88]:
# Inspect the container type of the raw documents and how many there are.
print(type(data_train.data))
print(len(data_train.data))

<class 'list'>
11314


In [89]:
# Show the first raw document; repr() makes escape characters such as \n visible.
print(repr(data_train.data[0]))

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'


In [90]:
def lowercase(texts):
  """Return a new list holding the lower-cased form of every string in ``texts``."""
  lowered = []
  for text in texts:
    lowered.append(text.lower())
  return lowered

In [91]:
from string import punctuation
def remove_punctuation(texts):
  """Delete every ASCII punctuation character from each string in ``texts``.

  The characters removed are those in ``string.punctuation``. They are
  deleted outright rather than replaced by a space, so ``"e-mail"`` becomes
  ``"email"``.

  Args:
    texts: iterable of strings.

  Returns:
    A new list of strings, in input order, with punctuation removed.
  """
  # A single translation table removes all punctuation in one pass per text,
  # instead of scanning each text once for every punctuation character.
  strip_table = str.maketrans("", "", punctuation)
  return [text.translate(strip_table) for text in texts]

In [92]:
def remove_hidden_characters(texts):
  """Flatten control whitespace and drop apostrophes in every text.

  Newlines and tabs are each replaced by a single space; apostrophes are
  deleted. A new list is returned in input order.
  """
  return [
    text.replace("\n", " ").replace("\t", " ").replace("\'", "")
    for text in texts
  ]

In [93]:
def whitespace_removal(texts):
  """Trim leading and trailing whitespace from every string in ``texts``."""
  trimmed = []
  for text in texts:
    trimmed.append(text.strip())
  return trimmed

In [94]:
import nltk
# Fetch the 'punkt_tab' NLTK resource (presumably the tokenizer data that
# word_tokenize relies on -- TODO confirm against the installed NLTK version).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kamalrajanisrani/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [95]:
from nltk.tokenize import word_tokenize
# Quick sanity check of the tokenizer on a short sentence.
word_tokenize("My words are being split up")

['My', 'words', 'are', 'being', 'split', 'up']

In [96]:
def tokenise(texts):
  """Split each text into tokens with NLTK's ``word_tokenize``.

  Returns a list containing one token list per input text, in input order.
  """
  return [word_tokenize(text) for text in texts]

In [97]:
# Fetch the NLTK 'stopwords' and 'wordnet' resources (presumably used by the
# stop-word filter and the WordNet lemmatiser defined later -- verify).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kamalrajanisrani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kamalrajanisrani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [98]:
def remove_stopwords(texts, stop_words=None):
    """Drop stop words from each tokenised text.

    Args:
        texts: iterable of token sequences, one sequence per document.
        stop_words: optional iterable of words to remove. When ``None``
            (the default) NLTK's English stop-word list is used, which
            needs the NLTK ``stopwords`` corpus to be available.

    Returns:
        A list of token lists in input order. A token is removed when its
        ``str()`` form is in the stop-word collection; the comparison is an
        exact string match, so it is case-sensitive.
    """
    if stop_words is None:
        # Imported lazily so a caller-supplied list works without the corpus.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')

    # A set gives constant-time membership tests; a list would be scanned
    # linearly for every token of every document.
    stop_set = frozenset(stop_words)

    return [
        [word for word in text if str(word) not in stop_set]
        for text in texts
    ]

In [99]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

def stem_texts(texts):
    """Replace every token of every tokenised text with its Porter stem.

    Returns a list with one list of stemmed tokens per input text.
    """
    porter = PorterStemmer()
    stemmed_texts = []
    for tokens in texts:
        stemmed_texts.append([porter.stem(token) for token in tokens])
    return stemmed_texts

def lemmatise_texts(texts):
    """Replace every token of every tokenised text with its WordNet lemma.

    ``lemmatize`` is called with the token only (no part-of-speech argument).
    Returns a list with one list of lemmatised tokens per input text.
    """
    wordnet = WordNetLemmatizer()
    lemmatised_texts = []
    for tokens in texts:
        lemmatised_texts.append([wordnet.lemmatize(token) for token in tokens])
    return lemmatised_texts

In [100]:
def preprocess(texts, reduction='s'):
    """Run the full cleaning pipeline and return tokenised, reduced texts.

    The steps run in this order: lower-casing, punctuation removal,
    newline/tab/apostrophe clean-up, whitespace trimming, tokenisation and
    stop-word removal, followed by a word-reduction step.

    Args:
        texts: iterable of raw document strings.
        reduction: ``'s'`` selects stemming; any other value selects
            lemmatisation.

    Returns:
        A list of token lists, one per input document.
    """
    cleaning_steps = (
        lowercase,
        remove_punctuation,
        remove_hidden_characters,
        whitespace_removal,
        tokenise,
        remove_stopwords,
    )
    for step in cleaning_steps:
        texts = step(texts)

    # Let the caller choose between stemming and lemmatising.
    reduce_words = stem_texts if reduction == 's' else lemmatise_texts
    return reduce_words(texts)

In [101]:
def build_vocabulary(texts):
    """Assign each distinct token an integer index in first-seen order.

    Args:
        texts: iterable of token sequences.

    Returns:
        A dict mapping token -> index, where indices start at 0 and grow by
        one for every previously unseen token.
    """
    token_to_index = {}
    for tokens in texts:
        for token in tokens:
            # setdefault leaves known tokens untouched, so only a new token
            # receives the next free index (the current dict size).
            token_to_index.setdefault(token, len(token_to_index))
    return token_to_index

In [104]:
import numpy as np
def encode(texts, vocabulary=None):
    """Encode tokenised texts as a binary bag-of-words matrix.

    Args:
        texts: iterable of token sequences, one per document.
        vocabulary: optional mapping from token to column index. When
            ``None`` a vocabulary is built from ``texts`` with
            ``build_vocabulary``. Tokens absent from the vocabulary are
            ignored.

    Returns:
        A ``(features, vocabulary)`` tuple. ``features`` is an integer NumPy
        array of shape ``(number of texts, len(vocabulary))`` holding 1 where
        the token occurs in the document and 0 elsewhere; ``vocabulary`` is
        the mapping that was used.
    """
    # Materialise once so a one-shot iterable survives both the vocabulary
    # pass and the encoding pass.
    texts = list(texts)
    if vocabulary is None:
        vocabulary = build_vocabulary(texts)

    # Allocate the final matrix directly rather than building a Python
    # list-of-lists first: this avoids holding two full copies in memory and
    # keeps a well-formed (0, V) shape when there are no texts.
    features = np.zeros((len(texts), len(vocabulary)), dtype=int)
    for row, text in enumerate(texts):
        columns = [vocabulary[word] for word in text if word in vocabulary]
        if columns:
            features[row, columns] = 1

    return features, vocabulary
            

In [105]:
# Preprocess the training texts with stemming ('s') and encode them. No
# vocabulary is passed, so encode() builds one from the training texts.
train_features, train_vocabulary = encode(preprocess(data_train.data, reduction='s'))

In [106]:
# Labels for the training documents, used as the classifier's fit targets.
train_targets = data_train.target

In [107]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the encoded training features.
clf = LogisticRegression(random_state=0).fit(train_features, train_targets)

In [108]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Retrieve the test split with the same fetch settings as the training split
# (shuffle, random_state and the header/footer/quote removal).
data_test = fetch_20newsgroups(
    subset="test",
    shuffle=True,
    random_state=42,
    remove=("headers", "footers", "quotes"),
)

# Encode the test data with the training vocabulary so the feature columns
# line up with those the classifier was fitted on; test tokens that are not
# in that vocabulary are ignored by encode().
test_features, _ = encode(preprocess(data_test.data, reduction='s'), vocabulary=train_vocabulary)
test_targets = data_test.target

In [109]:
# Evaluate the classifier trained on stemmed features against the test split.
# Precision and recall are macro-averaged over the classes.
y_pred = clf.predict(test_features)
print(f"Accuracy: {accuracy_score(test_targets, y_pred)}")
print(f"Precision: {precision_score(test_targets, y_pred, average='macro')}")
print(f"Recall: {recall_score(test_targets, y_pred, average='macro')}")

Accuracy: 0.6197557089750398
Precision: 0.6188545610367717
Recall: 0.6093566476537501


In [110]:
# Re-run the pipeline with lemmatisation ('l') instead of stemming.
# NOTE: this rebinds train_features, train_vocabulary and test_features, so
# the stemmed versions are no longer available after this cell runs.
train_features, train_vocabulary = encode(preprocess(data_train.data, reduction='l'), vocabulary=None)
test_features, _ = encode(preprocess(data_test.data, reduction='l'), vocabulary=train_vocabulary)

In [111]:
# Refit the classifier on the current train_features; this rebinds `clf`.
clf = LogisticRegression(random_state=0).fit(train_features, train_targets)

# Evaluate the lemmatisation variant on the test split with the same
# accuracy / macro-precision / macro-recall metrics as the stemming run.
lemma_pred = clf.predict(test_features)
print(f"Accuracy: {accuracy_score(test_targets, lemma_pred)}")
print(f"Precision: {precision_score(test_targets, lemma_pred, average='macro')}")
print(f"Recall: {recall_score(test_targets, lemma_pred, average='macro')}")

Accuracy: 0.6153744025491238
Precision: 0.613371885022185
Recall: 0.6044157604291098
