In [1]:
%reload_ext nb_black

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split as tts

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score

<IPython.core.display.Javascript object>

## Importing Data to be Classified

In [2]:
# regex syntax
# Data is obtained from this kaggle link:
# https://www.kaggle.com/deepak711/4-subject-data-text-classification
PATH = "./subject/physics_biology_geography_accounts subject training data for text classification/train_data_final"

DOC_PATTERN = r".*\.txt"
CAT_PATTERN = r"([\w_\w]+)/.*"

corpus = CategorizedPlaintextCorpusReader(PATH, DOC_PATTERN, cat_pattern=CAT_PATTERN)

<IPython.core.display.Javascript object>

In [3]:
# Splitting up the data by documents and categories
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]
categories = [corpus.categories(fileid)[0] for fileid in corpus.fileids()]

<IPython.core.display.Javascript object>

In [4]:
categories

['accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',

<IPython.core.display.Javascript object>

## Data Cleaning

In [5]:
# Creating a function to help clean up data
def preprocess(docs):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    preprocessed = []
    for doc in docs:
        tokenized = word_tokenize(doc)
        cleaned = [
            stemmer.stem(lemmatizer.lemmatize(token.lower()))
            for token in tokenized
            if not token.lower() in stopwords.words("english")
            if token.isalpha()
        ]
        untokenized = " ".join(cleaned)
        preprocessed.append(untokenized)
    return preprocessed

<IPython.core.display.Javascript object>

In [6]:
preprocessed = preprocess(docs)

<IPython.core.display.Javascript object>

## Splitting up Data and Preprocessing

In [7]:
X_train, X_test, y_train, y_test = tts(docs, categories, test_size=0.2)

<IPython.core.display.Javascript object>

In [8]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("rf", RandomForestClassifier())])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

<IPython.core.display.Javascript object>

## Classification Results

In [9]:
pred = pipe.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

    accounts       1.00      1.00      1.00        57
     biology       0.92      1.00      0.96       135
   geography       1.00      0.25      0.40        16
     physics       0.99      1.00      1.00       150

    accuracy                           0.97       358
   macro avg       0.98      0.81      0.84       358
weighted avg       0.97      0.97      0.96       358



<IPython.core.display.Javascript object>

In [10]:
scores = cross_val_score(pipe, preprocessed, categories, cv=10, scoring="f1_macro")

<IPython.core.display.Javascript object>

In [11]:
scores.mean()

0.8122362258333558

<IPython.core.display.Javascript object>