In [17]:
%reload_ext nb_black

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split as tts

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Importing Data to be Classified

In [2]:
# regex syntax
# Data is obtained from this kaggle link:
# https://www.kaggle.com/deepak711/4-subject-data-text-classification
PATH = "./subject/physics_biology_geography_accounts subject training data for text classification/train_data_final"

DOC_PATTERN = r".*\.txt"
CAT_PATTERN = r"([\w_\w]+)/.*"

corpus = CategorizedPlaintextCorpusReader(PATH, DOC_PATTERN, cat_pattern=CAT_PATTERN)

<IPython.core.display.Javascript object>

In [3]:
# Splitting up the data by documents and categories
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]
categories = [corpus.categories(fileid)[0] for fileid in corpus.fileids()]

<IPython.core.display.Javascript object>

In [4]:
categories

['accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',
 'accounts',

<IPython.core.display.Javascript object>

## Data Cleaning

In [5]:
# Creating a function to help clean up data
def preprocess(docs):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    preprocessed = []
    for doc in docs:
        tokenized = word_tokenize(doc)
        cleaned = [
            stemmer.stem(lemmatizer.lemmatize(token.lower()))
            for token in tokenized
            if not token.lower() in stopwords.words("english")
            if token.isalpha()
        ]
        untokenized = " ".join(cleaned)
        preprocessed.append(untokenized)
    return preprocessed

<IPython.core.display.Javascript object>

In [6]:
preprocessed = preprocess(docs)

<IPython.core.display.Javascript object>

In [7]:
X_train, X_test, y_train, y_test = tts(docs, categories, test_size=0.2)

<IPython.core.display.Javascript object>

## Classifying Data with Random Forest

In [8]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("rf", RandomForestClassifier())])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

<IPython.core.display.Javascript object>

## Classification Results

In [9]:
pred = pipe.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

    accounts       1.00      1.00      1.00        57
     biology       0.92      1.00      0.96       135
   geography       1.00      0.25      0.40        16
     physics       0.99      1.00      1.00       150

    accuracy                           0.97       358
   macro avg       0.98      0.81      0.84       358
weighted avg       0.97      0.97      0.96       358



<IPython.core.display.Javascript object>

In [10]:
scores = cross_val_score(pipe, preprocessed, categories, cv=10, scoring="f1_macro")

<IPython.core.display.Javascript object>

In [11]:
scores.mean()

0.8122362258333558

<IPython.core.display.Javascript object>

## Classifying Data with KNearestNeighbors

In [13]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("knn", KNeighborsClassifier())])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Classification Results

In [14]:
pred = pipe.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

    accounts       1.00      1.00      1.00        57
     biology       0.97      0.99      0.98       135
   geography       1.00      0.56      0.72        16
     physics       0.97      1.00      0.99       150

    accuracy                           0.98       358
   macro avg       0.99      0.89      0.92       358
weighted avg       0.98      0.98      0.98       358



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
scores = cross_val_score(pipe, preprocessed, categories, cv=10, scoring="f1_macro")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
scores.mean()

0.8607658937850328

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Classifying Data with SVC

In [18]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("svc", SVC())])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', deg

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Classification Results

In [19]:
pred = pipe.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

    accounts       1.00      1.00      1.00        57
     biology       0.94      1.00      0.97       135
   geography       1.00      0.50      0.67        16
     physics       0.99      0.99      0.99       150

    accuracy                           0.97       358
   macro avg       0.98      0.87      0.91       358
weighted avg       0.98      0.97      0.97       358



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
scores = cross_val_score(pipe, preprocessed, categories, cv=10, scoring="f1_macro")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
scores.mean()

0.8391105524011515

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Conclustions

The models are all able to predict the subjects of the documents pretty accurately, all having an f1 score of 0.80 or higher. The KNearestNeighbor Classifier performed the best with having an f1 score of around 0.86. Some ways to improve this model could be changing hyper parameters on each of the models.