In [0]:
import os
import json
import matplotlib.pyplot as plt
import numpy as np
import nltk 

from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import reuters, stopwords
from string import punctuation

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


In [0]:
fname_x = os.path.join("WOS11967/X.txt")
fname_y = os.path.join("WOS11967/Y.txt")

In [0]:
with open(fname_x, encoding="utf-8") as f:
    content_x = f.readlines()
with open(fname_y) as fk:
    content_y = fk.readlines()

## Lemmatisation

In [0]:
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' 
    
def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(word_tokenize(text))]

def preprocess_text(text):
    # Input: str, i.e. document/sentence
    # Output: list(str) , i.e. list of lemmas
    return [word for word in lemmatize_sent(text) 
            if word not in stoplist_combined
            and word.isalpha() ]
         

In [0]:
stopwords_json_en = set(json.load(open('en.json')))
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)
stopwords_time = set(["--","jan","january","feb","february","mar","march","apr","april","may","jun","june","jul","july",
             "aug","august","sept","september","oct","october","nov","november","dec","december"])
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct, stopwords_time)   

In [0]:
label = np.matrix(content_y, dtype=int)
label = np.transpose(label)
print(label.shape)
np.random.seed(7)
train_text, test_text, y_train, y_test = train_test_split(content_x, label, test_size=0.2, random_state=4)

(11967, 1)


In [0]:
y_train = np.array([int(y) for y in y_train])
y_test = np.array([int(y) for y in y_test])

In [0]:
X_train_tokens = [preprocess_text(text) for text in train_text]
X_test_tokens = [preprocess_text(text) for text in test_text]

## TF-IDF

In [0]:
def const(tmp):
    return tmp

In [0]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=const, preprocessor=const, min_df=3, max_df = 0.9)
x_train_tfidf = tfidf_vectorizer.fit_transform(X_train_tokens).toarray()
x_test_tfidf = tfidf_vectorizer.transform(X_test_tokens).toarray()

## Learning

In [0]:
def grid_search(train_x, train_y, test_x, test_y, parameters, pipeline):
    grid_search_tune = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='accuracy', cv=5, verbose=1)
    grid_search_tune.fit(train_x, train_y)

    print("Best parameters set:")
    print(grid_search_tune.best_estimator_.steps)

    print("Best score: %0.4f" % grid_search_tune.best_score_)
    print("Applying best classifier on test data:")
    best_clf = grid_search_tune.best_estimator_
    predictions = best_clf.predict(test_x)
    print(accuracy_score(test_y, predictions))
    print(classification_report(test_y, predictions))
    return best_clf

## Naive Bayes

In [0]:
%%capture output
pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', MultinomialNB(fit_prior=True, class_prior=None)),
            ])
parameters = {
                'vect__max_df': [0.3, 0.5, 0.75, 0.9, None],
                'vect__min_df': [1, 3, 5],
                'vect__max_features': [3000, 5000, None],
                'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                'clf__alpha': [1e-1, 1e-3],
} 
model_bayes_bag = grid_search(X_train_tokens , y_train, X_test_tokens, y_test, parameters, pipeline)

In [0]:
output.show()

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best parameters set:
[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=1,
                ngram_range=(1, 3),
                preprocessor=<function const at 0x7f19869a84d0>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f19869a84d0>, vocabulary=None)), ('clf', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))]
Best score: 0.7612
Applying best classifier on test data:
0.7560568086883876
              precision    recall  f1-score   support

           0       0.81      0.72      0.76        72
           1       0.63      0.63      0.63        57
           2       0.76      0.85      0.80        52
           3       0.96      0.85

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 39.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 62.7min
[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed: 68.3min finished


In [0]:
pipeline = Pipeline([
                ('vect', TfidfVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', MultinomialNB(fit_prior=True, class_prior=None)),
            ])
model_bayes_tfidf = grid_search(X_train_tokens , y_train, X_test_tokens, y_test, parameters, pipeline)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 39.4min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 61.7min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 89.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 121.6min
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed: 134.7min finished


Best parameters set:
[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.3, max_features=None,
                min_df=5, ngram_range=(1, 3), norm='l2',
                preprocessor=<function const at 0x7f7c845419e0>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f7c845419e0>, use_idf=True,
                vocabulary=None)), ('clf', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))]
Best score: 0.7593
Applying best classifier on test data:
0.7518796992481203
              precision    recall  f1-score   support

           0       0.78      0.68      0.73        72
           1       0.67      0.58      0.62        57
           2       0.76      0.85      0.80        52
           3     

**ACCURACY WITH NAIVE BAYES: 75.2%**

## SVM

In [0]:
pipeline = Pipeline([
                ('tfidf', CountVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', LinearSVC()),
            ])
parameters = {
                'tfidf__max_df': [0.3, 0.5, 0,75, 0.9],
                'tfidf__min_df': [3, None],
                'tfidf__max_features': [3000, 5000, None],
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                "clf__C": [0.01, 0.1, 1],
                "clf__class_weight": ['balanced', None],
} 
model_svc_count = grid_search(X_train_tokens , y_train, X_test_tokens, y_test, parameters, pipeline)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 38.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 61.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 88.3min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 121.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 158.6min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 179.5min finished


Best parameters set:
[('tfidf', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.3, max_features=None, min_df=3,
                ngram_range=(1, 3),
                preprocessor=<function const at 0x7f7c845419e0>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f7c845419e0>, vocabulary=None)), ('clf', LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0))]
Best score: 0.8336
Applying best classifier on test data:
0.841687552213868
              precision    recall  f1-score   support

           0       0.87      0.82      0.84        72
           1       0.83      0.77

In [0]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(tokenizer=const, preprocessor=const)),
                ('clf', LinearSVC()),
            ])
parameters = {
                'tfidf__max_df': [0.3, 0.5, 0,75, 0.9],
                'tfidf__min_df': [3, None],
                'tfidf__max_features': [3000, 5000, None],
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
                "clf__C": [0.01, 0.1, 1],
                "clf__class_weight": ['balanced', None],
} 
model_svc_tfidf = grid_search(X_train_tokens , y_train, X_test_tokens, y_test, parameters, pipeline)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 21.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 38.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 63.5min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 94.5min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 131.3min
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed: 146.4min finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=3, ngram_range=(1, 3), norm='l2',
                preprocessor=<function const at 0x7f19869a84d0>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function const at 0x7f19869a84d0>, use_idf=True,
                vocabulary=None)), ('clf', LinearSVC(C=1, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0))]
Best score: 0.8413
Applying best classifier on test data:
0.8454469507101086
              precision    recall  f1-score   support

   

**ACCURACY WITH SVM: 84.5%**