In [2]:
import re
import numpy as np
import matplotlib.pyplot as plt
import scipy

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize 

import keras
from keras.datasets import mnist
from keras.utils import to_categorical
from keras import models
from keras import layers

from sklearn import linear_model, tree, neighbors, model_selection, svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import forest


def StemmedTokenizer():
    """Build a tokenizer callable that stems every token of a document.

    One English ``SnowballStemmer`` (constructed with
    ``ignore_stopwords=True``) is created here and reused by every call of
    the returned function.

    Returns:
        A one-argument callable. Given a text, it splits it with
        ``word_tokenize`` and returns the list of stemmed tokens in their
        original order.
    """
    snowball = SnowballStemmer("english", ignore_stopwords=True)

    def tokenize(articles):
        stems = []
        for token in word_tokenize(articles):
            stems.append(snowball.stem(token))
        return stems

    return tokenize

def LemmaTokenizer():
    """Build a tokenizer callable that lemmatizes every token of a document.

    A single ``WordNetLemmatizer`` is created here and shared by all calls
    of the returned function.

    Returns:
        A one-argument callable. Given a text, it splits it with
        ``word_tokenize`` and returns the list of lemmatized tokens in their
        original order.
    """
    lemmatizer = WordNetLemmatizer()

    def tokenize(articles):
        # Each token is passed to lemmatize() on its own, with no extra arguments.
        return list(map(lemmatizer.lemmatize, word_tokenize(articles)))

    return tokenize

def pipeline(train, test):
    """Fit a TF-IDF + decision-tree pipeline and print its test accuracy.

    Args:
        train: object exposing ``.data`` (documents) and ``.target`` (labels)
            used for fitting.
        test: object exposing ``.data`` and ``.target`` used for scoring.

    Returns:
        None. The accuracy (fraction of test predictions equal to
        ``test.target``) is printed, not returned.
    """
    steps = [
        # Stemmed tokens, English stop words removed, weighted by TF-IDF.
        ('tfidf', TfidfVectorizer(stop_words='english', tokenizer=StemmedTokenizer())),
        ('dtc', tree.DecisionTreeClassifier()),
    ]
    model = Pipeline(steps)
    model = model.fit(train.data, train.target)

    hits = model.predict(test.data) == test.target
    print('Pipeline acc: %s' % np.mean(hits))

def grid_search_method(train_data, train_target, test_data, test_target, model, params):
    """Run a grid search over ``params`` for ``model`` and print a summary.

    The search uses all available cores (``n_jobs=-1``) and logs progress
    verbosely (``verbose=10``).

    Args:
        train_data: features the grid search is fitted on.
        train_target: labels matching ``train_data``.
        test_data: held-out features scored with the fitted search.
        test_target: labels matching ``test_data``.
        model: estimator handed to ``GridSearchCV``.
        params: parameter grid handed to ``GridSearchCV``.

    Returns:
        None. Best cross-validation score, best parameters and the held-out
        accuracy are printed.
    """
    search = GridSearchCV(model, params, n_jobs=-1, verbose=10)
    search = search.fit(train_data, train_target)

    print("Best score: %s" % search.best_score_)
    print("Best param: %s" % search.best_params_)

    # Held-out accuracy: share of test predictions that match the labels.
    test_hits = search.predict(test_data) == test_target
    print('Accuracy of best model: %s' % np.mean(test_hits))

def print_history(history):
    """Plot training and validation loss per epoch.

    Args:
        history: object whose ``history`` attribute is a dict containing the
            per-epoch sequences ``'loss'`` and ``'val_loss'`` (for example
            the object returned by a Keras ``fit`` call that was given
            validation data).

    Raises:
        KeyError: if ``'loss'`` or ``'val_loss'`` is missing from
            ``history.history``.
    """
    metrics = history.history
    # Size the x axis from 'loss', the series that is actually plotted.
    # The accuracy entry must not be used for this: its key name is not
    # stable ('acc' in older Keras, 'accuracy' in newer releases), and it is
    # absent when no accuracy metric was compiled.
    epochs = range(len(metrics['loss']))
    plt.plot(epochs, metrics['loss'], 'bo', label='Training loss')
    plt.plot(epochs, metrics['val_loss'], 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [20]:
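# Grid search over k-NN hyper-parameters ('n_neighbors' and 'algorithm' are
# KNeighborsClassifier options, so the estimator must be a k-NN classifier).
# parameters = {
#         'n_neighbors': [1, 15],  
#         'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute')
#     }
# grid_search_method(train_tfidf, train.target, test_tfidf, test.target, neighbors.KNeighborsClassifier(), parameters)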

# parameters = {
#         'C': [1., 3., 7., 10., 15., 20., 25., 30., 36., 41.],
#         'gamma': [.01, .05, .1, .15, .2, .25, .3, .4, .8, 1., 1.5, 2., 4.]
#     }
# grid_search_method(train_tfidf, train.target, test_tfidf, test.target, svc, parameters)

In [5]:
# Load the 20 Newsgroups train/test splits and convert them to TF-IDF matrices.
# Alternative kept for experiments: strip headers/footers from the posts.
# train = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers', 'footers'))
# test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers'))
train = fetch_20newsgroups(subset='train', shuffle=True)
test = fetch_20newsgroups(subset='test', shuffle=True)
# Matches underscores and digit runs; only used by the commented-out
# preprocessor option below.
cleanup_regex = r'(_)|(\d[0-9.]*)'
vectorizer = TfidfVectorizer(stop_words='english',
#                                     tokenizer = StemmedTokenizer(),
                                    tokenizer = LemmaTokenizer(),
#                                     min_df = 3,
#                                     max_df = 0.8,
                                    use_idf = True,
                                    lowercase = True)
#                                     preprocessor = lambda x: re.sub(cleanup_regex, ' ', x.lower()))
# fit_transform learns the vocabulary/IDF weights and vectorizes in one pass,
# so the training corpus is tokenized and lemmatized only once instead of
# twice (fit + transform). The resulting matrix is the same.
train_tfidf = vectorizer.fit_transform(train.data)
# The test split is only transformed, using the vocabulary learned on train.
test_tfidf = vectorizer.transform(test.data)
print("Shape: ", train_tfidf.shape)

Shape:  (11314, 179367)


In [18]:
# Random forest baseline on the TF-IDF features.
# random_state pins the randomness of the forest so the printed accuracy is
# the same on every Restart & Run All.
# NOTE(review): `sklearn.ensemble.forest` is a module path that newer
# scikit-learn releases no longer expose; the class is also available as
# `sklearn.ensemble.RandomForestClassifier` - confirm against the installed version.
forestClass = forest.RandomForestClassifier(n_estimators = 900, max_depth=100, n_jobs=-1, random_state=42)
forestClass.fit(train_tfidf, train.target)
print('Random forest acc: %s' % np.mean(forestClass.predict(test_tfidf) == test.target))

Random forest acc: 0.7728359001593202


In [65]:
# Single decision tree baseline on the TF-IDF features.
# random_state makes the tree construction (and so the printed accuracy)
# repeatable between runs.
decision_tree = tree.DecisionTreeClassifier(random_state=42)
decision_tree.fit(train_tfidf, train.target)
print('Tree acc: %s' % np.mean(decision_tree.predict(test_tfidf) == test.target))

AVC acc: 0.5665161975570897


In [14]:
# Support vector classifier on the TF-IDF features.
svc = svm.SVC(C=20., gamma=0.1)  # acc 0.83 21. 0.1
svc.fit(train_tfidf, train.target)
# Label fixed from 'AVC' to 'SVC' so the printed line names the model it reports on.
print('SVC acc: %s' % np.mean(svc.predict(test_tfidf) == test.target))

AVC acc: 0.8317843866171004


In [11]:
# k-nearest-neighbours baseline (k = 16) on the TF-IDF features.
knn = neighbors.KNeighborsClassifier(
    n_neighbors=16,
    leaf_size=2,
    metric='minkowski',
    n_jobs=-1,
)
knn.fit(train_tfidf, train.target)
# Accuracy = share of test documents whose predicted label equals the true one.
knn_hits = knn.predict(test_tfidf) == test.target
print('KNN acc: %s' % np.mean(knn_hits))

KNN acc: 0.6975570897503983


In [6]:
# One-hot encode the integer class labels for categorical cross-entropy.
train_labels_oneHot = to_categorical(train.target)
test_labels_oneHot = to_categorical(test.target)

# One hidden layer of 128 units on the TF-IDF features, softmax over the classes.
network = models.Sequential()
network.add(layers.Dense(128, input_shape=(train_tfidf.shape[1],)))
# LeakyReLU is a layer, not an activation identifier: Keras warns when a layer
# instance is passed as `activation=`. Stacking it after a linear Dense layer
# computes the same function without the warning.
network.add(layers.LeakyReLU(alpha=0.3))
network.add(layers.Dense(len(train.target_names), activation='softmax'))
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here, so the
# validation curve is not independent of the final reported accuracy.
history = network.fit( train_tfidf,
                      train_labels_oneHot,
                      epochs=12,
                      batch_size=128,
                      validation_data=(test_tfidf, test_labels_oneHot))
# evaluate() returns [loss, accuracy] for the compiled metrics; index 1 is accuracy.
print("Final accuracy: ", network.evaluate(test_tfidf, test_labels_oneHot, verbose=0)[1] )
print_history(history)


  identifier=identifier.__class__.__name__))


Train on 11314 samples, validate on 7532 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

KeyboardInterrupt: 