In [104]:
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# Deciding which embedding to use
possible_word_vectors = (50, 100, 200, 300)
word_vectors = possible_word_vectors[0]

file_name = f'glove.6B.{word_vectors}d.txt'

filepath = '../data/'
pretrained_embedding = os.path.join(filepath, file_name)

    
embeddings_index = {}
with open(pretrained_embedding, "rb") as f:
    for line in f:
        values = line.split()
        word = values[0].decode("utf-8") 
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [261]:
# Getting the data
cats = ['alt.atheism', 'sci.space']
# After try a multiclass example
# cats = ['alt.atheism', 'talk.religion.misc',
#         'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)

X_train = newsgroups_train['data']
y_train = newsgroups_train['target']

X_test = newsgroups_test['data']
y_test = newsgroups_test['target']

In [266]:
class EmbeddingVectorizer(object):
    """
    Follows the scikit-learn API
    Transform each document in the average
    of the embeddings of the words in it
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 50
        
    def fit(self, X, y):
        return self

    def transform(self, X):
        """
        Find the embedding vector for each word in the dictionary
        and take the mean for each document
        """
        # Renaming it just to make it more understandable 
        documents = X
        embedded_docs = []
        for document in documents:
            # For each document
            # Consider the mean of all the embeddings
            embedded_document = []
            for words in document:

                for w in words:
                    if w in self.word2vec:
                        embedded_word = self.word2vec[w]
                    else:
                        embedded_word = np.zeros(self.dim)
                    embedded_document.append(embedded_word
            embedded_docs.append(np.mean(embedded_document, axis=0))

        return embedded_docs


In [263]:
# Creating the embedding
e = EmbeddingVectorizer(embeddings_index)
X_train_embedded = e.transform(X_train)

# Train the classifier
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
rf.fit(X_train_embedded, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [264]:
X_test_embedded = e.transform(X_test)
predictions = rf.predict(X_test_embedded)

In [268]:
print('AUC score: ', roc_auc_score(predictions, y_test))
confusion_matrix(predictions, y_test)

AUC score:  0.7405204936377006


array([[224,  88],
       [ 95, 306]])