In [1]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [2]:
cats = ['alt.atheism', 'sci.space', 'misc.forsale', 'rec.autos']

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats, remove=('headers', 'footers', 'quotes'))

In [4]:
group_names = {0:'atheism', 1:'forsale', 2:'autos', 3:'space'}

atheism_docs, forsale_docs, autos_docs, space_docs = [], [], [], []
for idx, label in enumerate(newsgroups_train.target):
    if label == 0:
        atheism_docs.append(idx)
    elif label == 1:
        forsale_docs.append(idx)
    elif label == 2:
        autos_docs.append(idx)
    elif label == 3:
        space_docs.append(idx)
print(len(atheism_docs), len(forsale_docs), len(autos_docs), len(space_docs))

480 585 594 593


In [5]:
len(newsgroups_train.data), len(newsgroups_train.target)

(2252, 2252)

# Vectorize Data

In [6]:
import re
docs = [doc.replace('\n',' ').replace('\t',' ').strip() for doc in newsgroups_train.data]
docs = [re.sub(' {2,}',' ', doc) for doc in docs]

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

vectors = [nlp(doc) for doc in docs]

In [8]:
vectors = [vector.vector for vector in vectors]

# Per Class Average

In [9]:
import numpy as np

def get_class_average(vectors, class_docs):
    class_vectors = np.vstack((vectors[class_docs[0]], vectors[class_docs[1]]))
    for i in range(2,len(class_docs)):
        tmp = vectors[class_docs[i]]
        if tmp.shape[0] > 0:
            class_vectors = np.vstack((class_vectors, tmp))
    return class_vectors, np.mean(atheism_vectors, axis=0)
    
atheism_vectors = np.vstack((vectors[atheism_docs[0]], vectors[atheism_docs[1]]))
#for i in range(2,len(atheism_docs)):
#    tmp = vectors[atheism_docs[i]]
    
#    try:
#        atheism_vectors = np.vstack((atheism_vectors, tmp))
#    except:
#        print(atheism_vectors.shape, tmp.shape)
#atheism_vectors.shape
#np.vstack((vectors[atheism_docs[0]], vectors[atheism_docs[1]])).mean(axis=0).shape

(25, 384) (0,)
(26, 384) (0,)
(56, 384) (0,)
(65, 384) (0,)
(87, 384) (0,)
(135, 384) (0,)
(155, 384) (0,)
(174, 384) (0,)
(257, 384) (0,)
(341, 384) (0,)
(380, 384) (0,)
(408, 384) (0,)


In [11]:
np.mean(atheism_vectors, axis=0).shape

(384,)

In [10]:
atheism_vectors, atheism_avg = get_class_average(vectors, atheism_docs)
forsale_vectors, forsale_avg = get_class_average(vectors, forsale_docs)
autos_vectors, autos_avg = get_class_average(vectors, autos_docs)
space_vectors, space_avg = get_class_average(vectors, space_docs)

# Test

In [11]:
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats, remove=('headers', 'footers', 'quotes'))

In [13]:
test_vectors = [nlp(doc) for doc in newsgroups_train.data]

In [12]:
group_names = {0:'atheism', 1:'forsale', 2:'autos', 3:'space'}

atheism_test_docs, forsale_test_docs, autos_test_docs, space_test_docs = [], [], [], []
for idx, label in enumerate(newsgroups_train.target):
    if label == 0:
        atheism_test_docs.append(idx)
    elif label == 1:
        forsale_test_docs.append(idx)
    elif label == 2:
        autos_test_docs.append(idx)
    elif label == 3:
        space_test_docs.append(idx)
print(len(atheism_test_docs), len(forsale_test_docs), len(autos_test_docs), len(space_test_docs))

480 585 594 593


In [24]:
from sklearn.metrics.pairwise import cosine_similarity
atheism_to_forsale = cosine_similarity(atheism_avg, forsale_avg)
autos_to_forsale = cosine_similarity(autos_avg, forsale_avg)
print(atheism_to_forsale, autos_to_forsale)

[[0.9999999]] [[0.9999999]]


