The goal of this work is to process a text dataset using Neural Networks and Deep Learning
word embedding and data analytics methods and to extract knowledge from it. Prepare a report
for this work and deposit it on moodle.

In this work you will use 20 Newsgroup dataset, but you a free to use any text data (UCI datasets
repository, kaggle, data.gouv.fr, …) informing the Professor.

The work should contains at least the following 4 parts:
1. Analysis of the text dataset
2. Text processing and Transformation
3. Apply di erent Neural Networks (NN) embedding techniques
4. Clustering and/or classi cation on the embedded data
5. Results analysis and visualisation
6. Theoretical formalism

In [26]:
# In this work you will use 20 Newsgroup dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import os
import sys
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# Analyse the dataset : the context, size, difficulties, detect the objectives.

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Load the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

# the context of the dataset
print(newsgroups_train.target_names)
print(newsgroups_train.data[0])

X_train = newsgroups_train.data
Y_train = newsgroups_train.target

X_test = newsgroups_test.data
Y_test = newsgroups_test.target


['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [28]:
# analyse the size of the dataset
print(len(X_train))
print(len(X_test))

2257
1502


In [29]:
# Text Processing and Transformation
# For this part, you should use scikit-learn and you can follow the tutorial:
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#tutorial-setup

# Assign a fixed integer id to each word occurring in any document of the training set (for instance by building a dictionary from words to integer indices)

# For each document #i, count the number of occurrences of each word w and store it in X[i, j] as the value of feature #j where j is the index of word w in the dictionary
def build_X(data, dictionary):
    X = np.zeros((len(data), len(dictionary)), dtype=np.int)
    for i, doc in enumerate(data):
        for word in doc.split():
            X[i, dictionary[word]] += 1
    return X


def build_dictionary(data):
    dictionary = {}
    for doc in data:
        for word in doc.split():
            if word not in dictionary:
                dictionary[word] = len(dictionary)
    return dictionary


dictionary_train = build_dictionary(X_train)
X_bow_train = build_X(X_train, dictionary_train)

dictionary_test = build_dictionary(X_test)
X_bow_test = build_X(X_test, dictionary_test)

In [30]:
# def tokenize(data, dictionary):
#     vectorizer = CountVectorizer(vocabulary=dictionary)
#     X = vectorizer.fit_transform(data)
#     return X

# X_cv_train = tokenize(X_train, dictionary_train)


# print(X_cv_train)

In [31]:
def tfidf(X):
    transformer = TfidfTransformer()
    X = transformer.fit_transform(X)
    return X


X_tfidf_train = tfidf(X_bow_train)

print(X_tfidf_train)

  (0, 77)	0.1501782300675281
  (0, 76)	0.1501782300675281
  (0, 75)	0.1501782300675281
  (0, 74)	0.103699862561056
  (0, 73)	0.12962948804367816
  (0, 72)	0.1501782300675281
  (0, 71)	0.1501782300675281
  (0, 70)	0.2851886015699785
  (0, 69)	0.11378139920465326
  (0, 68)	0.07044791025673265
  (0, 67)	0.1501782300675281
  (0, 66)	0.11378139920465326
  (0, 65)	0.12962948804367816
  (0, 64)	0.05877826376929799
  (0, 63)	0.1501782300675281
  (0, 62)	0.1501782300675281
  (0, 61)	0.080491850362952
  (0, 60)	0.035036250633131864
  (0, 59)	0.1501782300675281
  (0, 58)	0.09774341765798598
  (0, 57)	0.02293400626289806
  (0, 56)	0.06833288469445191
  (0, 55)	0.1330396798598353
  (0, 54)	0.08004648953595733
  (0, 53)	0.029526882905000593
  :	:
  (2256, 437)	0.04674830046617758
  (2256, 434)	0.10723383440124384
  (2256, 433)	0.07377977334740078
  (2256, 378)	0.06554613623435271
  (2256, 339)	0.06900722582663284
  (2256, 272)	0.04838252718062753
  (2256, 270)	0.037741501674035165
  (2256, 265)	0.04

In [32]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def build_doc2vec(data):
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(data)]
    model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
    return model


def build_doc2vec_X(data, model):
    X = np.zeros((len(data), 5))
    for i, doc in enumerate(data):
        X[i] = model.infer_vector(doc.split())
    return X


model = build_doc2vec(X_bow_train)

X_Doc2Vec = build_doc2vec_X(X_bow_train, model)
print(X_Doc2Vec)

KeyboardInterrupt: 

In [None]:
# BERT model
def build_bert_X(data):
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    X = np.zeros((len(data), 768))
    for i, doc in enumerate(data):
        X[i] = tokenizer.encode(doc, add_special_tokens=True, max_length=768, pad_to_max_length=True)
    return X


X_bert = build_bert_X(X_bow_train)
print(X_bert)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[[ 101. 2013. 1024. ...    0.    0.    0.]
 [ 101. 2013. 1024. ...    0.    0.    0.]
 [ 101. 2013. 1024. ...    0.    0.    0.]
 ...
 [ 101. 2013. 1024. ... 1012. 4895.  102.]
 [ 101. 2013. 1024. ... 5653. 2075.  102.]
 [ 101. 2013. 1024. ...    0.    0.    0.]]
