Reference notebook this analysis is based on:
https://www.kaggle.com/yufengdev/bbc-text-categorization/notebook

In [1]:
# conda install nltk

In [1]:
import pandas as pd
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/iramzy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
bbc_articles = pd.read_csv("data/bbc-text.csv")

In [3]:
# bbc_articles.head()
bbc_articles['text'][0]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also being built-in to high

In [4]:
bbc_articles['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

##### Tokenization for multiple purposes

In [5]:
# Import from the public `text` module: the private `stop_words` module path
# is not available in newer scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import nltk
import string

In [6]:
# The stop-word list is a separate NLTK corpus from 'punkt'; fetch it here so
# this cell also runs in a fresh environment.
nltk.download('stopwords', quiet=True)

# Fill any blank fields. Assign the result back rather than calling
# fillna(inplace=True) on the attribute-accessed column, which can operate on
# a temporary copy and leave the frame unchanged.
bbc_articles['category'] = bbc_articles['category'].fillna("")

# A missing article body would make " ".join(...) raise a TypeError, so blank
# bodies are replaced with empty strings as well.
all_text = bbc_articles['text'].fillna("")

words = nltk.word_tokenize(" ".join(all_text.tolist()))

nltk_stopwords = stopwords.words('english')
# Set copy for O(1) membership tests while filtering every token.
stopword_lookup = set(nltk_stopwords)
# Keep alphabetic, non-stop-word tokens longer than two characters.
unigrams = [i for i in words if i not in stopword_lookup and i.isalpha() and len(i) > 2]

##### NLTK is not always the best choice though!

##### You can try spaCy next time.

In [7]:
len(unigrams)

458124

##### Wordcloud on unigrams

In [8]:
# Requires the third-party `wordcloud` package:
# !pip install wordcloud
from wordcloud import WordCloud, STOPWORDS

# Render all unigrams as one space-separated string into a 2000x1000 cloud
# on a white background, skipping wordcloud's built-in stop words.
unigram_text = " ".join(unigrams)
cloud_renderer = WordCloud(
    width=2000,
    height=1000,
    background_color='white',
    stopwords=STOPWORDS,
)
wordcloud2 = cloud_renderer.generate(unigram_text)

ModuleNotFoundError: No module named 'wordcloud'

In [9]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(wordcloud2, interpolation="bilinear", aspect='auto')
plt.axis('off')
plt.show()

NameError: name 'wordcloud2' is not defined

##### Can we learn more from bigrams and trigrams?

In [None]:
from collections import Counter

# Build adjacent word pairs from the filtered unigram list and count them.
# NOTE(review): nltk.bigrams/trigrams presumably return one-shot iterators, so
# `bigrams` and `trigrams` are likely exhausted once counted -- confirm.
bigrams = nltk.bigrams(unigrams)
bigrams_counter = Counter(bigrams)

# Same for adjacent word triples.
trigrams = nltk.trigrams(unigrams)
trigrams_counter = Counter(trigrams)

In [None]:
unigrams[:10]

In [None]:
# Number of distinct bigrams, then the 20 most frequent with their counts.
print(len(bigrams_counter))
print('\n')
for pair, count in bigrams_counter.most_common(20):
    print((pair, count))

In [None]:
# Number of distinct trigrams, then the 20 most frequent with their counts.
print(len(trigrams_counter))
print('\n')
for triple, count in trigrams_counter.most_common(20):
    print((triple, count))

##### Pipeline

Scikit-learn provides a pipeline utility to help automate machine learning workflows. Pipelines are very common in machine learning systems, since there is a lot of data to manipulate and many data transformations to apply. We will therefore use a pipeline to train each classifier.

https://towardsdatascience.com/multi-label-text-classification-with-scikit-learn-30714b7819c5

##### What do we need in order to define a pipeline?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
# Import from the public `text` module: the private `stop_words` module path
# is not available in newer scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

import string
import re
import spacy
from spacy.lang.en import English
parser = English()

from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
# Union of the NLTK, scikit-learn and wordcloud English stop-word lists.
STOPLIST = set(stopwords.words('english')) | set(ENGLISH_STOP_WORDS) | set(STOPWORDS)
# Punctuation tokens to drop after tokenization: every ASCII punctuation
# character plus a few multi-character / typographic symbols. Both the opening
# and the closing curly quote are listed (the closing one used to appear twice
# while the opening one was missing).
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”"]
class CleanTextTransformer(TransformerMixin):
    """Pipeline step that normalises raw text with ``cleanText``.

    The step is stateless: ``fit`` does nothing and ``transform`` applies
    ``cleanText`` to every document it is given.
    """

    def transform(self, X, **transform_params):
        """Return a list with ``cleanText`` applied to each text in ``X``."""
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns ``self`` so the step can be chained."""
        return self

    def get_params(self, deep=True):
        """Return an empty dict: this transformer has no parameters."""
        # This must be a method of the class. It was previously dedented to
        # module level, so instances never exposed get_params().
        return {}
    
def cleanText(text):
    """Trim the text, turn line breaks into spaces and lowercase it.

    Leading/trailing whitespace is stripped first; every remaining "\\n" and
    "\\r" is then replaced by a single space before lowercasing.
    """
    cleaned = text.strip()
    for line_break in ("\n", "\r"):
        cleaned = cleaned.replace(line_break, " ")
    return cleaned.lower()
def tokenizeText(sample):
    """Tokenize ``sample`` with the spaCy ``parser`` and return cleaned lemmas.

    Each token is replaced by its lowercased, stripped lemma (pronouns, which
    carry the placeholder lemma "-PRON-", keep their lowercase surface form).
    Tokens found in ``STOPLIST`` or ``SYMBOLS`` are removed, as are tokens that
    end up empty, e.g. the whitespace tokens produced by runs of spaces.

    Returns a list of strings suitable as a vectorizer ``tokenizer``.
    """
    tokens = []
    for tok in parser(sample):
        if tok.lemma_ != "-PRON-":
            lemma = tok.lemma_.lower().strip()
        else:
            lemma = tok.lower_
        if not lemma:
            # NOTE(review): presumably only happens when the pipeline has no
            # lemmatizer; fall back to the lowercase surface form -- confirm.
            lemma = tok.lower_.strip()
        # Skip empty strings (whitespace tokens), stop words and punctuation.
        if lemma and lemma not in STOPLIST and lemma not in SYMBOLS:
            tokens.append(lemma)
    return tokens

##### Alternative cleaning:
https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

In [None]:
# Both vectorizers share the custom tokenizer and use 1- to 3-grams.
shared_vectorizer_options = {"tokenizer": tokenizeText, "ngram_range": (1, 3)}
count_vect = CountVectorizer(**shared_vectorizer_options)
tfidf_vect = TfidfVectorizer(**shared_vectorizer_options)

# One independent linear SVM per feature representation.
count_clf, tfidf_clf = LinearSVC(), LinearSVC()

##### How about other classifiers?
https://www.kaggle.com/paul92s/linear-svc-classifier

In [None]:
def _make_text_pipeline(vectorizer, classifier):
    """Chain text cleaning, the given vectorizer and the given classifier."""
    return Pipeline([
        ('cleanText', CleanTextTransformer()),
        ('vectorizer', vectorizer),
        ('clf', classifier)
    ])


# Same three-step layout for both feature representations.
count_pipe = _make_text_pipeline(count_vect, count_clf)

tfidf_pipe = _make_text_pipeline(tfidf_vect, tfidf_clf)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw article texts, targets the category labels.
X, y = bbc_articles['text'], bbc_articles['category']

# Hold out 33% of the articles for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=2018
)

In [None]:
X_train.sample(5)

In [None]:
# Train the bag-of-words pipeline, then score it on the held-out articles.
count_preds = count_pipe.fit(X_train, y_train).predict(X_test)
count_accuracy = accuracy_score(y_test, count_preds)
print("Accuracy:", count_accuracy)

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one
# explicit, sorted label list for both the matrix and the tick labels so the
# axes always describe the right classes.
category_labels = sorted(bbc_articles['category'].unique())
count_conf_mat = confusion_matrix(y_test, count_preds, labels=category_labels)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(count_conf_mat, annot=True, fmt='d',
            xticklabels=category_labels, yticklabels=category_labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Train the TF-IDF pipeline, then score it on the held-out articles.
# `tfidf_transform` holds whatever fit() returns.
tfidf_transform = tfidf_pipe.fit(X_train, y_train)
tfidf_preds = tfidf_pipe.predict(X_test)
tfidf_accuracy = accuracy_score(y_test, tfidf_preds)
print("Accuracy:", tfidf_accuracy)

In [None]:
# confusion_matrix orders its rows/columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance. Use one
# explicit, sorted label list for both the matrix and the tick labels so the
# axes always describe the right classes.
category_labels = sorted(bbc_articles['category'].unique())
tfidf_conf_mat = confusion_matrix(y_test, tfidf_preds, labels=category_labels)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(tfidf_conf_mat, annot=True, fmt='d',
            xticklabels=category_labels, yticklabels=category_labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

##### POS

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
# Print, for each token of a sample sentence: text, lemma, coarse and fine
# POS tags, dependency label, shape, and the is_alpha / is_stop flags.
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    annotations = (token.pos_, token.tag_, token.dep_,
                   token.shape_, token.is_alpha, token.is_stop)
    print("{", token.text, "-->: ", token.lemma_, "}", *annotations)

In [None]:
# Same token annotations, this time for the first article in the dataset.
first_article_text = bbc_articles.loc[0, 'text']
doc = nlp(first_article_text)
for token in doc:
    annotations = (token.pos_, token.tag_, token.dep_,
                   token.shape_, token.is_alpha, token.is_stop)
    print("{", token.text, "-->: ", token.lemma_, "}", *annotations)

##### NER

In [None]:
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
# doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
# List every recognised entity with its label.
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

In [None]:
# Entities recognised in the first article of the dataset.
first_article_text = bbc_articles.loc[0, 'text']
doc = nlp(first_article_text)
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)

##### Topic Modelling

In [None]:
import spacy
from spacy.lang.en import English

# `tokenize` below only calls `parser`, so a blank English pipeline is all that
# is needed. The former `spacy.load('en')` call discarded its result and
# relied on a shortcut name that newer spaCy releases no longer resolve.
parser = English()
def tokenize(text):
    """Split ``text`` into lowercase tokens for topic modelling.

    Whitespace-only tokens are dropped, URL-like tokens become 'URL', tokens
    starting with '@' become 'SCREEN_NAME', and everything else is kept in its
    lowercase form.
    """
    lda_tokens = []
    for token in parser(text):
        surface = token.orth_
        if surface.isspace():
            continue
        if token.like_url:
            normalised = 'URL'
        elif surface.startswith('@'):
            normalised = 'SCREEN_NAME'
        else:
            normalised = token.lower_
        lda_tokens.append(normalised)
    return lda_tokens

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return WordNet's ``morphy`` base form of ``word``, or ``word`` itself
    when ``morphy`` yields ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [None]:
# Fetch the NLTK stop-word corpus and keep the English list as a set.
nltk.download('stopwords')
english_stopword_list = nltk.corpus.stopwords.words('english')
en_stop = set(english_stopword_list)

In [None]:
def prepare_text_for_lda(text):
    """Tokenize ``text`` and return the lemmas used as LDA input.

    Only tokens longer than four characters that are not in ``en_stop`` are
    kept; each surviving token is mapped through ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [None]:
bbc_articles = pd.read_csv("data/bbc-text.csv")

In [None]:
import random

# Seed the sampler so the same subset of articles is chosen on every run.
random.seed(2018)

# Keep roughly 1% of the articles as LDA input. The random draw is made first
# so that only the selected articles are tokenized and lemmatized.
text_data = []
for article in bbc_articles['text']:
    if random.random() > .99:
        text_data.append(prepare_text_for_lda(article))

In [None]:
import os
import pickle

from gensim import corpora

# Map each token to an integer id, then encode every document as bag-of-words.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Make sure the output directory exists before writing, and close the pickle
# file deterministically instead of leaving the handle open.
os.makedirs('models', exist_ok=True)
with open('models/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('models/dictionary.gensim')

In [None]:
import gensim

NUM_TOPICS = 5
# Fixed random_state so the learned topics are reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=15, random_state=2018
)
ldamodel.save('models/model5.gensim')
# Four highest-weighted words for each topic.
topics = ldamodel.print_topics(num_words=4)

In [None]:
# Print each topic followed by a separator line.
topic_separator = "------------------------------------------------------------------------------------------"
for topic_entry in topics:
    print(topic_entry)
    print(topic_separator)

In [None]:
import pyLDAvis
try:
    # Newer pyLDAvis releases ship the gensim adapter as `gensim_models`.
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis

dictionary = gensim.corpora.Dictionary.load('models/dictionary.gensim')
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open('models/corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('models/model5.gensim')
lda_display = gensim_vis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

Further reading:

https://nlpoverview.com

https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/