# Working with Text Data

In [7]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Types of Data Represented as Strings

## Example Application: Sentiment Analysis of Movie Reviews

> Download the data from: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [1]:
!tree -dL 2 data/aclImdb

data/aclImdb
├── test
│   ├── neg
│   └── pos
└── train
    ├── neg
    └── pos

6 directories


In [2]:
!rm -r data/aclImdb/train/unsup

rm: cannot remove 'data/aclImdb/train/unsup': No such file or directory


In [3]:
from sklearn.datasets import load_files

In [4]:
# load_files returns a bunch holding the training texts and the training labels
reviews_train = load_files("data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target

print(f"type of text_train: {type(text_train)}")
print(f"length of text_train: {len(text_train)}")
print(f"text_train[1]:\n{text_train[1]}")

type of text_train: <class 'list'>
length of text_train: 25000
text_train[1]:
b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive Decisio

In [5]:
# Replace the HTML line breaks embedded in the raw (bytes) reviews with a space.
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

In [8]:
# np.bincount tallies how many training reviews carry each integer label.
print(f"Samples per class (training): {np.bincount(y_train)}")

Samples per class (training): [12500 12500]


In [9]:
# Load the test reviews and report their number and class balance
reviews_test = load_files("data/aclImdb/test/")
text_test = reviews_test.data
y_test = reviews_test.target
print(f"Number of documents in test data: {len(text_test)}")
print(f"Samples per class (test): {np.bincount(y_test)}")
# Replace the HTML line breaks in the raw (bytes) reviews with a space
text_test = [review.replace(b"<br />", b" ") for review in text_test]

Number of documents in test data: 25000
Samples per class (test): [12500 12500]


## Representing Text Data as a Bag of Words

<img src="images/bag.png"></img>

### Applying Bag-of-Words to a Toy Dataset

In [None]:
# Two-line toy corpus used to illustrate the bag-of-words representation.
# Each word of the second line appears once; a doubled "the" would skew the
# count of "the" and introduce a spurious "the the" n-gram.
bards_words = ["The fool doth think he is wise,",
               "but the wise man knows himself to be a fool"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a default CountVectorizer on the toy corpus; the learned vocabulary is
# available afterwards as vect.vocabulary_. The bare fit() call on the last
# line also displays the fitted estimator as the cell's output.
vect = CountVectorizer()
vect.fit(bards_words)

In [None]:
# Report the size and the content of the vocabulary learned from the toy corpus
print(f"Vocabulary size: {len(vect.vocabulary_)}")
print(f"Vocabulary content:\n {vect.vocabulary_}")

In [None]:
# Encode the toy corpus with the fitted vectorizer and show the matrix summary
bag_of_words = vect.transform(bards_words)
print(f"bag_of_words: {bag_of_words!r}")

In [None]:
# Convert to a dense array so the individual counts of the toy corpus can be inspected.
bag_of_words.toarray()

### Bag-of-Words for Movie Reviews

In [None]:
# Learn the vocabulary from the training reviews, then encode them as counts
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)

print(f"X_train:\n{X_train!r}")

In [None]:
# Peek at the learned vocabulary at a few different positions
feature_names = vect.get_feature_names_out()

print(f"Number of features: {len(feature_names)}")
print(f"First 20 features:\n{feature_names[:20]}")
print(f"Features 20010 to 20030:\n{feature_names[20010:20030]}")
print(f"Every 2000th feature:\n{feature_names[::2000]}")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validation of logistic regression on the count features
scores = cross_val_score(
    LogisticRegression(max_iter=10000),
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
print(f"Mean cross-validation accuracy: {scores.mean():.2f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over the logistic-regression parameter C with 5-fold cross-validation
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print(f"Best cross-validation score: {grid.best_score_:.2f}")
print("Best parameters: ", grid.best_params_)

In [None]:
# Encode the test reviews with the fitted vectorizer and score the tuned model
X_test = vect.transform(text_test)
print(f"{grid.score(X_test, y_test):.2f}")

In [None]:
# Rebuild the count features with min_df=5 to prune rarely occurring tokens
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)

print(f"X_train with min_df: {X_train!r}")

In [None]:
# Peek at the pruned vocabulary at a few different positions
feature_names = vect.get_feature_names_out()

print(f"First 50 features:\n{feature_names[:50]}")
print(f"Features 20010 to 20030:\n{feature_names[20010:20030]}")
print(f"Every 700th feature:\n{feature_names[::700]}")

In [None]:
# Repeat the grid search over param_grid on the current X_train
grid = GridSearchCV(
    LogisticRegression(max_iter=10000), param_grid, cv=5, n_jobs=-1, verbose=1
).fit(X_train, y_train)

print(f"Best cross-validation score: {grid.best_score_:.2f}")

## Stopwords

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(f"Number of stop words: {len(ENGLISH_STOP_WORDS)}")
# ENGLISH_STOP_WORDS is an unordered set, so its iteration order is arbitrary
# and can differ between kernel sessions; sort it first so that the sample
# printed here is reproducible.
print(f"Every 10th stopword:\n{sorted(ENGLISH_STOP_WORDS)[::10]}")

In [None]:
# stop_words="english" selects the built-in stop-word list;
# an augmented list of our own could be passed instead.
vect = CountVectorizer(min_df=5, stop_words="english")
vect.fit(text_train)

X_train = vect.transform(text_train)
print(f"X_train with stop words:\n{X_train!r}")

In [None]:
# Grid search over param_grid on the stop-word-filtered count features
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=10000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print(f"Best cross-validation score: {grid.best_score_:.2f}")

## Rescaling the Data with tf–idf

$$\text{tfidf}(w, d) = \text{tf} \cdot \left(\log\left(\frac{N + 1}{N_w + 1}\right) + 1\right)$$

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# Chain tf-idf vectorization and logistic regression so the grid search
# can be run directly on the raw review texts.
pipe = make_pipeline(
    TfidfVectorizer(min_df=5, norm=None),
    LogisticRegression(max_iter=10000),
)

# The "logisticregression__" prefix addresses the classifier step of the pipeline
param_grid = {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1).fit(text_train, y_train)

print(f"Best cross-validation score: {grid.best_score_:.2f}")

In [None]:
# Take the fitted tf-idf step out of the best pipeline and re-encode the training texts
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
X_train = vectorizer.transform(text_train)

# Largest value each feature reaches anywhere in the dataset, and the
# feature indices ordered by that value (ascending)
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = np.argsort(max_value)

# Feature names as an array so they can be indexed with the sorted indices
feature_names = np.array(vectorizer.get_feature_names_out())

print(f"Features with lowest tfidf:\n{feature_names[sorted_by_tfidf[:20]]}")
print(f"Features with highest tfidf: \n{feature_names[sorted_by_tfidf[-20:]]}")

In [None]:
# Order the features by their idf value (ascending) and list the first 100
sorted_by_idf = vectorizer.idf_.argsort()

print(f"Features with lowest idf:\n{feature_names[sorted_by_idf[:100]]}")

## Investigating Model Coefficients

In [None]:
from helpers import tools

# Pass the coefficients of the best pipeline's classifier, together with the
# tf-idf feature names, to the project's plotting helper (n_top_features=40)
tools.visualize_coefficients(
    grid.best_estimator_.named_steps["logisticregression"].coef_,
    feature_names,
    n_top_features=40,
)

## Bag-of-Words with More Than One Word (n-Grams)

In [None]:
# Recall the toy corpus that the n-gram examples below are fitted on.
print(f"bards_words:\n{bards_words}")

In [None]:
# ngram_range=(1, 1): vocabulary of single tokens only
cv = CountVectorizer(ngram_range=(1, 1))
cv.fit(bards_words)

print(f"Vocabulary size: {len(cv.vocabulary_)}")
print(f"Vocabulary:\n{cv.get_feature_names_out()}")

In [None]:
# ngram_range=(2, 2): vocabulary of token pairs only
cv = CountVectorizer(ngram_range=(2, 2))
cv.fit(bards_words)

print(f"Vocabulary size: {len(cv.vocabulary_)}")
print(f"Vocabulary:\n{cv.get_feature_names_out()}")

In [None]:
# ngram_range=(3, 3): vocabulary of token triples only
cv = CountVectorizer(ngram_range=(3, 3))
cv.fit(bards_words)

print(f"Vocabulary size: {len(cv.vocabulary_)}")
print(f"Vocabulary:\n{cv.get_feature_names_out()}")

In [None]:
# Dense count matrix of the toy corpus under the most recently fitted `cv`.
cv.transform(bards_words).toarray()

In [None]:
# ngram_range=(1, 3): single tokens, pairs and triples in one vocabulary
cv = CountVectorizer(ngram_range=(1, 3))
cv.fit(bards_words)

print(f"Vocabulary size: {len(cv.vocabulary_)}")
print(f"Vocabulary:\n{cv.get_feature_names_out()}")

In [None]:
# max_iter=10000 matches every other LogisticRegression fit in this notebook;
# without it this model alone would run with the solver's default iteration cap.
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression(max_iter=10000))

# The grid is kept deliberately small: only the unigram + bigram range (1, 2)
# is searched. Other ranges such as (1, 1) or (1, 3) could be added to
# "tfidfvectorizer__ngram_range", but the search then takes too long to run.
param_grid = {
    "logisticregression__C": [1, 10, 100],
    "tfidfvectorizer__ngram_range": [(1, 2)],
}

grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=2)
grid.fit(text_train, y_train)

print(f"Best cross-validation score: {grid.best_score_:.2f}")
print(f"Best parameters:\n{grid.best_params_}")

In [None]:
# Pull the fitted vectorizer and the classifier coefficients out of the best pipeline
vect = grid.best_estimator_.named_steps["tfidfvectorizer"]
coef = grid.best_estimator_.named_steps["logisticregression"].coef_

feature_names = np.array(vect.get_feature_names_out())
tools.visualize_coefficients(
    coef,
    feature_names,
    n_top_features=40,
)

## Advanced Tokenization, Stemming, and Lemmatization

In [None]:
import spacy
import nltk

In [None]:
# nltk's Porter stemmer
stemmer = nltk.stem.PorterStemmer()
# spacy's English-language model "en_core_web_sm"
en_nlp = spacy.load("en_core_web_sm")

# define function to compare lemmatization in spacy with stemming in nltk
def compare_normalization(doc):
    """Print spacy's lemmas next to nltk's Porter stems for one text.

    The text is tokenized once with the module-level spacy pipeline
    ``en_nlp``; both printed lists are derived from those tokens, and the
    stems come from the module-level ``stemmer``.

    Parameters
    ----------
    doc : str
        Text to tokenize and normalize.

    Returns
    -------
    None
        The two token lists are written to standard output.
    """
    tokens = en_nlp(doc)

    print("Lemmatization:")
    lemmas = [tok.lemma_ for tok in tokens]
    print(lemmas)

    print("Stemming:")
    # the stemmer is fed the lower-cased spacy norm of every token
    stems = [stemmer.stem(tok.norm_.lower()) for tok in tokens]
    print(stems)

In [None]:
# Run the comparison on a sentence in which "meeting" occurs twice.
compare_normalization("Our meeting today was worse than yesterday, I'm scared of meeting the clients tomorrow.")

## Topic Modeling and Document Clustering

### Latent Dirichlet Allocation

In [None]:
# Count features for topic modeling: vocabulary capped at 10000 entries,
# with max_df=0.15 filtering out tokens that occur in too many documents
vect = CountVectorizer(max_df=0.15, max_features=10000)
X = vect.fit_transform(text_train)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(
    n_components=10,
    learning_method="batch",
    max_iter=25,
    random_state=0,
    n_jobs=-1,
)

# Fit the model and transform the data in a single call: computing the
# transform takes some time, so doing both at once saves time
document_topics = lda.fit_transform(X)

In [None]:
# Shape of the fitted topic matrix; each row of components_ is one topic.
lda.components_.shape

In [None]:
# argsort orders the features of every topic (a row of components_) ascending;
# reversing the columns with [:, ::-1] turns that into descending order
sorting = lda.components_.argsort(axis=1)[:, ::-1]

# Feature names from the vectorizer, as an array
feature_names = np.array(vect.get_feature_names_out())

In [None]:
from helpers.tools import print_topics

# Print out the 10 topics (topic indices 0-9)
print_topics(
    topics=range(10),
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=5,
    n_words=10,
)

In [None]:
# Same LDA settings as before, but with 100 topics
lda100 = LatentDirichletAllocation(
    n_components=100,
    learning_method="batch",
    max_iter=25,
    random_state=0,
    n_jobs=-1,
)
document_topics100 = lda100.fit_transform(X)

In [None]:
# Hand-picked subset of the 100 topics to print
topics = np.array([7, 16, 24, 25, 28, 36, 37, 45, 51, 53, 54, 63, 89, 97])

feature_names = np.array(vect.get_feature_names_out())
# feature indices of every topic, sorted in descending order
sorting = lda100.components_.argsort(axis=1)[:, ::-1]
print_topics(
    topics=topics,
    feature_names=feature_names,
    sorting=sorting,
    topics_per_chunk=7,
    n_words=20,
)

In [None]:
# sort the documents by the weight of topic 45 (the "music" topic), highest first
music = np.argsort(document_topics100[:, 45])[::-1]

# print the ten documents where the topic is most important
for i in music[:10]:
    # show only the first two sentences
    first_two = b".".join(text_train[i].split(b".")[:2]) + b".\n"
    # the reviews are UTF-8 encoded bytes; decode them so that readable text is
    # printed rather than the b'...' repr with an escaped "\n"
    print(first_two.decode("utf-8", errors="replace"))

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 10))
# label every topic with its index followed by its first two words in `sorting`
topic_names = [
    f"{idx:>2} " + " ".join(top_words)
    for idx, top_words in enumerate(feature_names[sorting[:, :2]])
]

# two-column bar chart: topics 0-49 in the left panel, 50-99 in the right one
for col, panel in enumerate(ax):
    start, end = col * 50, (col + 1) * 50
    panel.barh(np.arange(50), document_topics100.sum(axis=0)[start:end])
    panel.set_yticks(np.arange(50))
    panel.set_yticklabels(topic_names[start:end], ha="left", va="top")
    panel.invert_yaxis()
    panel.set_xlim(0, 2000)
    # push the left-aligned tick labels away from the axis
    panel.get_yaxis().set_tick_params(pad=130)

plt.tight_layout()
plt.show()