# Seminar 8. Introduction to Natural Language Processing

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json


from tqdm import tqdm_notebook

# Sentiment Analysis in Russian
(from https://www.kaggle.com/c/sentiment-analysis-in-russian/data)

The goal is to estimate the sentiment of news articles written in Russian.

## Load data

In [None]:
!wget https://raw.githubusercontent.com/hushchyn-mikhail/hse_se_ml/s08/2020/s08-nlp/Data/train.json

In [None]:
# Load data
# Reads the file downloaded by the cell above; use 'Data/train.json' instead when
# running from a checkout of the course repository.
# The encoding is given explicitly: the texts are Russian, and the platform default
# encoding (e.g. on Windows) is not guaranteed to be UTF-8.
with open('train.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [None]:
# Show one raw record: its id, its full text and its sentiment label
num = 1  # index of the record to show (original note: index 100 is a positive example)
record = data[num]

print("ID: ", record["id"], "\n")
print("Text: \n", record["text"])
print("Sentiment: ", record["sentiment"], "\n")

## Tokenization and data cleaning

Let's split each text into words (**tokenization**) and remove all **stop words** and punctuation characters. **Stop words** are words that are commonly used in texts and can be ignored without losing the text's meaning.

<center><img src="https://github.com/shestakoff/hse_se_ml/blob/master/2020/s08-nlp/img/tokenization.png?raw=1"></center>

In [None]:
import string # for work with strings
import nltk   # Natural Language Toolkit

In [None]:
# Make sure the NLTK stop-word corpus is available, then take its Russian word list
nltk.download('stopwords')
stopwords_corpus = nltk.corpus.stopwords
stop_words = stopwords_corpus.words('russian')

# preview the first ten stop words
stop_words[:10]

In [None]:
# ASCII punctuation characters from the standard library `string` module;
# shown here because process_data() below uses them to drop punctuation tokens
string.punctuation

In [None]:
# define word tokenizer
# Its tokenize(text) method is what process_data() below calls to split a text into tokens.
# NOTE(review): per the NLTK docs it separates runs of word characters from runs of punctuation - confirm
word_tokenizer = nltk.WordPunctTokenizer()

In [None]:
def process_data(data, tokenizer=None, stopwords=None):
    """Tokenize and clean news items; return token lists and binary labels.

    Parameters
    ----------
    data : iterable of dict
        Each item must provide a 'text' string and a 'sentiment' string.
    tokenizer : object with a ``tokenize(str) -> list[str]`` method, optional
        Defaults to the notebook-level ``word_tokenizer``.
    stopwords : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    texts : list of list of str
        Lower-cased tokens of every item, without stop words and without
        tokens made up only of ASCII punctuation characters.
    targets : list of int
        0 when the item's sentiment is 'negative', 1 for any other value.
    """
    tokenizer = word_tokenizer if tokenizer is None else tokenizer
    # A set gives O(1) membership tests instead of scanning the stop-word list per token.
    ignored_words = set(stop_words if stopwords is None else stopwords)
    punct_chars = set(string.punctuation)

    texts = []
    targets = []

    for item in data:
        # collect labels of news: negative -> 0, everything else -> 1
        targets.append(0 if item['sentiment'] == 'negative' else 1)

        # lower-case the text, then split it into tokens (words)
        tokens = tokenizer.tokenize(item['text'].lower())

        # Drop stop words and every token consisting solely of punctuation characters.
        # (`token in string.punctuation` is a substring test, so it would let
        # multi-character tokens such as "..." or "!!" through.)
        texts.append([
            word for word in tokens
            if word not in ignored_words and not punct_chars.issuperset(word)
        ])

    return texts, targets

In [None]:
# run tokenization and data cleaning
# texts: one list of cleaned tokens per news item; y: the matching 0/1 labels (0 = negative)
texts, y = process_data(data)

In [None]:
# Peek at one processed item: its binary label and its first five tokens
i = 1
label, tokens = y[i], texts[i]
print("Label: ", label)
print("Tokens: ", tokens[:5])

## Words normalization

Here we will consider two ways of normalizing words: **stemming** and **lemmatization**.

### Stemming

<center><img src="https://github.com/shestakoff/hse_se_ml/blob/master/2020/s08-nlp/img/stem2.svg?raw=1" width="400"></center>

In [None]:
from nltk.stem.snowball import SnowballStemmer

# define stemmer
# Snowball stemmer configured for Russian; stemmer.stem(word) is applied
# to individual tokens in the cells below.
stemmer = SnowballStemmer("russian")

In [None]:
# Show the first ten tokens of one text next to their stems
i = 1
sample_tokens = texts[i][:10]
for token, stem in zip(sample_tokens, map(stemmer.stem, sample_tokens)):
    print("Before: %s, After: %s" % (token, stem))

### Lemmatization

Lemmatization converts a word to its normal (dictionary) form.

<center><img src="https://github.com/shestakoff/hse_se_ml/blob/master/2020/s08-nlp/img/lemm.png?raw=1" width="400"></center>

In [None]:
# Uncomment the line below to install the lemmatizer package imported in the next cell
# ! pip install pymorphy2

In [None]:
import pymorphy2 # morphological analyzer

# define lemmatizer :)
# morph.parse(word) is used below to obtain the normal form of a token
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Show the first ten tokens of one text next to their lemmas
i = 1
for token in texts[i][:10]:
    first_parse = morph.parse(token)[0]  # take the first parse returned for the token
    print("Before: %s, After: %s" % (token, first_parse.normal_form))

Oscar goes to stemming!

Stemming's Oscar speech:  Thanks to the Academy for this prestigious award! I would like to thank all NLP developers who are too lazy to use lemmatization and do not want to wait too long. Thank you, thank you very much!  

In [None]:
# apply stemming to all texts
# Every token list in `texts` is replaced in place by one space-joined string of stems.
# NOTE(review): tqdm_notebook is deprecated in recent tqdm releases in favour of
# tqdm.auto.tqdm / tqdm.notebook.tqdm - consider switching the import.
for i in tqdm_notebook(range(len(texts))):           # tqdm_notebook creates the progress bar below :)
    if isinstance(texts[i], str):
        # Already stemmed by an earlier run of this cell. Iterating a string would
        # stem it character by character, so leave it untouched.
        continue
    texts[i] = ' '.join(map(stemmer.stem, texts[i]))  # stem each token and join the stems into a new text

In [None]:
# Peek at one stemmed item: its binary label and the re-joined text
i = 1
label, stemmed_text = y[i], texts[i]
print("Label: ", label)
print("Text: \n", stemmed_text)

## Split into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the texts for testing. Stratifying on the labels keeps the class
# proportions equal in both parts, and the fixed seed makes the split reproducible.
train_texts, test_texts, train_y, test_y = train_test_split(
    texts,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

## TF-IDF

TF-IDF measures the importance of a word to a document within a corpus of documents.

<center><img src="https://github.com/shestakoff/hse_se_ml/blob/master/2020/s08-nlp/img/tfidf.jpg?raw=1" width="800"></center>
Image from: http://filotechnologia.blogspot.com/2014/01/a-simple-java-class-for-tfidf-scoring.html

In [None]:
#calc tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [None]:
# Fit TF-IDF on the train texts only, limiting the vocabulary to the top 25 words
vectorizer = TfidfVectorizer(max_features=25).fit(train_texts)

# The 25 words that were kept
vectorizer.get_feature_names_out()

In [None]:
# Apply TF-IDF to train and test texts
# Use transform() only: the vectorizer was already fitted on the train texts in the
# previous cell. Calling fit_transform() on the test texts would refit it, producing a
# different vocabulary and IDF weights, so the test columns would no longer correspond
# to the features the classifier is trained on.
train_X = vectorizer.transform(train_texts)
test_X  = vectorizer.transform(test_texts)

In [None]:
# Example: dense view of the first 2 rows (slice first, so only those rows are densified)
train_X[:2].todense()

In [None]:
# mapping from each vocabulary word to its column index in X
vectorizer.vocabulary_

## Fit a classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF features
# of the train texts and their 0/1 labels
model = LogisticRegression()
model.fit(train_X, train_y)

### Evaluate on test data

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Hard class predictions and class probabilities for the held-out texts
predict = model.predict(test_X)
proba = model.predict_proba(test_X)

# ROC-AUC is computed from the second probability column (index 1)
accuracy = accuracy_score(test_y, predict)
roc_auc = roc_auc_score(test_y, proba[:, 1])

print("ACCURACY = {}".format(accuracy))
print("ROC-AUC =  {}".format(roc_auc))

**Results:** 25 words are too few to estimate news sentiment properly. We need more words. But how will we deal with the resulting high dimensionality?

# Latent Semantic Analysis (LSA)

LSA is similar to PCA: it reduces the dimension of the input matrix X.

<center><img src="https://github.com/shestakoff/hse_se_ml/blob/master/2020/s08-nlp/img/lsa.jpg?raw=1" width="800"></center>

Let's take more words.

In [None]:
# Refit TF-IDF on the train texts with a much larger vocabulary limit
vectorizer = TfidfVectorizer(max_features=40000).fit(train_texts)
vectorizer

In [None]:
# Apply the fitted TF-IDF to the train texts first, then to the test texts
train_X, test_X = map(vectorizer.transform, (train_texts, test_texts))

In [None]:
# (number of train texts, number of TF-IDF features)
train_X.shape

Now we have 40000 words, but that is too many features for a classification model. Let's use LSA to reduce the dimension. In sklearn, LSA is implemented as `TruncatedSVD`.

In [None]:
from sklearn.decomposition import TruncatedSVD

# fit SVD decomposition
# TruncatedSVD uses a randomized solver by default, so the seed is fixed to make the
# components (and every score computed from them) reproducible between runs.
svd = TruncatedSVD(n_components = 1000, random_state = 42)
svd.fit(train_X)

In [None]:
# Project the train matrix first, then the test matrix, onto the fitted SVD components
train_svd_X, test_svd_X = map(svd.transform, (train_X, test_X))

In [None]:
# (number of train texts, number of SVD components)
train_svd_X.shape

## Fit a classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (liblinear solver) trained on the SVD-reduced train features
model = LogisticRegression(solver='liblinear')
model.fit(train_svd_X, train_y)

### Evaluate on test data

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Hard class predictions and class probabilities for the SVD-reduced held-out texts
predict = model.predict(test_svd_X)
proba = model.predict_proba(test_svd_X)

# ROC-AUC is computed from the second probability column (index 1)
accuracy = accuracy_score(test_y, predict)
roc_auc = roc_auc_score(test_y, proba[:, 1])

print("ACCURACY = {}".format(accuracy))
print("ROC-AUC =  {}".format(roc_auc))

# Kaggle competition

https://www.kaggle.com/c/explicit-content-detection