# День 1. NLP. Векторизация текстов

## Описание

Дано: текст на каком-то языке (ca, de, el, en, ...), написанный на латинице (транслитом)  
Найти: язык текста  

`data/plain/train/*.txt` - обучающая выборка  
`data/plain/test/*.txt` - тестовая выборка  

Для начала векторизуем текст при помощи N-грамм:
1) Найдем все N-граммы в тексте (мама мыла раму -> {мам, ама, ма_, а_м, _мы, мыл, ...})
2) Пронумеруем их
3) Для каждого текста подсчитаем число вхождений каждой N-граммы
4) Запишем это в виде вектора $v$, где $v_i$ - число вхождений $i$-й N-граммы в текст

## Реализация

Реализуем 2 класса для обработки текстов:
1. `TextPreparer`, при помощи которого будем удалять несущественные символы
2. `NGramVectorizer`, при помощи которого будем получать векторы для текстов

In [1]:
import os
import typing as t
from collections import Counter
from collections.abc import Iterable

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Root directory of the plain-text dataset (contains the train/ and test/ folders).
DATA_PATH = os.sep.join(("data", "plain"))

#### Чтение данных

In [3]:
class Data:
    """In-memory copy of every ``*.txt`` file found directly in a directory.

    After construction ``texts[i]`` holds the UTF-8 content of a file and
    ``labels[i]`` holds that file's name with the ``.txt`` postfix removed.
    Files are visited in the order returned by ``os.listdir``.
    """

    FILE_POSTFIX = ".txt"

    def __init__(self, data_path: str):
        """Read all files ending in ``FILE_POSTFIX`` from ``data_path``."""
        # Looked up through type(self) so that a subclass may override it.
        postfix = type(self).FILE_POSTFIX

        self.texts = []
        self.labels = []

        for name in os.listdir(data_path):
            if not name.endswith(postfix):
                continue  # e.g. an answers file stored next to the texts

            with open(os.sep.join((data_path, name)), "r", encoding="utf-8") as handle:
                content = handle.read()

            self.texts.append(content)
            self.labels.append(name[:-len(postfix)])

In [4]:
# Load both splits; each one lives in its own sub-folder of DATA_PATH.
train_data, test_data = (
    Data(os.sep.join((DATA_PATH, split))) for split in ("train", "test")
)

In [5]:
# Give every training label an integer class code: its position in
# train_data.labels (a repeated label keeps the position of its last occurrence).
label_code = {label: index for index, label in enumerate(train_data.labels)}
# Number of labels processed, as left behind by the original counting loop.
counter = len(train_data.labels)

In [6]:
# Integer class code of every training text, in the same order as train_data.texts.
y_train = [label_code[label] for label in train_data.labels]

In [7]:
# ans.csv is read without a header row; column 1 holds the language label,
# which is converted to the integer class code used for training.
test_data_targets = pd.read_csv(os.sep.join((DATA_PATH, "test", "ans.csv")), header=None)
test_data_targets[1] = test_data_targets[1].apply(lambda x: label_code[x])
# Materialize the frame as an array once: `.values` was previously re-evaluated
# for every single test label inside the lookup.
test_answers = test_data_targets.values
# A test file named "<k>.txt" is looked up in row k - 1 of ans.csv.
y_test = [test_answers[int(name) - 1][1] for name in test_data.labels]

#### TextPreparer

In [8]:
class TextPreparer:
    """Strip rare characters and (optionally) punctuation from texts.

    ``fit`` counts how often each character occurs in the training texts;
    every character seen fewer than ``min_count`` times is considered rare.
    ``transform`` deletes the rare characters and, when ``delete_punct`` is
    true, the characters listed in ``PUNCTUATION_CHARS``. Characters that
    never occurred in the training texts are left untouched.
    """

    # Each entry must be a single character.
    PUNCTUATION_CHARS = (',', ';', '.', ':', '?', '!', '-', '\n')

    def __init__(self, min_count: int, delete_punct: bool = True):
        """
        Args:
            min_count: minimum number of occurrences in the training texts for
                a character to be kept.
            delete_punct: whether ``transform`` also removes punctuation.
        """
        self.__min_count = min_count
        self.__delete_punct = delete_punct
        # Both stay None until fit() is called.
        self.__rare_chars = None
        self.__deletion_table = None

    def fit(self, texts: Iterable[str]) -> None:
        """Find the rare characters of ``texts`` and prepare their removal."""
        chars_count = Counter()
        for text in texts:
            chars_count.update(text)  # iterating a str yields its characters

        self.__rare_chars = [
            ch for ch, ch_count in chars_count.items() if ch_count < self.__min_count
        ]

        removable = list(self.__rare_chars)
        if self.__delete_punct:
            removable.extend(type(self).PUNCTUATION_CHARS)
        # One translation table lets transform() drop every unwanted character
        # in a single pass instead of one str.replace() pass per character.
        self.__deletion_table = str.maketrans("", "", "".join(removable))

    def transform(self, text: str) -> str:
        """Return ``text`` without rare characters and without punctuation.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.__rare_chars is None:
            raise RuntimeError("There must be fit() call before transform()")

        return text.translate(self.__deletion_table)

#### NGramVectorizer

In [9]:
class NGramVectorizer:
    """Turn a text into a standardized vector of character N-gram counts.

    ``fit`` numbers every distinct N-gram of the training texts in order of
    first appearance. ``transform`` counts those N-grams in a single text and
    standardizes the counts to zero mean and unit standard deviation.
    """

    def __init__(self, n: int):
        """
        Args:
            n: length of the N-grams, in characters.
        """
        self.__n = n
        # Maps N-gram -> vector index; stays None until fit() is called.
        self.__ngram_number = None

    def __get_ngrams(self, text: str) -> Iterable[str]:
        """Return every substring of length ``n`` of ``text``, left to right.

        A text shorter than ``n`` yields no N-grams.
        """
        size = self.__n
        return [text[start:start + size] for start in range(len(text) - size + 1)]

    def fit(self, texts: Iterable[str]) -> None:
        """Build the N-gram vocabulary from ``texts``."""
        ngram_number = {}
        for text in texts:
            for ngram in self.__get_ngrams(text):
                # A new N-gram gets the next free index (the current dict size).
                ngram_number.setdefault(ngram, len(ngram_number))
        self.__ngram_number = ngram_number

    def transform(self, text: str) -> np.ndarray:
        """Return the standardized N-gram count vector of ``text``.

        N-grams that are not in the fitted vocabulary are ignored. When all
        counts are equal (empty text, text shorter than ``n``, no known
        N-grams) the standard deviation is zero; a zero vector is returned in
        that case instead of NaNs produced by a division by zero.

        Returns:
            Float array with one element per N-gram of the vocabulary.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.__ngram_number is None:
            raise RuntimeError("There must be fit() call before transform()")

        counts = np.zeros(len(self.__ngram_number), dtype=int)
        for ngram in self.__get_ngrams(text):
            index = self.__ngram_number.get(ngram)
            if index is not None:
                counts[index] += 1

        if counts.size == 0:
            # Empty vocabulary: nothing to standardize (avoids mean-of-empty warnings).
            return counts.astype(float)

        centered = counts - counts.mean()
        spread = counts.std()
        if spread == 0:
            return centered  # all counts identical, so this is already all zeros
        return centered / spread

#### Векторизация текстов

In [10]:
# Start from the raw texts; the cells below replace them step by step.
X_train, X_test = train_data.texts, test_data.texts

In [11]:
# Characters seen fewer than 100 times in the training texts are dropped;
# texts are then described by their character trigrams.
text_preparer, vectorizer = TextPreparer(min_count=100), NGramVectorizer(n=3)

In [12]:
%%time
# Collect character statistics from the raw training texts only, so that
# transform() knows which characters to remove.
text_preparer.fit(train_data.texts)

Wall time: 5.88 s


In [13]:
%%time

# Clean every training text with the fitted preparer.
train_texts = [text_preparer.transform(raw) for raw in tqdm(X_train, leave=False)]
X_train = train_texts

# Clean the test texts with the same preparer.
test_texts = [text_preparer.transform(raw) for raw in tqdm(X_test, leave=False)]
X_test = test_texts

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Wall time: 2.57 s


In [14]:
%%time
# Build the N-gram vocabulary from the *prepared* training texts (X_train),
# i.e. the same kind of text that transform() receives later. Fitting on the
# raw train_data.texts adds N-grams containing characters that the preparer
# removes, and misses N-grams that only appear once those characters are gone.
vectorizer.fit(X_train)

Wall time: 17.5 s


In [15]:
%%time

# Replace every prepared training text by its N-gram feature vector.
train_texts = [vectorizer.transform(doc) for doc in tqdm(X_train, leave=False)]
X_train = train_texts

# Same for the prepared test texts.
test_texts = [vectorizer.transform(doc) for doc in tqdm(X_test, leave=False)]
X_test = test_texts

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Wall time: 38.9 s


## Обучение модели

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [17]:
# Convert features and targets to NumPy arrays for scikit-learn.
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)

In [18]:
# 1-nearest-neighbour classifier: a text gets the label of the closest training text.
clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_train_pred, y_test_pred = clf.predict(X_train), clf.predict(X_test)

In [19]:
# accuracy_score takes (y_true, y_pred); the arguments used to be swapped.
# The printed value is the same because accuracy is symmetric.
print(f"Train accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Test  accuracy: {accuracy_score(y_test, y_test_pred)}")

Train accuracy: 1.0
Test  accuracy: 1.0


In [20]:
# Display the class codes predicted for the test texts.
y_test_pred

array([ 0,  0,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  0,  7,  7,  7,  7,
        7,  7,  7,  7,  7,  7,  0,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        0,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  0,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  0,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10,  1,
       10, 10, 10, 10, 10, 10, 11, 11, 11, 11,  1, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11,  1, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12,  1, 12,
       12, 12, 12, 12, 12, 13, 13, 13, 13,  0,  1, 13, 13, 13, 13, 13, 13,
       13, 13, 13, 13,  1, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,  1, 14,
       14, 14, 14, 14, 14, 15, 15, 15, 15,  1, 15, 15, 15, 15, 15, 15, 15,
       15, 15, 15,  1, 15,  1,  1,  1,  1,  1,  0,  1,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  0,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  0,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  0,  3,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  0,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  0,  5,  5,
        5,  5,  5,  5,  5

In [21]:
# Display the expected class codes for comparison with the predictions above.
y_test

array([ 0,  0,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  0,  7,  7,  7,  7,
        7,  7,  7,  7,  7,  7,  0,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        0,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  0,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  0,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10,  1,
       10, 10, 10, 10, 10, 10, 11, 11, 11, 11,  1, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11,  1, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12,  1, 12,
       12, 12, 12, 12, 12, 13, 13, 13, 13,  0,  1, 13, 13, 13, 13, 13, 13,
       13, 13, 13, 13,  1, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,  1, 14,
       14, 14, 14, 14, 14, 15, 15, 15, 15,  1, 15, 15, 15, 15, 15, 15, 15,
       15, 15, 15,  1, 15,  1,  1,  1,  1,  1,  0,  1,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  0,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  0,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  0,  3,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  0,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  0,  5,  5,
        5,  5,  5,  5,  5

In [25]:
# Boolean mask of misclassified test texts, followed by their count.
mask = np.not_equal(y_test, y_test_pred)
mask.sum()

0

## Результат

In [26]:
def predict(text: str) -> str:
    """Return the language label predicted for ``text``.

    The text goes through the fitted ``text_preparer`` and ``vectorizer`` and
    is classified by ``clf``; the predicted class code is then mapped back to
    its label through ``label_code``. ``None`` is returned if no label has
    that code.
    """
    features = vectorizer.transform(text_preparer.transform(text))
    # The classifier expects a 2-D array: one row per sample.
    answer_code = clf.predict(features.reshape(1, -1))[0]
    return next(
        (label for label, code in label_code.items() if code == answer_code),
        None,
    )

In [65]:
# Example: a transliterated Russian sentence.
predict("Eto prostoy primer togo chto vse i tak horosho rabotaet")

'ru'

In [68]:
# Example: an English sentence.
predict("This is a simple example of how everything works so well")

'en'