In [18]:
import re
import numpy as np
import pymorphy3

from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
import pickle

In [28]:
def save_file(name, obj, path):
    """Pickle ``obj`` to ``{path}/{name}.pickle``.

    Parameters
    ----------
    name : str
        File name without the ``.pickle`` extension.
    obj : object
        Any picklable object.
    path : str
        Target directory. It is created (including parents) when missing.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be written.
    """
    import os  # local import so the function does not depend on another cell

    file_path = '{path}/{name}.pickle'.format(name=name, path=path)
    # open(..., 'wb') does not create missing directories, so a fresh
    # checkout without the output folder used to fail with FileNotFoundError.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Substrings that cleansing() swaps for a single space in every review.
special_characters = [
    '</p>',
    '\xa0',
    '<br/>',
    '\n',
    '-&gt;',
]

In [21]:
# Lemmas that delete_stop_words() removes from every review.
# The list previously contained 'по' twice and seven empty strings; the
# empty strings could never match a token produced by str.split(), so both
# were dropped. Order of the remaining entries is unchanged.
stop_words = [
    # short function words and other very frequent lemmas
    'в', 'на', 'с', 'и', 'я', 'о', 'к', 'а', 'за', 'по',
    'об', 'у', 'бы', 'от', 'ли', 'ул', 'что', 'со', 'из',
    'для', 'про', 'г', 'до', 'то', 'быть', 'мочь',
    # month names and "year"
    'январь', 'февраль', 'март', 'апрель', 'май', 'июнь', 'июль',
    'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь', 'год',
    'стать', 'это', 'когда', 'даже',
    # abbreviated month names
    'янв', 'фев', 'апр', 'авг', 'сен', 'окт', 'ноя', 'дек',
    # two-letter fragments (presumably stray adjective endings - TODO confirm)
    'ые', 'ый', 'ым', 'ых', 'ая',
]


In [22]:
def _read_reviews(path):
    """Read ``path`` and return the list of reviews it contains.

    The file content is split on the literal ``'<p>'`` marker; everything
    before the first marker is discarded, so each returned item is the text
    that followed one ``<p>``.
    """
    # NOTE(review): no encoding is passed, so the platform default is used,
    # exactly as before. Consider pinning it once the files' encoding is confirmed.
    with open(path, "r") as source:  # `with` closes the handle; it used to leak
        chunks = source.read().split('<p>')
    return chunks[1:]


positive_reviews = _read_reviews("data/reviews_positive.txt")
neutral_reviews = _read_reviews("data/reviews_neutral.txt")
negative_reviews = _read_reviews("data/reviews_negative.txt")

''

In [23]:
def cleansing(text, special_chars=None):
    """Normalise raw review strings.

    For every string in ``text`` the function:

    1. replaces each entry of ``special_chars`` with a space,
    2. lower-cases the string,
    3. replaces punctuation and digits with spaces,
    4. replaces Latin letters and any whitespace with a single space,
    5. collapses repeated spaces and trims both ends.

    Parameters
    ----------
    text : iterable of str
        Raw reviews.
    special_chars : iterable of str, optional
        Substrings to blank out first. Defaults to the module-level
        ``special_characters`` list.

    Returns
    -------
    list of str
        One cleaned string per input string, in the same order.
    """
    if special_chars is None:
        special_chars = special_characters

    res = []
    for raw in text:
        cleaned = raw
        # Chain the replacements on the running result. The earlier version
        # restarted from the raw string each time, so only the last special
        # character was ever replaced.
        for sp_ch in special_chars:
            cleaned = cleaned.replace(sp_ch, ' ')
        cleaned = cleaned.lower()
        cleaned = re.sub(r'[^\w\s]+|\d+', ' ', cleaned)
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        cleaned = re.sub(r'[a-zA-Z\s]+', ' ', cleaned)
        # Trim last: dropping Latin words can leave a space at either end.
        cleaned = re.sub(r' +', ' ', cleaned).strip()
        res.append(cleaned)
    return res

# Module-level pymorphy3 analyzer; lemmatize() calls morph.parse() on it for every word.
morph = pymorphy3.MorphAnalyzer()

def lemmatize(sentence):
    """Split ``sentence`` on whitespace and return the normal form of each word.

    For every word the first parse returned by ``morph.parse`` is used.
    The result is a list with one lemma per word, in the original order.
    """
    return [morph.parse(token)[0].normal_form for token in sentence.split()]

def delete_stop_words(sentence):
    """Return the items of ``sentence`` that are not in ``stop_words``.

    ``sentence`` is an iterable of words; the relative order of the kept
    words is preserved and a new list is returned.
    """
    return [token for token in sentence if token not in stop_words]

def join(sentence):
    """Glue the words in ``sentence`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(sentence)


In [24]:
def preprocess_reviews(reviews):
    """Run the whole text pipeline over ``reviews``.

    Stages, in order: ``cleansing`` on the full list, then ``lemmatize``,
    ``delete_stop_words`` and ``join`` applied to each review.

    Returns
    -------
    tuple
        ``(cleansed, lemmatized, without_stop_words, joined)`` - the output
        of every stage, so intermediate results stay available for inspection.
    """
    cleansed = cleansing(reviews)
    lemmatized = [lemmatize(review) for review in cleansed]
    without_stop_words = [delete_stop_words(words) for words in lemmatized]
    joined = [join(words) for words in without_stop_words]
    return cleansed, lemmatized, without_stop_words, joined


# One call per sentiment class instead of three pasted copies of the pipeline.
(positive_text, lemmatize_positive_text,
 clean_positive_text, joined_positive_reviews) = preprocess_reviews(positive_reviews)

(neutral_text, lemmatize_neutral_text,
 clean_neutral_text, joined_neutral_reviews) = preprocess_reviews(neutral_reviews)

(negative_text, lemmatize_negative_text,
 clean_negative_text, joined_negative_reviews) = preprocess_reviews(negative_reviews)

In [25]:
# Corpus for the TF-IDF fit: positive reviews first, then neutral, then negative.
data = np.concatenate(
    (joined_positive_reviews, joined_neutral_reviews, joined_negative_reviews)
)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)

# Store the fitted vectorizer on disk with joblib.
dump(vectorizer, 'vectorizer.joblib')


['vectorizer.joblib']

In [29]:
# Pickle each prepared corpus into the 'prepared_data' directory.
for _file_name, _prepared in (
    ('data_positive', joined_positive_reviews),
    ('data_neutral', joined_neutral_reviews),
    ('data_negative', joined_negative_reviews),
):
    save_file(_file_name, _prepared, 'prepared_data')

In [None]:
# Show the first negative review as read from the file, before any cleaning.
negative_reviews[0]

In [None]:
# Show the same review after cleansing, lemmatization, stop-word removal and joining.
joined_negative_reviews[0]

In [None]:
# NOTE: this repeats the TF-IDF fit done earlier in the notebook and rebinds
# `data`, `vectorizer` and `X`; the only new step is showing the vocabulary.
data = np.concatenate(
    (joined_positive_reviews, joined_neutral_reviews, joined_negative_reviews)
)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)

vectorizer.get_feature_names_out()