In [2]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.probability  import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from multiprocessing import Pool
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd

In [3]:
def lower_pos_tag(words):
    lower_words = []
    for i in words:
        lower_words.append(i.lower())
    pos_words = pos_tag(lower_words)
    return pos_words

In [4]:
def clean(words):
    wordnet_lemmatizer = WordNetLemmatizer()
    cleaned_words = []
    types = ['JJ', 'JJR', 'JJS', 
             'MD', 'NN', 'NNS', 
             'NP', 'NPS', 'POS', 
             'PP', 'RB', 'RBR', 
             'RBS', 'VV', 'VVD', 
             'VVG', 'VVN', 'VVP', 'VVZ']
    for i in words:
        if i[1] in types:
            cleaned_words.append(wordnet_lemmatizer.lemmatize(i[0]))
    return cleaned_words

In [5]:
def get_review_by_status(data_path, status):
    frame = pd.read_csv(data_path, header=0, sep=',')
    return frame[frame['status'].isin([status])].review.to_list()

def get_review_by_label(data_path, label):
    if (label == 'neutral'):
        return get_review_by_status(data_path, 0)
    elif (label == 'bad'):
        return get_review_by_status(data_path, -1)
    elif (label == 'good'):
        return get_review_by_status(data_path, 1)
    else:
        return None

def process(data_path, label):
    # Wordmatrix - список документов с лексемами
    # All words - список всех слов
    data = {'Word_matrix': [], 'All_words': []}
    # Промежуточный список для удаления гапаксов
    templist_allwords = []
    reviews = get_review_by_label(data_path, label)
    # Создание токенайзера
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
    for review in reviews: # Обработка корпуса
        bag_words = tokenizer.tokenize(review)
        lower_words = lower_pos_tag(bag_words)
        cleaned_words = clean(lower_words)
        finalist = list(bigrams(cleaned_words)) + cleaned_words
        data['Word_matrix'].append(finalist)
        templist_allwords.extend(cleaned_words)
    # Определение гапаксов
    templistfreq = FreqDist(templist_allwords)
    hapaxes = templistfreq.hapaxes()
    # Фильтрация от гапаксов
    for word in templist_allwords:
        if word not in hapaxes:
            data['All_words'].append(word)
    return {label: data}

# Векторизация

In [7]:
# Создание помеченный данных со структурой:
# [([список слов отзыва], метка_класса)]
def union_data(data):
    labels = ['neutral', 'bad', 'good']
    labeled_data = []
    for label in labels:
        for document in data[label]['Word_matrix']:
            labeled_data.append((document, label))
    return labeled_data

In [8]:
def vocabular_create(labels, data):
    # Создание вокабуляра с уникальными лексемами
    all_words = [] 
    for label in labels:
        frequency = FreqDist(data[label]['All_words'])
        common_words = frequency.most_common(1000)
        words = [i[0] for i in common_words]
        all_words.extend(words)
    # Извлечение уникальных лексем
    unique_words = list(OrderedDict.fromkeys(all_words))
    return unique_words

In [9]:
 def frequency_coding(labeled_data, unique_words):
    # Частотное кодирование для классификаторов scikit-learn
    # Разреженная матрица для признаков
    matrix_vec = csr_matrix((len(labeled_data), len(unique_words)), dtype=np.int8).toarray()
    # Массив для меток классов
    target = np.zeros(len(labeled_data), 'str')
    for index_doc, document in enumerate(labeled_data):
        for index_word, word in enumerate(unique_words):
            # Подсчет кол-ва вхождения слова в отзыв
            matrix_vec[index_doc, index_word] = document[0].count(word)
        target[index_doc] = document[1]
    return matrix_vec, target

# Обучение модели

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [23]:
# Считывание, обработка и векторизация обучающих данных
teaching_data = {}
labels = ['neutral', 'bad', 'good']
for label in labels:
    teaching_data.update(process('teaching_sentiments.csv', label))

unique_words = vocabular_create(labels, teaching_data)

matrix_vec, target = frequency_coding(union_data(teaching_data), unique_words)

# Перемешиваем датасет
X, Y = shuffle(matrix_vec, target)

model = MultinomialNB(0.1)
model.fit(X, Y)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

# Проверка на отзывах Parallels

In [None]:
# Предварительная обработка теста
testing_data = {}
labels = ['neutral', 'bad', 'good']
for label in labels:
    testing_data.update(process('testing_sentiments.csv', label))

In [25]:
# Векторизация теста
X_test, Y_test = frequency_coding(union_data(testing_data), unique_words)

In [26]:
predicted = model.predict(X_test)
# Точность на контрольном датасете
score_test = accuracy_score(Y_test, predicted)
# Классификационный отчет
report = classification_report(Y_test, predicted)

print(score_test)
print(report)

0.45907473309608543
              precision    recall  f1-score   support

           b       0.39      0.90      0.54        81
           g       0.62      0.53      0.57       100
           n       0.38      0.03      0.06       100

    accuracy                           0.46       281
   macro avg       0.46      0.49      0.39       281
weighted avg       0.47      0.46      0.38       281



# Проверка на отзывах Excel

In [27]:
# Предварительная обработка теста
testing_data = {}
labels = ['neutral', 'bad', 'good']
for label in labels:
    testing_data.update(process('appstore_excel_modified.csv', label))

In [28]:
# Векторизация теста
X_test, Y_test = frequency_coding(union_data(testing_data), unique_words)

In [22]:
len(X_test)

300

In [29]:
predicted = model.predict(X_test)
# Точность на контрольном датасете
score_test = accuracy_score(Y_test, predicted)
# Классификационный отчет
report = classification_report(Y_test, predicted)

print(score_test)
print(report)

0.4866666666666667
              precision    recall  f1-score   support

           b       0.44      0.90      0.59       100
           g       0.60      0.53      0.56       100
           n       0.38      0.03      0.06       100

    accuracy                           0.49       300
   macro avg       0.47      0.49      0.40       300
weighted avg       0.47      0.49      0.40       300

