# Слияние двух датасетов

In [None]:
import pandas as pd

In [7]:
frame = pd.read_csv('appstore_parallels_categories.csv', header=0, sep=',')

In [8]:
frame

Unnamed: 0,review,categories
0,"Paid for 1 year for 3990 rubles, but the licen...",1
1,beyond expectation,0
2,very good,0
3,There was a two-week trial version\nI made a p...,1
4,"The charges are too expensive, and they are ch...",1
5,"downloaded, immediately after registration a m...",1
6,"I entered the card data for the subscription, ...",1
7,The best you can think of. Windows is just in ...,6
8,can get this dumb app to connect to wifi,4
9,Don't recommend for games. Won't be subscribing.,4


In [22]:
frame_uni = pd.concat([frame, frame2], ignore_index=True)

In [23]:
frame_uni

Unnamed: 0,review,categories
0,"Paid for 1 year for 3990 rubles, but the licen...",1
1,beyond expectation,0
2,very good,0
3,There was a two-week trial version\nI made a p...,1
4,"The charges are too expensive, and they are ch...",1
...,...,...
1018,Use this application for about 6 years already...,5
1019,Unable to get through to customer support to r...,1
1020,Followed the student discount offer on their s...,1
1021,It is a terrible product. I worked with v10 an...,4


In [11]:
frame2 = pd.read_csv('TrustPilot_categories.csv', header=0, sep=',')

In [24]:
frame_uni.to_csv('union_categories.csv', sep=',', header=True, index=False)

In [25]:
# Перемешивание датафрейма
frame_uni.sample(frac=1).reset_index(drop=True)

Unnamed: 0,review,categories
0,hi \ni dont see any options to install chrome ...,4
1,bad performance with ubuntu 18 and Mac OS,5
2,I was trying to get this to work on my new mac...,6
3,I was used to Parallels 15\ndownloaded from th...,4
4,If you are unable to provide Linux-based suppo...,1
...,...,...
1018,I have similar problem like one of the review....,4
1019,"..., because the login window always shows the...",6
1020,Excellent product and so easy to install on a ...,6
1021,I bought Parallels back when it first came out...,4


In [41]:
frame_uni[frame_uni['categories'].isin([6])].shape

(251, 2)

In [28]:
teaching_frame = frame_uni[:767]
testing_frame = frame_uni[767:]

In [30]:
teaching_frame.to_csv('teaching_categories.csv', sep=',', header=True, index=False)
testing_frame.to_csv('testing_categories.csv', sep=',', header=True, index=False)

# Предварительная обработка

In [13]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.probability  import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from multiprocessing import Pool
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd

In [14]:
def lower_pos_tag(words):
    lower_words = []
    for i in words:
        lower_words.append(i.lower())
    pos_words = pos_tag(lower_words)
    return pos_words

In [15]:
def clean(words):
    wordnet_lemmatizer = WordNetLemmatizer()
    cleaned_words = []
    types = ['JJ', 'JJR', 'JJS', 
             'MD', 'NN', 'NNS', 
             'NP', 'NPS', 'POS', 
             'PP', 'RB', 'RBR', 
             'RBS', 'VV', 'VVD', 
             'VVG', 'VVN', 'VVP', 'VVZ']
    for i in words:
        if i[1] in types:
            cleaned_words.append(wordnet_lemmatizer.lemmatize(i[0]))
    return cleaned_words

In [18]:
def get_review_by_categories(data_path, category):
    frame = pd.read_csv(data_path, header=0, sep=',')
    return frame[frame['categories'].isin([category])].review.to_list()

def get_review_by_label(data_path, label):
    if (label == 'payment'):
        return get_review_by_categories(data_path, 1)
    elif (label == 'functionality'):
        return get_review_by_categories(data_path, 4)
    elif (label == 'speed'):
        return get_review_by_categories(data_path, 5)
    elif (label == 'integration'):
        return get_review_by_categories(data_path, 6)
    elif (label == 'stuff'):
        return get_review_by_categories(data_path, 0)
    else:
        return None

def process(data_path, label):
    # Wordmatrix - список документов с лексемами
    # All words - список всех слов
    data = {'Word_matrix': [], 'All_words': []}
    # Промежуточный список для удаления гапаксов
    templist_allwords = []
    reviews = get_review_by_label(data_path, label)
    # Создание токенайзера
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
    for review in reviews: # Обработка корпуса
        bag_words = tokenizer.tokenize(review)
        lower_words = lower_pos_tag(bag_words)
        cleaned_words = clean(lower_words)
        finalist = list(bigrams(cleaned_words)) + cleaned_words
        data['Word_matrix'].append(finalist)
        templist_allwords.extend(cleaned_words)
    # Определение гапаксов
    templistfreq = FreqDist(templist_allwords)
    hapaxes = templistfreq.hapaxes()
    # Фильтрация от гапаксов
    for word in templist_allwords:
        if word not in hapaxes:
            data['All_words'].append(word)
    return {label: data}

# Векторизация

In [19]:
# Создание помеченный данных со структурой:
# [([список слов отзыва], метка_класса)]
def union_data(data):
    labels = ['payment', 'functionality', 'speed', 'integration', 'stuff']
    labeled_data = []
    for label in labels:
        for document in data[label]['Word_matrix']:
            labeled_data.append((document, label))
    return labeled_data

In [20]:
def vocabular_create(labels, data):
    # Создание вокабуляра с уникальными лексемами
    all_words = [] 
    for label in labels:
        frequency = FreqDist(data[label]['All_words'])
        common_words = frequency.most_common(1000)
        words = [i[0] for i in common_words]
        all_words.extend(words)
    # Извлечение уникальных лексем
    unique_words = list(OrderedDict.fromkeys(all_words))
    return unique_words

In [21]:
 def frequency_coding(labeled_data, unique_words):
    # Частотное кодирование для классификаторов scikit-learn
    # Разреженная матрица для признаков
    matrix_vec = csr_matrix((len(labeled_data), len(unique_words)), dtype=np.int8).toarray()
    # Массив для меток классов
    target = np.zeros(len(labeled_data), 'str')
    for index_doc, document in enumerate(labeled_data):
        for index_word, word in enumerate(unique_words):
            # Подсчет кол-ва вхождения слова в отзыв
            matrix_vec[index_doc, index_word] = document[0].count(word)
        target[index_doc] = document[1]
    return matrix_vec, target

# Обучение модели

In [31]:
# Считывание, обработка и векторизация обучающих данных
teaching_data = {}
labels = ['payment', 'functionality', 'speed', 'integration', 'stuff']
for label in labels:
    teaching_data.update(process('teaching_categories.csv', label))

unique_words = vocabular_create(labels, teaching_data)

matrix_vec, target = frequency_coding(union_data(teaching_data), unique_words)

# Перемешиваем датасет
X, Y = shuffle(matrix_vec, target)

model = MultinomialNB(0.1)
model.fit(X, Y)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

# Тестирование

In [32]:
# Считывание, обработка и векторизация обучающих данных
testing_data = {}
labels = ['payment', 'functionality', 'speed', 'integration', 'stuff']
for label in labels:
    testing_data.update(process('testing_categories.csv', label))

In [34]:
# Векторизация теста
X_test, Y_test = frequency_coding(union_data(testing_data), unique_words)

# Перемешиваем датасет
X, Y = shuffle(X_test, Y_test)

In [35]:
predicted = model.predict(X_test)
# Точность на контрольном датасете
score_test = accuracy_score(Y_test, predicted)
# Классификационный отчет
report = classification_report(Y_test, predicted)

print(score_test)
print(report)

0.390625
              precision    recall  f1-score   support

           f       0.57      0.28      0.37        72
           i       0.40      0.48      0.44        92
           p       0.07      0.40      0.11         5
           s       0.41      0.39      0.40        87

    accuracy                           0.39       256
   macro avg       0.36      0.39      0.33       256
weighted avg       0.45      0.39      0.40       256

