In [1]:
import numpy as np
import pandas as pd

In [53]:
# data wrangling
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("app_review_rating_train.csv")

dataset["Reviews"] = dataset["Title"] + " " + dataset["Review"]
dataset = dataset.dropna()

# oversampling
df = dataset.loc[dataset['Rating'] == 1]
df = pd.concat([df]*3)
dataset = dataset.append(df)

df = dataset.loc[dataset['Rating'] == 2]
df = pd.concat([df]*5)
dataset = dataset.append(df)

df = dataset.loc[dataset['Rating'] == 3]
df = pd.concat([df]*4)
dataset = dataset.append(df)

df = dataset.loc[dataset['Rating'] == 4]
df = pd.concat([df]*2)
dataset = dataset.append(df)

print(dataset.groupby(["Rating"]).count())
train_reviews, test_reviews, train_rating, test_rating = train_test_split(dataset["Reviews"], dataset["Rating"],
                                                                          stratify=dataset["Rating"],
                                                                          test_size=0.33,
                                                                          random_state=42)

train_reviews = np.array(train_reviews)
train_rating = np.array(train_rating)
test_reviews = np.array(test_reviews)
test_rating = np.array(test_rating)

75381


190649
         Date  AppName  Language  Version  Title  Review  Reviews
Rating                                                           
1       46520    46520     46520    46520  46520   46520    46520
2       27071    27071     27071    27071  27071   27071    27071
3       32499    32499     32499    32499  32499   32499    32499
4       33600    33600     33600    33600  33600   33600    33600
5       50959    50959     50959    50959  50959   50959    50959


In [54]:
# normalization

import re
import nltk

stopwords_list = nltk.corpus.stopwords.words('russian')


def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens


def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopwords_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


def normalize_corpus(corpus):
    normalized_corpus = []
    
    for index, text in enumerate(corpus):
        text = text.lower()
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
        text = re.sub('[\W]+', ' ', text.lower()) \
            + ' '.join(emoticons).replace('-', '')
        text = remove_stopwords(text)
        normalized_corpus.append(text)

    return normalized_corpus


norm_train_reviews = normalize_corpus(train_reviews)
norm_test_reviews = normalize_corpus(test_reviews)

In [55]:
# feature extraction

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):

    feature_type = feature_type.lower().strip()  
    
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    
    return vectorizer, feature_matrix


vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='frequency',
                                                  ngram_range=(1, 2), 
                                                  min_df=0.0, max_df=1.0)   

norm_test_reviews = normalize_corpus(test_reviews)
test_features = vectorizer.transform(norm_test_reviews)     

In [60]:
# model training

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score

# best model selection and hyperparameters optimization

for clf in [LogisticRegression, SGDClassifier, MultinomialNB]:
    print(clf)
    print(cross_val_score(clf(), train_features, train_rating).mean())
    print("\n")
    
nb = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
nb.fit(train_features, train_rating)

sgd = SGDClassifier(loss='log', penalty='l2')
sgd.fit(train_features, train_rating)

lg = LogisticRegression(solver='newton-cg',
                        multi_class='multinomial',
                        C=1,
                        penalty='l2',
                        max_iter=100,
                        random_state=42)
lg.fit(train_features, train_rating)



<class 'sklearn.linear_model.logistic.LogisticRegression'>


0.9206945783


<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>








0.897270901145


<class 'sklearn.naive_bayes.MultinomialNB'>


0.849953844406






LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [61]:
# model evaluation and metrics
from sklearn import metrics


def display_evaluation_metrics(true_labels, predicted_labels):
    
    print('Precision:', np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels,
                                               average='micro'),
                        2))
    print('Recall:', np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels,
                                               average='micro'),
                        2))
    print('F1 Score:', np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels,
                                               average='micro'),
                        2))
               
                        
def display_classification_report(true_labels, predicted_labels, classes=[1,2,3,4,5]):

    report = metrics.classification_report(y_true=true_labels, 
                                           y_pred=predicted_labels, 
                                           labels=classes) 
    print(report)
    

def display_model_quality(model, test_features, test_rating):
    predicted_sentiments = model.predict(test_features)
    
    print(model)
    display_evaluation_metrics(true_labels=test_rating,
                               predicted_labels=predicted_sentiments)
    
    display_classification_report(true_labels=test_rating,
                                  predicted_labels=predicted_sentiments,
                                  classes=[1,2,3,4,5]) 


display_model_quality(nb, test_features, test_rating)    
display_model_quality(sgd, test_features, test_rating)
display_model_quality(lg, test_features, test_rating)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Precision: 0.87
Recall: 0.87
F1 Score: 0.87


             precision    recall  f1-score   support

          1       0.85      0.95      0.89     15352
          2       0.95      0.90      0.92      8933
          3       0.91      0.88      0.90     10725
          4       0.87      0.71      0.78     11088
          5       0.84      0.90      0.87     16817

avg / total       0.88      0.87      0.87     62915

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Precision: 0.86
Recall: 0.86
F1 Score: 0.86
             precision    recall  f1-score   support

          1       0.83      0.94      0.88     15352
          2       0.96      0.87      0.91      8933
          3       0.92      0.86      0.89     10725
          4       0.86      0.66      0.75     

Recall: 0.95
F1 Score: 0.95
             precision    recall  f1-score   support

          1       0.96      0.98      0.97     15352
          2       0.98      0.98      0.98      8933
          3       0.97      0.98      0.97     10725
          4       0.91      0.89      0.90     11088
          5       0.92      0.91      0.92     16817

avg / total       0.95      0.95      0.95     62915



In [67]:
# real data test

train_dataset = pd.read_csv("app_review_rating_train.csv")
test_dataset = pd.read_csv("app_review_rating_test.csv")

train_dataset["Reviews"] = train_dataset["Title"] + " " + train_dataset["Review"]
train_dataset = train_dataset.dropna()

# oversampling
df = train_dataset.loc[train_dataset['Rating'] == 1]
df = pd.concat([df]*3)
train_dataset = train_dataset.append(df)

df = train_dataset.loc[train_dataset['Rating'] == 2]
df = pd.concat([df]*5)
train_dataset = train_dataset.append(df)

df = train_dataset.loc[train_dataset['Rating'] == 3]
df = pd.concat([df]*4)
train_dataset = train_dataset.append(df)

df = train_dataset.loc[train_dataset['Rating'] == 4]
df = pd.concat([df]*2)
train_dataset = train_dataset.append(df)

test_dataset["Reviews"] = test_dataset["Title"] + " " + test_dataset["Review"]
test_dataset = test_dataset.dropna()

train_reviews = np.array(train_dataset["Reviews"])
train_rating = np.array(train_dataset["Rating"])
test_reviews = np.array(test_dataset["Reviews"])

norm_train_reviews = normalize_corpus(train_reviews)
norm_test_reviews = normalize_corpus(test_reviews)

vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='frequency',
                                                  ngram_range=(1, 2), 
                                                  min_df=0.0, max_df=1.0)   


lg.fit(train_features, train_rating)

test_features = vectorizer.transform(norm_test_reviews)
predicted_sentiment = nb.predict(test_features)

test_dataset["Rating"] = predicted_sentiment
test_dataset = test_dataset.drop(["Reviews"], axis=1)

print(test_dataset["Rating"].value_counts().sort())


test_dataset.to_csv("results.csv", sep='\t', encoding='utf-8')


5    16635
1     5381
4     1718
3     1045
2      311
Name: Rating, dtype: int64
    Index        Date                 AppName Language Version  \
0       0  2016-09-22         Сбербанк Онлайн       ru   7.3.1   
1       1  2016-03-11         Сбербанк Онлайн       ru   7.0.3   
2       2  2015-02-14         Сбербанк Онлайн       ru   5.3.0   
3       3  2017-02-18         Сбербанк Онлайн       ru   8.0.0   
4       4  2014-05-07         Сбербанк Онлайн       ru   4.1.3   
5       5  2016-09-20         Сбербанк Онлайн       ru   7.3.1   
6       6  2016-02-17         Сбербанк Онлайн       ru   7.0.2   
7       7  2017-10-24  Сбербанк Бизнес Онлайн       ru  2.12.1   
8       8  2016-06-17         Сбербанк Онлайн       ru   7.2.0   
9       9  2013-02-20         Сбербанк Онлайн       ru     3.0   
10     10  2014-04-22         Сбербанк Онлайн       ru   4.1.1   
11     11  2017-01-28         Сбербанк Онлайн       ru   7.4.4   
12     12  2014-09-07         Сбербанк Онлайн       ru   5.1

In [69]:
import pickle

pickle.dump(lg, open("lg_app_rating.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))


AttributeError: 'Series' object has no attribute 'sort'