In [71]:
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
from scipy.sparse import hstack

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from nltk.corpus import stopwords, wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import pos_tag, WordNetLemmatizer

from gensim.models.word2vec import Word2Vec

In [46]:
train_df = pd.read_csv('nlp/train.csv', encoding='unicode_escape') 

In [47]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2351 non-null   int64  
 1   Hotel_name    2351 non-null   object 
 2   Review_Title  2136 non-null   object 
 3   Review_Text   2351 non-null   object 
 4   Rating        2351 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 92.0+ KB


In [48]:
train_df.isna().sum()

Id                0
Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

Выкинем все строки, содержащие NaN

In [49]:
train_df = train_df[train_df['Review_Title'].notna()]

In [50]:
train_df.head(10)

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0
5,5,Everest,"Good atmosphere, food and drinks not available","I reached the hotel by car, felt good for co-o...",71.0
6,6,Metro Grand,Lovely hotel,The hotel is pretty clean with excellent beddi...,80.0
7,7,Oyo Rooms Anna Arch Arumbakkam,Not worth the money,No hot water.wifi limited to lobby. Average cl...,40.0
9,9,FabHotel Priyadarshini Park Mount Road,Good hotel with poor services,Location and cleanliness is good. But as far a...,57.0
10,10,Treebo J's Five Two Boutique,Good appearance & amazing hotel to stay,It's amazing and I got so many benefits from t...,86.0


Напишем функцию, которая будет обрабатывать текст из датасета

In [51]:
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.NOUN


def clean_text(text, normalizer='stem'):
    """ Cleans text
    """
    assert(normalizer=='stem' or normalizer=='lem')

    text = text.lower()
    # tokenize
    text = [w for w in re.split(r'[&;,.\s]',text)]

    # remove numbers
    text = [w for w in text if not any(c.isdigit() for c in w)]
    
    # remove stop words
    stop = stopwords.words('english')
    text = [w for w in text if w not in stop]

    # remove empty tokens
    text = [w for w in text if len(w) > 0]

    # normalize
    if normalizer == 'stem':
        stemmer = SnowballStemmer(language='english', ignore_stopwords=True)
        text = [stemmer.stem(w) for w in text]
    else:
        # pos tag text
        pos_tags = pos_tag(text)
        # lemmatize text
        text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]

    # remove words with only one letter
    text = [w for w in text if len(w) > 1]

    # join all
    text = " ".join(text)
    return text

Обработаем текст признаков `Review_Text` и `Review_Title`

In [52]:
normalizer = 'lem'
train_df['Review_Text_Clean'] = train_df['Review_Text'].apply(lambda x: clean_text(x, normalizer))
train_df['Review_Title_Clean'] = train_df['Review_Title'].apply(lambda x: clean_text(x, normalizer))

### Modeling based only on text
#### Bag of Words

Объединим все текстовые признаки

In [53]:
X_text = train_df['Review_Text_Clean'] + train_df['Review_Title_Clean']
y_train = train_df['Rating']

In [54]:
count_vect = CountVectorizer(ngram_range=(1,1))
X_text_cv = count_vect.fit_transform(np.copy(X_text))
# count_vect.vocabulary_

idf_vect1 = TfidfVectorizer(ngram_range=(1,1))
X_text_tv1 = idf_vect1.fit_transform(np.copy(X_text))

idf_vect2 = TfidfVectorizer(ngram_range=(1,2))
X_text_tv2 = idf_vect2.fit_transform(np.copy(X_text))

In [55]:
import warnings
warnings.filterwarnings("ignore")

`SGD Regressor`

In [91]:
sgd_reg = SGDRegressor(random_state=42, max_iter=2000)
# reg = RandomForestRegressor(random_state=42)
cvs_sgd_cv = cross_val_score(sgd_reg, X_text_cv, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_sgd_tv1 = cross_val_score(sgd_reg, X_text_tv1, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_sgd_tv2 = cross_val_score(sgd_reg, X_text_tv2, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(f"Average CV score SGDRegressor + CountVectorizer = {np.mean(-cvs_sgd_cv):.2f}")
print(f"Average CV score SGDRegressor + TfidfVectorizer with ngram 1 = {np.mean(-cvs_sgd_tv1):.2f}")
print(f"Average CV score SGDRegressor + TfidfVectorizer with ngram 2 = {np.mean(-cvs_sgd_tv2):.2f}")

Average CV score SGDRegressor + CountVectorizer = 24.76
Average CV score SGDRegressor + TfidfVectorizer with ngram 1 = 16.73
Average CV score SGDRegressor + TfidfVectorizer with ngram 2 = 15.76


`Decision Trees`

In [80]:
tree_reg = DecisionTreeRegressor(random_state=42, max_depth=12)
cvs_tree_cv = cross_val_score(tree_reg, X_text_cv, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_tree_tv1 = cross_val_score(tree_reg, X_text_tv1, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_tree_tv2 = cross_val_score(tree_reg, X_text_tv2, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(f"Average CV score Decision Tree + CountVectorizer = {np.mean(-cvs_tree_cv):.2f}")
print(f"Average CV score Decision Tree + TfidfVectorizer with ngram 1 = {np.mean(-cvs_tree_tv1):.2f}")
print(f"Average CV score Decision Tree + TfidfVectorizer with ngram 2 = {np.mean(-cvs_tree_tv2):.2f}")

Average CV score Decision Tree + CountVectorizer = 18.10
Average CV score Decision Tree + TfidfVectorizer with ngram 1 = 18.25
Average CV score Decision Tree + TfidfVectorizer with ngram 2 = 18.87


`Random Forest`

In [84]:
rf_reg = RandomForestRegressor(random_state=42, max_depth=12)
cvs_rf_cv = cross_val_score(rf_reg, X_text_cv, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_rf_tv1 = cross_val_score(rf_reg, X_text_tv1, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_rf_tv2 = cross_val_score(rf_reg, X_text_tv2, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(f"Average CV score Random Forest + CountVectorizer = {np.mean(-cvs_rf_cv):.2f}")
print(f"Average CV score Random Forest + TfidfVectorizer with ngram 1 = {np.mean(-cvs_rf_tv1):.2f}")
print(f"Average CV score Random Forest + TfidfVectorizer with ngram 2 = {np.mean(-cvs_rf_tv2):.2f}")

Average CV score Random Forest + CountVectorizer = 16.64
Average CV score Random Forest + TfidfVectorizer with ngram 1 = 16.51
Average CV score Random Forest + TfidfVectorizer with ngram 2 = 16.52


#### Word2Vec

In [57]:
n_features = 100

text_corpus = [x.split(" ") for x in X_text]
w2v = Word2Vec(text_corpus, min_count=1, sg=1, vector_size=n_features)

In [58]:
X_text_w2v = [np.mean([w2v.wv[w] for w in review.split(" ")], axis=0) for review in X_text]
X_text_w2v = np.vstack(X_text_w2v)

In [85]:
cvs_sgd_w2v = cross_val_score(sgd_reg, X_text_w2v, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_tree_w2v = cross_val_score(tree_reg, X_text_cv, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_rf_w2v = cross_val_score(rf_reg, X_text_cv, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(f"Average CV score SGD + Word2Vec = {np.mean(-cvs_sgd_w2v):.2f}")
print(f"Average CV score Tree + Word2Vec = {np.mean(-cvs_tree_w2v):.2f}")
print(f"Average CV score Random Forest + Word2Vec = {np.mean(-cvs_rf_w2v):.2f}")

Average CV score SGD + Word2Vec = 18.05
Average CV score Tree + Word2Vec = 18.10
Average CV score Random Forest + Word2Vec = 16.64


### Modelling using all features

Выведем новые признаки из текстовых признаков

In [60]:
# Количество слов 
train_df['Review_Text_Count'] = train_df['Review_Text'].apply(lambda x: len(x.split(" ")))
train_df['Review_Title_Count'] = train_df['Review_Title'].apply(lambda x: len(x.split(" ")))

In [68]:
# Сентимент
sid = SentimentIntensityAnalyzer()
sid.polarity_scores(train_df['Review_Text_Clean'][0])

X_text_sent = X_text.apply(lambda review: sid.polarity_scores(review))
X_text_comp = X_text_sent.apply(lambda score_dict: score_dict['compound'])
X_text_comp = X_text_comp.to_numpy().reshape(-1,1)

Преобразуем категориальный признак `Hotel_name` в OneHotVector 

In [62]:
X_counts = train_df[['Review_Title_Count', 'Review_Text_Count']]
scalar = StandardScaler()
X_counts = scalar.fit_transform(X_counts)

X_names = pd.get_dummies(train_df['Hotel_name'], sparse=True).to_numpy()

In [69]:
X_train = np.hstack((X_text_w2v, X_names, X_counts, X_text_comp))

In [86]:
sgd_reg = SGDRegressor(random_state=42, max_iter=3000)
tree_reg = DecisionTreeRegressor(random_state=42, max_depth=12)
rf_reg = RandomForestRegressor(random_state=42, max_depth=12)

cvs_sgd = cross_val_score(sgd_reg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_tree = cross_val_score(tree_reg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cvs_rf = cross_val_score(rf_reg, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(f"Average CV score SGD = {np.mean(-cvs_sgd):.2f}")
print(f"Average CV score Tree = {np.mean(-cvs_tree):.2f}")
print(f"Average CV score Random Forest = {np.mean(-cvs_rf):.2f}")

Average CV score SGD = 15.92
Average CV score Tree = 18.33
Average CV score Random Forest = 14.28


## Выводы

Я оценивал качество моделей на основе `cross_val_score`, хотя можно было бы и разбить на трейн тест сет, но из-за небольшой выборки я не стал этого делать. 

Я попробовал применить стемминг и лемматизацию, большой разницы не увидел между ними в плане `cross_val_score`. В плане векторизации я попробовал `Countvectorizer`, `TfidfVectorizer` и `Word2Vec`; для `TfidfVectorizer` попробовал 1 и 2 нграммы.  `cross_val_score` у `TfidfVectorizer` и `Word2Vec` были самыми лучшими. Однако у `Word2Vec` размерность вектора намного меньше чем у `TfidfVectorizer`, особенно когда используем 2 н-граммы. Так как отличие в метрике не большое, то лучше в дальнейшем я использовал `Word2Vec` признаки. 

Я попробовал вытащить из текста новые признаки, например количество слов в отзыве и названии отзыва, а также оценку сентимента. Думаю, именно оценка сентимента дал небольшой прирост к метрике. В целом все полученные модели плохо предсказывают оценку рейтинга. Это может быть связана с тем, что данных мало.