In [29]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from catboost import CatBoostRegressor

In [61]:
# Load the labelled training reviews; the file is read with the cp1252 codec.
data = pd.read_csv(
    "./nlp/train.csv",
    encoding="cp1252",
)
# Show the first five rows as the cell output.
data.head(5)


Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


Обучим линейную регрессию с параметрами по умолчанию.

In [32]:
# Target is the Rating column; features are token counts of the raw review text.
y = data["Rating"]
texts = data["Review_Text"]
# NOTE(review): the vocabulary is fitted on every row before cross-validation is run
# on X — confirm this is acceptable for the comparison being made.
X = CountVectorizer().fit_transform(texts)

In [33]:
# Baseline: ordinary least squares on the count features.
linreg = LinearRegression()
linreg.fit(X, y)
pred = linreg.predict(X)  # in-sample predictions; the scores printed below do not use them

# 5-fold cross-validated RMSE: the scorer returns negated MSE per fold, so flip the
# sign of the mean before taking the square root.
cv_neg_mse = cross_val_score(
    LinearRegression(), X, y, scoring="neg_mean_squared_error", cv=5
)
print((-cv_neg_mse.mean()) ** 0.5)

# Sample standard deviation of the target, printed for comparison with the RMSE.
print(np.std(y, ddof=1))

134.3041332908244
21.11400297145028


RMSE на кросс-валидации (≈134.3) в несколько раз больше стандартного отклонения целевой переменной (≈21.1), то есть модель предсказывает хуже константы, равной среднему рейтингу. Попытаемся исправить ситуацию.

In [34]:
# Tokenise: keep only runs of ASCII letters from each review.
texts1 = texts.apply(lambda review: re.findall(r"\b[A-Za-z]+\b", review))

# Make sure the NLTK stop-word corpus is available, then load the English list.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Drop stop words from every token list.
# NOTE(review): the membership test is case-sensitive and tokens are not lower-cased
# here — confirm whether capitalised stop words are meant to survive.
texts_reduced = texts1.apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\poisk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# Reduce every remaining token to its Snowball (English) stem.
stemmer = SnowballStemmer(language="english")
texts_stemm = texts_reduced.apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

In [36]:
# Glue each list of stems back into a single space-separated document.
texts_final = texts_stemm.apply(" ".join)

In [37]:
# Count features built from the stemmed, stop-word-free documents.
X = CountVectorizer().fit_transform(texts_final)

# 5-fold cross-validated RMSE of plain linear regression on these features.
cv_neg_mse = cross_val_score(
    LinearRegression(), X, y, scoring="neg_mean_squared_error", cv=5
)
print((-cv_neg_mse.mean()) ** 0.5)

143.81916730846453


Стало еще хуже.

In [38]:
# Same experiment without stemming: count features from the stop-word-free tokens.
X = CountVectorizer().fit_transform(texts_reduced.apply(" ".join))

# 5-fold cross-validated RMSE of plain linear regression on these features.
cv_neg_mse = cross_val_score(
    LinearRegression(), X, y, scoring="neg_mean_squared_error", cv=5
)
print((-cv_neg_mse.mean()) ** 0.5)

79.32243567311872


In [39]:
# From here on the working corpus is the unstemmed, stop-word-free text.
texts_final = texts_reduced.apply(" ".join)

In [40]:
# Swap raw counts for TF-IDF weights (default unigram vocabulary).
X = TfidfVectorizer().fit_transform(texts_final)

# 5-fold cross-validated RMSE of plain linear regression on these features.
cv_neg_mse = cross_val_score(
    LinearRegression(), X, y, scoring="neg_mean_squared_error", cv=5
)
print((-cv_neg_mse.mean()) ** 0.5)

24.49580915381253


А вот TF-IDF справился заметно лучше, хотя RMSE (≈24.5) всё ещё выше стандартного отклонения целевой переменной (≈21.1).

In [41]:
# Search every n-gram range (low, high) with 1 <= low <= high <= 6 and keep the one
# with the lowest 5-fold cross-validated RMSE for linear regression on TF-IDF features.
best_score = np.inf
best_ngrams = tuple()

# Enumerate ranges grouped by upper bound, lower bound ascending within each group.
ngrams_list = []
for upper in range(1, 7):
    for lower in range(1, upper + 1):
        ngrams_list.append((lower, upper))

for ngrams in ngrams_list:
    X = TfidfVectorizer(ngram_range=ngrams).fit_transform(texts_final)
    cv_neg_mse = cross_val_score(
        LinearRegression(), X, y, scoring="neg_mean_squared_error", cv=5
    )
    score = (-cv_neg_mse.mean()) ** 0.5
    # Strict comparison: on ties the earlier range is kept.
    if score < best_score:
        best_score, best_ngrams = score, ngrams

print(f"Лучший RMSE: {best_score} при n-граммах: от {best_ngrams[0]} до {best_ngrams[1]}.")

Лучший RMSE: 14.183353562655135 при n-граммах: от 1 до 2.


N-граммы помогли достичь неплохого результата.

In [42]:
# Fix the feature space to TF-IDF over unigrams and bigrams.
X = TfidfVectorizer(ngram_range=(1, 2)).fit_transform(texts_final)

# 5-fold cross-validated RMSE of Lasso with its default settings.
cv_neg_mse = cross_val_score(Lasso(), X, y, scoring="neg_mean_squared_error", cv=5)
print((-cv_neg_mse.mean()) ** 0.5)

21.114376226361674


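L1-регрессия (Lasso) с параметрами по умолчанию сработала хуже, чем линейная регрессия на n-граммах: её RMSE (≈21.11) практически совпадает со стандартным отклонением целевой переменной. Переберём коэффициент регуляризации.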

In [43]:
# Grid-search the Lasso regularisation strength on the current feature matrix X,
# keeping the alpha with the lowest 5-fold cross-validated RMSE.
best_score = np.inf
# Initialise the tracked value explicitly so the report below can never pick up a
# stale best_alpha from another cell or fail with NameError when no score improves.
# (The original reset `best_ngrams = 0` here instead, which also destroyed the
# n-gram range found earlier; that name is no longer touched.)
best_alpha = None

# Candidate strengths: 0.1 .. 1.9 in steps of 0.1, then 2 .. 9 in steps of 1.
alphas = np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 10, 1)))
for alpha in alphas:
    # The scorer returns negated MSE per fold; flip the sign of the mean and take the root.
    cv_neg_mse = cross_val_score(
        Lasso(alpha=alpha), X, y, scoring="neg_mean_squared_error", cv=5
    )
    score = (-np.mean(cv_neg_mse)) ** 0.5
    # Strict comparison: on ties the smaller (earlier) alpha is kept.
    if score < best_score:
        best_score = score
        best_alpha = alpha
print(f"Лучший RMSE: {best_score} при коэффициенте: {best_alpha}.")

Лучший RMSE: 21.114376226361674 при коэффициенте: 0.1.
Лучший RMSE: 20.04065699882025 при коэффициенте: 0.1.


Получается всё равно не очень, попробуем теперь L2-регуляризацию (Ridge).

In [44]:
# 5-fold cross-validated RMSE of Ridge with its default settings.
cv_neg_mse = cross_val_score(Ridge(), X, y, scoring="neg_mean_squared_error", cv=5)
print((-cv_neg_mse.mean()) ** 0.5)

14.719073660316925


Переберём для неё коэффициент регуляризации.

In [45]:
# Grid-search the Ridge regularisation strength on the current feature matrix X,
# keeping the alpha with the lowest 5-fold cross-validated RMSE.
best_score = np.inf
# Initialise the tracked value explicitly so the report below can never pick up a
# stale best_alpha from another cell or fail with NameError when no score improves.
# (The original reset `best_ngrams = 0` here instead, which also destroyed the
# n-gram range found earlier; that name is no longer touched.)
best_alpha = None

# Candidate strengths: 0.1 .. 1.9 in steps of 0.1, then 2 .. 9 in steps of 1.
alphas = np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 10, 1)))
for alpha in alphas:
    # The scorer returns negated MSE per fold; flip the sign of the mean and take the root.
    cv_neg_mse = cross_val_score(
        Ridge(alpha=alpha), X, y, scoring="neg_mean_squared_error", cv=5
    )
    score = (-np.mean(cv_neg_mse)) ** 0.5
    # Strict comparison: on ties the smaller (earlier) alpha is kept.
    if score < best_score:
        best_score = score
        best_alpha = alpha
print(f"Лучший RMSE: {best_score} при коэффициенте: {best_alpha}.")

Лучший RMSE: 13.9101254629332 при коэффициенте: 0.1.


Результат у Ridge неплохой, попробуем градиентный бустинг.

In [50]:
# Gradient boosting: 300 CatBoost iterations optimising RMSE, training log silenced.
boost = CatBoostRegressor(
    iterations=300,
    loss_function="RMSE",
    verbose=False,
)

# 5-fold cross-validated RMSE on the current feature matrix.
cv_neg_mse = cross_val_score(boost, X, y, scoring="neg_mean_squared_error", cv=5)
print((-cv_neg_mse.mean()) ** 0.5)


14.906683077730209


Бустинг справился хуже Ridge-регрессии.

In [47]:
# Leave one core free for the rest of the system, but never ask for fewer than one
# worker: on a single-core host `cpu_count() - 1` is 0, and n_jobs=0 is rejected by
# the parallel backend.
workers = max(1, multiprocessing.cpu_count() - 1)

# Default random forest; the seed is fixed (as for the grid-searched forest in this
# notebook) so the printed score is reproducible between runs.
forest = RandomForestRegressor(random_state=42, n_jobs=workers)

# 5-fold cross-validated RMSE: the scorer returns negated MSE per fold.
cv_neg_mse = cross_val_score(forest, X, y, scoring="neg_mean_squared_error", cv=5)
print((-np.mean(cv_neg_mse)) ** 0.5)

15.115697023617876


In [51]:
# Exhaustive 5-fold grid search over random-forest hyper-parameters, scored by
# negated MSE (higher is better for the search).
forest = RandomForestRegressor(random_state=42, n_jobs=workers)
parameters = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, 20],
    # 1.0 (use all features) replaces the legacy "auto" alias, which meant the same
    # thing for regressors but has been deprecated and later removed from
    # scikit-learn, where it now fails parameter validation.
    "max_features": [0.2, 0.333, "sqrt", 1.0],
    "max_samples": [0.1, 0.2, 0.5, 0.7, None],
}
grid_forest = GridSearchCV(
    forest, param_grid=parameters, scoring="neg_mean_squared_error", cv=5
)
grid_forest.fit(X, y)
# Display the winning parameter combination as the cell output.
grid_forest.best_params_


{'max_depth': 20,
 'max_features': 0.2,
 'max_samples': None,
 'n_estimators': 200}

In [52]:
# Re-create the forest with the parameter combination reported by the grid search.
forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    max_features=0.2,
    max_samples=None,
    random_state=42,
    n_jobs=workers,
)

# 5-fold cross-validated RMSE of the tuned forest.
cv_neg_mse = cross_val_score(forest, X, y, scoring="neg_mean_squared_error", cv=5)
print((-cv_neg_mse.mean()) ** 0.5)

15.644247254387942


Случайный лес тоже не улучшил результат, так что наилучшая модель — Ridge-регрессия с коэффициентом 0.1.

In [53]:
# Load the unlabelled test reviews with the same codec as the training file.
data_test = pd.read_csv(
    "./nlp/test.csv",
    encoding="cp1252",
)

In [54]:
# Share of missing values in each column of the test set.
data_test.isnull().mean()

Id              0.000000
Hotel_name      0.000000
Review_Title    0.088861
Review_Text     0.000000
dtype: float64

In [55]:
# Tokenise the test reviews with the same letters-only pattern used for training.
test = data_test["Review_Text"]
test1 = test.apply(lambda review: re.findall(r"\b[A-Za-z]+\b", review))

In [56]:
# Make sure the NLTK stop-word corpus is available, then load the English list.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Drop stop words from every test token list (same case-sensitive filter as for training).
test_reduced = test1.apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\poisk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Fit the unigram+bigram TF-IDF vocabulary on the training corpus only, then project
# both the training and the test documents into that same feature space.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_vectorizer.fit(texts_final)

X = tfidf_vectorizer.transform(texts_final)
X_test = tfidf_vectorizer.transform(test_reduced.apply(" ".join))

In [58]:
# Train the selected model (Ridge, alpha=0.1) on all training rows and score the test set.
ridge = Ridge(alpha=0.1).fit(X, y)
prediction = ridge.predict(X_test)

In [59]:
# Pair every test Id with its predicted rating.
result = data_test[["Id"]].assign(Rating=prediction)

In [60]:
# Display the table of test Ids and predicted ratings as the cell output.
result

Unnamed: 0,Id,Rating
0,2351,97.071864
1,2352,67.822764
2,2353,79.717484
3,2354,37.791663
4,2355,88.017940
...,...,...
2347,4698,70.715455
2348,4699,67.879933
2349,4700,51.795172
2350,4701,43.407403


Лучший RMSE: 13.9101254629332