# Домашняя работа

In [73]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [29]:
df = pd.read_csv('train.csv', encoding='cp1252')
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


In [30]:
df.isna().sum()

Id                0
Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

In [31]:
print('Количество различных отелей:' , df['Hotel_name'].nunique())
print('Общее количество отзывов: ', df['Hotel_name'].size)

Количество различных отелей: 243
Общее количество отзывов:  2351


Выше я считал кол-во уникальных отелей, подозревая, что это категориальный признак и его не нужно предобрабатывать, а просто сделать One-hot Encoding. Но 243 уникальных значения - это слишком много, поэтому буду работать со всеми колонками.

Уберём пунктуацию из предложений. Но можно её не терять, а создать отдельную колонку с количеством знаков препинания в отзыве.

In [32]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [33]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = str(text).replace(punctuation, '')
    return text

def count_punctuations(text):
    counter = 0
    for symbol in text:
        if symbol in string.punctuation:
            counter += 1
    return counter

df["Punctuation"] = df['Review_Text'].apply(count_punctuations)
df["Review_Text"] = df['Review_Text'].apply(remove_punctuations)
df["Review_Title"] = df['Review_Title'].apply(remove_punctuations)
df["Hotel_name"] = df['Hotel_name'].apply(remove_punctuations)

df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility All aroun...,80.0,5
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe \r\nStaff were ...,100.0,4
2,2,The Royal Regency,No worth the rating shown in websites Pricing ...,Not worth the rating shown Service is not good...,71.0,8
3,3,Rivera,Good stay,First of all nice courteous staff only one co...,86.0,6
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good In ...,86.0,4


##### Токенизация. Разделим на слова.

In [34]:
import re
def split_text(text):
    expr = r'[^(\w.\w)\w\s]'
    parser=re.compile(expr)
    tmp_string = parser.sub(r'', text)
    return tmp_string.split()

df["Review_Text"] = df['Review_Text'].apply(split_text)
df["Review_Title"] = df['Review_Title'].apply(split_text)
df["Hotel_name"] = df['Hotel_name'].apply(split_text)

df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation
0,0,"[Park, Hyatt]","[Refuge, in, Chennai]","[Excellent, room, and, exercise, facility, All...",80.0,5
1,1,"[Hilton, Chennai]","[Hilton, Chennai]","[Very, comfortable, and, felt, safe, Staff, we...",100.0,4
2,2,"[The, Royal, Regency]","[No, worth, the, rating, shown, in, websites, ...","[Not, worth, the, rating, shown, Service, is, ...",71.0,8
3,3,[Rivera],"[Good, stay]","[First, of, all, nice, courteous, staff, only,...",86.0,6
4,4,"[Park, Hyatt]","[Needs, improvement]","[Overall, ambience, of, the, hotel, is, very, ...",86.0,4


##### Приведём все слова к нижнему регистру

In [35]:
def to_lower(text):
    text = [word.lower() for word in text]
    return text

df["Review_Text"] = df['Review_Text'].apply(to_lower)
df["Review_Title"] = df['Review_Title'].apply(to_lower)
df["Hotel_name"] = df['Hotel_name'].apply(to_lower)

df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation
0,0,"[park, hyatt]","[refuge, in, chennai]","[excellent, room, and, exercise, facility, all...",80.0,5
1,1,"[hilton, chennai]","[hilton, chennai]","[very, comfortable, and, felt, safe, staff, we...",100.0,4
2,2,"[the, royal, regency]","[no, worth, the, rating, shown, in, websites, ...","[not, worth, the, rating, shown, service, is, ...",71.0,8
3,3,[rivera],"[good, stay]","[first, of, all, nice, courteous, staff, only,...",86.0,6
4,4,"[park, hyatt]","[needs, improvement]","[overall, ambience, of, the, hotel, is, very, ...",86.0,4


До удаления стоп-слов я решил, на всякий случай запомнить количество слов в отзыве. (Возможно, это тоже коррелирует с таргетом)

In [36]:
df["Length"] = df['Review_Text'].apply(lambda x: len(x))
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length
0,0,"[park, hyatt]","[refuge, in, chennai]","[excellent, room, and, exercise, facility, all...",80.0,5,31
1,1,"[hilton, chennai]","[hilton, chennai]","[very, comfortable, and, felt, safe, staff, we...",100.0,4,26
2,2,"[the, royal, regency]","[no, worth, the, rating, shown, in, websites, ...","[not, worth, the, rating, shown, service, is, ...",71.0,8,43
3,3,[rivera],"[good, stay]","[first, of, all, nice, courteous, staff, only,...",86.0,6,39
4,4,"[park, hyatt]","[needs, improvement]","[overall, ambience, of, the, hotel, is, very, ...",86.0,4,44


##### Удаление стоп-слов

In [37]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
from nltk.corpus import stopwords
sw_eng = set(stopwords.words('english'))

def del_stopwords(text):
    text = [word for word in text if word not in sw_eng]
    return text

df["Review_Text"] = df['Review_Text'].apply(del_stopwords)
df["Review_Title"] = df['Review_Title'].apply(del_stopwords)
df["Hotel_name"] = df['Hotel_name'].apply(del_stopwords)

df.head()


Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length
0,0,"[park, hyatt]","[refuge, chennai]","[excellent, room, exercise, facility, around, ...",80.0,5,31
1,1,"[hilton, chennai]","[hilton, chennai]","[comfortable, felt, safe, staff, helpful, resp...",100.0,4,26
2,2,"[royal, regency]","[worth, rating, shown, websites, pricing, ok]","[worth, rating, shown, service, good, room, we...",71.0,8,43
3,3,[rivera],"[good, stay]","[first, nice, courteous, staff, one, con, stay...",86.0,6,39
4,4,"[park, hyatt]","[needs, improvement]","[overall, ambience, hotel, good, room, facilit...",86.0,4,44


##### Применим Лемматизацию

In [39]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
def lemmatize_text(text):
    sentence = ' '.join(text)
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))

    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return lemmatized_sentence


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alexe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [41]:
df["Review_Text"] = df['Review_Text'].apply(lemmatize_text)
df["Review_Title"] = df['Review_Title'].apply(lemmatize_text)
df["Hotel_name"] = df['Hotel_name'].apply(lemmatize_text)

df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length
0,0,"[park, hyatt]","[refuge, chennai]","[excellent, room, exercise, facility, around, ...",80.0,5,31
1,1,"[hilton, chennai]","[hilton, chennai]","[comfortable, felt, safe, staff, helpful, resp...",100.0,4,26
2,2,"[royal, regency]","[worth, rating, show, website, price, ok]","[worth, rating, show, service, good, room, wel...",71.0,8,43
3,3,[rivera],"[good, stay]","[first, nice, courteous, staff, one, con, stay...",86.0,6,39
4,4,"[park, hyatt]","[need, improvement]","[overall, ambience, hotel, good, room, facilit...",86.0,4,44


In [42]:
df["Review_Text"] = df['Review_Text'].apply(lambda x: ' '.join(x))
df["Review_Title"] = df['Review_Title'].apply(lambda x: ' '.join(x))
df["Hotel_name"] = df['Hotel_name'].apply(lambda x: ' '.join(x))

df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length
0,0,park hyatt,refuge chennai,excellent room exercise facility around atmosp...,80.0,5,31
1,1,hilton chennai,hilton chennai,comfortable felt safe staff helpful respectful...,100.0,4,26
2,2,royal regency,worth rating show website price ok,worth rating show service good room well maint...,71.0,8,43
3,3,rivera,good stay,first nice courteous staff one con stay time c...,86.0,6,39
4,4,park hyatt,need improvement,overall ambience hotel good room facility need...,86.0,4,44


##### Слияние всего текста в одну колонку

In [43]:
df['Full_text'] = df['Hotel_name'] + ' ' + df['Review_Title']+ ' ' + df['Review_Text']
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length,Full_text
0,0,park hyatt,refuge chennai,excellent room exercise facility around atmosp...,80.0,5,31,park hyatt refuge chennai excellent room exerc...
1,1,hilton chennai,hilton chennai,comfortable felt safe staff helpful respectful...,100.0,4,26,hilton chennai hilton chennai comfortable felt...
2,2,royal regency,worth rating show website price ok,worth rating show service good room well maint...,71.0,8,43,royal regency worth rating show website price ...
3,3,rivera,good stay,first nice courteous staff one con stay time c...,86.0,6,39,rivera good stay first nice courteous staff on...
4,4,park hyatt,need improvement,overall ambience hotel good room facility need...,86.0,4,44,park hyatt need improvement overall ambience h...


### Представление текста

In [44]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

##### One-Hot encoding

In [45]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
X = label_encoder.fit_transform(df['Full_text'])

In [46]:
onehot_encoder = OneHotEncoder(sparse=False)
X = X.reshape(len(X), 1)
X = onehot_encoder.fit_transform(X)

y = df['Rating']

linreg = LinearRegression()

print('RMSE при использовании One-Hot encoding:',
      (-np.mean(cross_val_score(linreg, X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании One-Hot encoding: 63.90137285971259


###### Count Vectorizer

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['Full_text'])

print('RMSE при использовании Count Vectorizer:',
            (-np.mean(cross_val_score(linreg, X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании Count Vectorizer: 44.39519631742207


##### TF-IDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
idf_vectorizer = TfidfVectorizer()

In [49]:
X = idf_vectorizer.fit_transform(df['Full_text'])
linreg.fit(X, y)
pred = linreg.predict(X)
print('RMSE при использовании TF-IDF:',
    (-np.mean(cross_val_score(LinearRegression(), X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании TF-IDF: 22.036023206884618


Лучший результат показан при использовании TF-IDF для представления текста, поэтому в дальнейшем (при переборе моделей, гиперпараметров и т.д.) будем пользоваться этим способом. 

### Feature-Engineering

Посмотрим, какие будут результаты при добавлении ранее занесённых в датафрейм признаков.

In [50]:
X_dop_features = np.hstack((X.toarray() ,np.array(df['Punctuation']).reshape(len(df['Punctuation']), 1) ))

In [51]:
print('RMSE при использовании TF-IDF:',
    (-np.mean(cross_val_score(LinearRegression(), X_dop_features, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании TF-IDF: 22.153475787778177


Добавление столбца с количеством знаков препинания привело к небольшому ухудшению результата.

In [52]:
X_dop_features = np.hstack((X.toarray() ,np.array(df['Length']).reshape(len(df['Length']), 1) ))

In [53]:
print('RMSE при использовании TF-IDF:',
    (-np.mean(cross_val_score(LinearRegression(), X_dop_features, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании TF-IDF: 22.266463716707996


Добавление столбца с длинной отзыва тоже немного ухудшило результат.

Нормализуем эти два столбца и попробуем ещё раз:

In [54]:
df['Norm_Punctuation'] = (df['Punctuation']-df['Punctuation'].min())/(df['Punctuation'].max()-df['Punctuation'].min())
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length,Full_text,Norm_Punctuation
0,0,park hyatt,refuge chennai,excellent room exercise facility around atmosp...,80.0,5,31,park hyatt refuge chennai excellent room exerc...,0.014286
1,1,hilton chennai,hilton chennai,comfortable felt safe staff helpful respectful...,100.0,4,26,hilton chennai hilton chennai comfortable felt...,0.011429
2,2,royal regency,worth rating show website price ok,worth rating show service good room well maint...,71.0,8,43,royal regency worth rating show website price ...,0.022857
3,3,rivera,good stay,first nice courteous staff one con stay time c...,86.0,6,39,rivera good stay first nice courteous staff on...,0.017143
4,4,park hyatt,need improvement,overall ambience hotel good room facility need...,86.0,4,44,park hyatt need improvement overall ambience h...,0.011429


In [55]:
X_dop_features = np.hstack((X.toarray() ,np.array(df['Norm_Punctuation']).reshape(len(df['Norm_Punctuation']), 1) ))
print('RMSE при использовании TF-IDF:',
    (-np.mean(cross_val_score(LinearRegression(), X_dop_features, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании TF-IDF: 22.238729576034718


In [56]:
df['Norm_Length'] = (df['Length']-df['Length'].min())/(df['Length'].max()-df['Length'].min())
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length,Full_text,Norm_Punctuation,Norm_Length
0,0,park hyatt,refuge chennai,excellent room exercise facility around atmosp...,80.0,5,31,park hyatt refuge chennai excellent room exerc...,0.014286,0.348315
1,1,hilton chennai,hilton chennai,comfortable felt safe staff helpful respectful...,100.0,4,26,hilton chennai hilton chennai comfortable felt...,0.011429,0.292135
2,2,royal regency,worth rating show website price ok,worth rating show service good room well maint...,71.0,8,43,royal regency worth rating show website price ...,0.022857,0.483146
3,3,rivera,good stay,first nice courteous staff one con stay time c...,86.0,6,39,rivera good stay first nice courteous staff on...,0.017143,0.438202
4,4,park hyatt,need improvement,overall ambience hotel good room facility need...,86.0,4,44,park hyatt need improvement overall ambience h...,0.011429,0.494382


In [57]:
X_dop_features = np.hstack((X.toarray() ,np.array(df['Norm_Length']).reshape(len(df['Norm_Length']), 1) ))
print('RMSE при использовании TF-IDF:',
    (-np.mean(cross_val_score(LinearRegression(), X_dop_features, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании TF-IDF: 21.99868763295044


При добавлении номализованного столбца "длина" результат немного улучшился. Поэтому добавим этот признак в X:

In [58]:
X = np.hstack((X.toarray() ,np.array(df['Norm_Length']).reshape(len(df['Norm_Length']), 1) ))

Посчитаем кол-во положительных и отрицательных слов в отзыве:

In [62]:
with open('positive-words.txt', 'r') as pos_file:
    positive_list = pos_file.read().split()

with open('negative-words.txt', 'r') as neg_file:
    negative_list = neg_file.read().split()

In [70]:
def count_pos_words(text):
    counter = 0
    for word in text.split():
        if word in positive_list:
            counter += 1
    return counter

def count_neg_words(text):
    counter = 0
    for word in text.split():
        if word in negative_list:
            counter += 1
    return counter

df["Positive_words"] = df['Full_text'].apply(count_pos_words)
df["Negative_words"] = df['Full_text'].apply(count_neg_words)

#Нормализую
df['Positive_words'] = (df['Positive_words']-df['Positive_words'].min())/(df['Positive_words'].max()-df['Positive_words'].min())
df['Negative_words'] = (df['Negative_words']-df['Negative_words'].min())/(df['Negative_words'].max()-df['Negative_words'].min())


df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Punctuation,Length,Full_text,Norm_Punctuation,Norm_Length,Positive_words,Negative_words
0,0,park hyatt,refuge chennai,excellent room exercise facility around atmosp...,80.0,5,31,park hyatt refuge chennai excellent room exerc...,0.014286,0.348315,0.222222,0.0
1,1,hilton chennai,hilton chennai,comfortable felt safe staff helpful respectful...,100.0,4,26,hilton chennai hilton chennai comfortable felt...,0.011429,0.292135,0.222222,0.0
2,2,royal regency,worth rating show website price ok,worth rating show service good room well maint...,71.0,8,43,royal regency worth rating show website price ...,0.022857,0.483146,0.5,0.090909
3,3,rivera,good stay,first nice courteous staff one con stay time c...,86.0,6,39,rivera good stay first nice courteous staff on...,0.017143,0.438202,0.277778,0.181818
4,4,park hyatt,need improvement,overall ambience hotel good room facility need...,86.0,4,44,park hyatt need improvement overall ambience h...,0.011429,0.494382,0.166667,0.0


In [71]:
X_dop_features = np.hstack((X ,np.array(df['Positive_words']).reshape(len(df['Positive_words']), 1) ))
X_dop_features = np.hstack((X_dop_features ,np.array(df['Negative_words']).reshape(len(df['Negative_words']), 1) ))
print('RMSE при использовании TF-IDF:',
    (-np.mean(cross_val_score(LinearRegression(), X_dop_features, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании TF-IDF: 22.009265843636122


Результат ухудшился, поэтому оставим всё в прежнем виде.

### Выбор модели и подбор гиперпараметров:

##### Линейная регрессия со стохастическим градиентным спуском:

In [74]:
from sklearn.linear_model import SGDRegressor
 
print('RMSE при использовании SGDRegressor:',
    (-np.mean(cross_val_score(SGDRegressor(), X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании SGDRegressor: 14.468227157482058


Мы видим, что SGDRegressor показал результат намного лучше чем обычная линейная регрессия. Поэтому параметры переберём только для sgd.

In [84]:
import itertools

penalty_list = ["l2","l1", "elasticnet"]
loss_list = ["squared_loss", "huber", "epsilon_insensitive"]
for penalty in penalty_list:
    for loss in loss_list:
        print([loss, penalty, (-np.mean(cross_val_score(SGDRegressor(loss=loss, penalty=penalty, max_iter=100), X, y, 
                                                   scoring="neg_mean_squared_error", cv=5)))**0.5])

['squared_loss', 'l2', 16.3979668753115]
['huber', 'l2', 62.09813284900803]
['epsilon_insensitive', 'l2', 21.63206413588306]
['squared_loss', 'l1', 16.382778476129513]
['huber', 'l1', 62.12818311046043]
['epsilon_insensitive', 'l1', 21.645569982380493]
['squared_loss', 'elasticnet', 16.394087149034057]
['huber', 'elasticnet', 62.10338756010017]
['epsilon_insensitive', 'elasticnet', 21.63438507710914]


Лучшие параметры - squared_loss и l1-регуляризация.

In [89]:
for learning_rate in ['constant', 'optimal', 'invscaling', 'adaptive']:
    print([learning_rate, (-np.mean(cross_val_score(SGDRegressor(loss='squared_loss', penalty='l1', learning_rate = learning_rate, 
                                                                 max_iter=100), X, y, scoring="neg_mean_squared_error", cv=5)))**0.5])


['constant', 14.159879301316908]
['optimal', 101174610043.2554]
['invscaling', 16.381997725605256]
['adaptive', 14.179294176456155]


Лучший параметр для learning rate - constant

In [93]:
print("Итоговый результат для модели линейной регрессии:",(-np.mean(cross_val_score(SGDRegressor(loss='squared_loss', penalty='l1', learning_rate = 'constant', 
                                                                 max_iter=100), X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)


Итоговый результат для модели линейной регрессии: 14.17458893580358


##### Решающие деревья

In [85]:
from sklearn.tree import DecisionTreeRegressor

print('RMSE при использовании DecisionTreeRegressor:',
    (-np.mean(cross_val_score(DecisionTreeRegressor(), X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании DecisionTreeRegressor: 18.489858746112432


##### Случайный лес:

In [86]:
from sklearn.ensemble import RandomForestRegressor

print('RMSE при использовании RandomForestRegressor:',
    (-np.mean(cross_val_score(RandomForestRegressor(n_estimators=50), X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании RandomForestRegressor: 14.073800621402


In [88]:
import catboost as cb

print('RMSE при использовании RandomForestRegressor:',
    (-np.mean(cross_val_score(cb.CatBoostRegressor(n_estimators=50, verbose=False), X, y, 
                              scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании RandomForestRegressor: 15.009477264467717


RandomForestRegressor в данном случае (без подбора гиперпараметров) работает лучше чем CatBoostRegressor. Но так как он намного дольше, я переберу параметру на CatBoost.

In [95]:
learning_rate_list = [ 0.1, 0.2, 0.3, 0.5, 0.7, 0.9]

for i in learning_rate_list:
    print('lr=', i, ' rmse=' , (-np.mean(cross_val_score(cb.CatBoostRegressor(n_estimators=50, verbose=False, learning_rate=i), X, y, 
                              scoring="neg_mean_squared_error", cv=5)))**0.5)

lr= 0.1  rmse= 16.144631242787913
lr= 0.2  rmse= 15.482200456005948
lr= 0.3  rmse= 15.242550725348332
lr= 0.5  rmse= 15.009477264467717
lr= 0.7  rmse= 15.00904986802074
lr= 0.9  rmse= 15.257844150943361


In [96]:
depth_list = [4, 5, 6, 7, 8, 9, 10]

for i in depth_list:
    print('max_depth=', i, ' rmse=', (-np.mean(cross_val_score(cb.CatBoostRegressor(n_estimators=50, verbose=False, depth=i), X, y, 
                              scoring="neg_mean_squared_error", cv=5)))**0.5)

max_depth= 4  rmse= 15.413204163753791
max_depth= 5  rmse= 15.137000617121119
max_depth= 6  rmse= 15.009477264467717
max_depth= 7  rmse= 14.863997540045101
max_depth= 8  rmse= 14.672689037688675
max_depth= 9  rmse= 14.563269803950776
max_depth= 10  rmse= 14.626380364234764


In [97]:
l2_leaf_reg_list = [0.1, 0.2, 0.3, 0.5, 0.7, 1, 2, 3]

for i in l2_leaf_reg_list:
    print('l2_leaf_reg=', i, ' rmse=' , (-np.mean(cross_val_score(cb.CatBoostRegressor(n_estimators=50, verbose=False, l2_leaf_reg=i), X, y, 
                              scoring="neg_mean_squared_error", cv=5)))**0.5)

l2_leaf_reg= 0.1  rmse= 17.707055302886445
l2_leaf_reg= 0.2  rmse= 17.713098907050004
l2_leaf_reg= 0.3  rmse= 17.724266537046745
l2_leaf_reg= 0.5  rmse= 17.742025792750265
l2_leaf_reg= 0.7  rmse= 17.771489272337835
l2_leaf_reg= 1  rmse= 17.818828891080067
l2_leaf_reg= 2  rmse= 17.86658500121832
l2_leaf_reg= 3  rmse= 17.91324650512669


In [103]:
bootstrap_type_list = ['Bayesian' ,'Bernoulli' ,'MVS' ,'No']

for i in bootstrap_type_list:
    print('bootstrap_type=', i, ' rmse=' , (-np.mean(cross_val_score(cb.CatBoostRegressor(bootstrap_type=i,n_estimators=50,depth=9, 
                                                                                          verbose=False), 
                                                                     X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

bootstrap_type= Bayesian  rmse= 14.835920407511292
bootstrap_type= Bernoulli  rmse= 14.541478628846484
bootstrap_type= MVS  rmse= 14.563269803950776
bootstrap_type= No  rmse= 14.692186997123546


Гиперпараметр, который улучшил результат на CatBoost - это depth=9. Посмотрим, какой результат покажет RandomForest с этим параметром.


In [106]:
print('RMSE при использовании RandomForestRegressor:',
    (-np.mean(cross_val_score(RandomForestRegressor(n_estimators=50, max_depth=9), X, y, scoring="neg_mean_squared_error", cv=5)))**0.5)

RMSE при использовании RandomForestRegressor: 16.21733196420093


### Итог:


Лучший результат RMSE = 14.07 (на RandomForestRegressor с параметрами по умолчанию). Но практически такой же результат (14.17) на линейной регрессии после подбора гиперпараметров.