In [201]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk import wordnet, pos_tag, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [119]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [3]:
nltk.download('stopwords')
sw_eng = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **Датасет**

In [151]:
train = pd.read_csv('train_1.csv', encoding='unicode_escape')
test = pd.read_csv('test_1.csv', encoding='unicode-escape')

In [152]:
train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


In [153]:
train.isna().sum()

Id                0
Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

In [154]:
train = train.dropna()

In [155]:
train.isna().sum()

Id              0
Hotel_name      0
Review_Title    0
Review_Text     0
Rating          0
dtype: int64

In [156]:
re.sub(r'\s+', '', train['Review_Text'][1])

'Verycomfortableandfeltsafe.Staffwereveryhelpfulandrespectful.Breakfastofferedawidechoicewhichcarteredforeverypalate,startedearlyandfinishedlate.'

In [157]:
re.sub(' +', '', train['Review_Text'][1])

'Verycomfortableandfeltsafe.\r\nStaffwereveryhelpfulandrespectful.Breakfastofferedawidechoicewhichcarteredforeverypalate,startedearlyandfinishedlate.'

In [158]:
def f(f):
  f = f.lower()
  f = re.sub('[^a-z]', ' ', f)
  f = re.sub('\s+', ' ', f)
  f = ' '.join([word for word in f.split() if not word in sw_eng])
  return f

In [159]:
train['Review_Title_f'] = train['Review_Title'].apply(f)
train['Review_Text_f'] = train['Review_Text'].apply(f)

In [160]:
train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Review_Title_f,Review_Text_f
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating shown websites pricing ok,worth rating shown service good room well main...
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,needs improvement,overall ambience hotel good room facilities ne...


In [161]:
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

def my_lemmatizer(sent):
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])

In [164]:
train['Review_Title_fl'] = train['Review_Title_f'].apply(my_lemmatizer)
train['Review_Text_fl'] = train['Review_Text_f'].apply(my_lemmatizer)

In [165]:
train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Review_Title_f,Review_Text_f,Review_Title_fl,Review_Text_fl
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...,refuge chennai,excellent room exercise facility around atmosp...
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...,hilton chennai,comfortable felt safe staff helpful respectful...
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating shown websites pricing ok,worth rating shown service good room well main...,worth rating show website price ok,worth rating show service good room well maint...
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...,good stay,first nice courteous staff one con stay time c...
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,needs improvement,overall ambience hotel good room facilities ne...,need improvement,overall ambience hotel good room facility need...


In [166]:
with open('positive-words.txt') as p_w:
  p_w = p_w.read()

with open('negative-words.txt') as n_w:
  n_w = n_w.read()

In [167]:
def f_pos(f):
  f = [word for word in f.split() if word in p_w]
  return f

def f_neg(f):
  f = [word for word in f.split() if word in n_w]
  return f

In [168]:
train['Text_pos'] = train['Review_Text_fl'].apply(f_pos)
train['Text_neg'] = train['Review_Text_fl'].apply(f_neg)

In [169]:
train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Review_Title_f,Review_Text_f,Review_Title_fl,Review_Text_fl,Text_pos,Text_neg
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...,refuge chennai,excellent room exercise facility around atmosp...,"[excellent, room, around, calm, comfortable, r...","[comfortable, service, avoid, stick]"
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...,hilton chennai,comfortable felt safe staff helpful respectful...,"[comfortable, felt, safe, helpful, respectful,...","[comfortable, safe, helpful, respectful, start..."
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating shown websites pricing ok,worth rating shown service good room well main...,worth rating show website price ok,worth rating show service good room well maint...,"[worth, rating, good, room, well, room, spacio...","[worth, rating, show, service, well, poor, wor..."
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...,good stay,first nice courteous staff one con stay time c...,"[first, nice, courteous, one, con, time, time,...","[courteous, one, con, time, check, night, inap..."
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,needs improvement,overall ambience hotel good room facilities ne...,need improvement,overall ambience hotel good room facility need...,"[good, room, improve, provide, per, room, prov...","[need, need, service, per, person]"


In [82]:
W = set()
for i in train['Text_pos']:
  i = str(i)
  i = i[1:-1]
  f=i.split("', '")
  W.update(set(f))
print(W)

{'look', '', "'prefer", 'dear', "'bar", 'adv', "j'", "free'", 'efficient', "'long", 'free', 'dead', 'fortunate', 'complimentary', 'hole', "ac'", "'venture", "rude'", "'pleasant", 'found', 'popular', 'thro', "'call", "oven'", 'damaged', 'fully', 'room', "impress'", "reason'", "'enjoy'", "effect'", 'run', "'happy'", 'age', 'viable', 'adequate', "supportive'", "enjoy'", "'satisfy", 'tell', 'speedy', 'f', 'sport', "'fantastic", 'ull', 'improve', 'lite', 'wonderful', 'except', 'love', 'deal', "condition'", "'air", 'unlimited', 'h', "picture'", 'usp', "thought'", 'operative', 'dent', 'responsive', 'favour', "'quiet", 'happily', "ate'", 'bless', 'come', 'question', 'lan', "'attraction", "deal'", "'par", 'term', 'tag', "comfort'", 'lean', 'nic', 'pas', 'compliment', 'lively', "'access", 'advance', "popular'", 'top', "one'", "'lift", 'comprehensive', "'peaceful", 'soft', "'rude", 'rate', "memorable'", "ideally'", 'politeness', "'city", 'super', "'decent", "cash'", "cost'", "'mid", "'extra", 'us

In [83]:
W = set()
for i in train['Text_neg']:
  i = str(i)
  i = i[1:-1]
  f=i.split("', '")
  W.update(set(f))
print(W)



In [170]:
train['Review_Title_Text'] = train['Review_Title_fl'] + ' ' + train['Review_Text_fl']

In [171]:
train['Review_Title_Text_pos'] = train['Review_Title_Text'].apply(f_pos)
train['Review_Title_Text_neg'] = train['Review_Title_Text'].apply(f_neg)

In [172]:
train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Review_Title_f,Review_Text_f,Review_Title_fl,Review_Text_fl,Text_pos,Text_neg,Review_Title_Text,Review_Title_Text_pos,Review_Title_Text_neg
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...,refuge chennai,excellent room exercise facility around atmosp...,"[excellent, room, around, calm, comfortable, r...","[comfortable, service, avoid, stick]",refuge chennai excellent room exercise facilit...,"[excellent, room, around, calm, comfortable, r...","[comfortable, service, avoid, stick]"
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...,hilton chennai,comfortable felt safe staff helpful respectful...,"[comfortable, felt, safe, helpful, respectful,...","[comfortable, safe, helpful, respectful, start...",hilton chennai comfortable felt safe staff hel...,"[comfortable, felt, safe, helpful, respectful,...","[comfortable, safe, helpful, respectful, start..."
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating shown websites pricing ok,worth rating shown service good room well main...,worth rating show website price ok,worth rating show service good room well maint...,"[worth, rating, good, room, well, room, spacio...","[worth, rating, show, service, well, poor, wor...",worth rating show website price ok worth ratin...,"[worth, rating, price, ok, worth, rating, good...","[worth, rating, show, price, ok, worth, rating..."
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...,good stay,first nice courteous staff one con stay time c...,"[first, nice, courteous, one, con, time, time,...","[courteous, one, con, time, check, night, inap...",good stay first nice courteous staff one con s...,"[good, first, nice, courteous, one, con, time,...","[courteous, one, con, time, check, night, inap..."
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,needs improvement,overall ambience hotel good room facilities ne...,need improvement,overall ambience hotel good room facility need...,"[good, room, improve, provide, per, room, prov...","[need, need, service, per, person]",need improvement overall ambience hotel good r...,"[improvement, good, room, improve, provide, pe...","[need, need, need, service, per, person]"


In [103]:
train.head()

Unnamed: 0,Id,Hotel_name,Rating,Review_Title_fl,Review_Text_fl,Review_Title_Text,Review_Title_Text_pos,Review_Title_Text_neg
0,0,Park Hyatt,80.0,refuge chennai,excellent room exercise facility around atmosp...,refuge chennai excellent room exercise facilit...,"[excellent, room, around, calm, comfortable, r...","[comfortable, service, avoid, stick]"
1,1,Hilton Chennai,100.0,hilton chennai,comfortable felt safe staff helpful respectful...,hilton chennai comfortable felt safe staff hel...,"[comfortable, felt, safe, helpful, respectful,...","[comfortable, safe, helpful, respectful, start..."
2,2,The Royal Regency,71.0,worth rating show website price ok,worth rating show service good room well maint...,worth rating show website price ok worth ratin...,"[worth, rating, price, ok, worth, rating, good...","[worth, rating, show, price, ok, worth, rating..."
3,3,Rivera,86.0,good stay,first nice courteous staff one con stay time c...,good stay first nice courteous staff one con s...,"[good, first, nice, courteous, one, con, time,...","[courteous, one, con, time, check, night, inap..."
4,4,Park Hyatt,86.0,need improvement,overall ambience hotel good room facility need...,need improvement overall ambience hotel good r...,"[improvement, good, room, improve, provide, pe...","[need, need, need, service, per, person]"


In [104]:
W = set()
for i in train['Review_Title_Text_pos']:
  i = str(i)
  i = i[1:-1]
  f=i.split("', '")
  W.update(set(f))
print(W)

{'look', '', "'prefer", 'dear', "'bar", 'adv', "j'", "free'", 'efficient', 'fortunate', 'free', 'dead', "rude'", 'complimentary', 'hole', "ac'", "'pleasant", 'found', 'popular', 'thro', "'call", "oven'", 'damaged', 'fully', "'path", 'room', "impress'", "reason'", "'forget", "effect'", 'run', 'age', 'viable', 'adequate', "supportive'", "enjoy'", "'satisfy", 'tell', 'speedy', 'f', 'sport', "'fantastic", 'ull', "'worthy", 'improve', 'lite', 'wonderful', 'except', 'love', "'issue", 'deal', "condition'", "'air", 'unlimited', 'h', "picture'", 'usp', "thought'", 'operative', "'star", 'dent', 'son', 'responsive', 'favour', "'quiet", 'happily', "ate'", 'bless', 'come', 'question', 'lan', "deal'", "'par", 'term', 'tag', "comfort'", 'lean', 'nic', 'pas', 'compliment', 'lively', "'access", 'advance', "popular'", 'top', "one'", 'comprehensive', "'peaceful", "'rude", 'soft', 'rate', "memorable'", 'cheaper', "ideally'", 'politeness', "'city", 'super', "'decent", "cash'", "cost'", "'extra", 'useful', 

In [105]:
W = set()
for i in train['Review_Title_Text_neg']:
  i = str(i)
  i = i[1:-1]
  f=i.split("', '")
  W.update(set(f))
print(W)



In [173]:
def f_count(f):
  return len(f)

In [174]:
train['Pos_count'] = train['Review_Title_Text_pos'].apply(f_count)
train['Neg_count'] = train['Review_Title_Text_neg'].apply(f_count)

In [175]:
train.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,Review_Title_f,Review_Text_f,Review_Title_fl,Review_Text_fl,Text_pos,Text_neg,Review_Title_Text,Review_Title_Text_pos,Review_Title_Text_neg,Pos_count,Neg_count
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...,refuge chennai,excellent room exercise facility around atmosp...,"[excellent, room, around, calm, comfortable, r...","[comfortable, service, avoid, stick]",refuge chennai excellent room exercise facilit...,"[excellent, room, around, calm, comfortable, r...","[comfortable, service, avoid, stick]",8,4
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...,hilton chennai,comfortable felt safe staff helpful respectful...,"[comfortable, felt, safe, helpful, respectful,...","[comfortable, safe, helpful, respectful, start...",hilton chennai comfortable felt safe staff hel...,"[comfortable, felt, safe, helpful, respectful,...","[comfortable, safe, helpful, respectful, start...",7,7
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating shown websites pricing ok,worth rating shown service good room well main...,worth rating show website price ok,worth rating show service good room well maint...,"[worth, rating, good, room, well, room, spacio...","[worth, rating, show, service, well, poor, wor...",worth rating show website price ok worth ratin...,"[worth, rating, price, ok, worth, rating, good...","[worth, rating, show, price, ok, worth, rating...",16,14
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...,good stay,first nice courteous staff one con stay time c...,"[first, nice, courteous, one, con, time, time,...","[courteous, one, con, time, check, night, inap...",good stay first nice courteous staff one con s...,"[good, first, nice, courteous, one, con, time,...","[courteous, one, con, time, check, night, inap...",12,9
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,needs improvement,overall ambience hotel good room facilities ne...,need improvement,overall ambience hotel good room facility need...,"[good, room, improve, provide, per, room, prov...","[need, need, service, per, person]",need improvement overall ambience hotel good r...,"[improvement, good, room, improve, provide, pe...","[need, need, need, service, per, person]",9,6


In [176]:
X = train.drop(columns='Rating')
y = train['Rating']

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)

# **Модель**

**CountVectorizer**

In [209]:
c_v = CountVectorizer(max_df=0.75)
X_train_c_v = c_v.fit_transform(X_train['Review_Text_fl'])
X_test_c_v = c_v.transform(X_test['Review_Text_fl'])

rfc = RandomForestClassifier()
rfc.fit(X_train_c_v, y_train)
y_pred = rfc.predict(X_test_c_v)
print(accuracy_score(y_test, y_pred))

0.5350467289719626


In [214]:
c_v = CountVectorizer(max_df=0.75)
X_train_c_v = c_v.fit_transform(X_train['Review_Title_Text'])
X_test_c_v = c_v.transform(X_test['Review_Title_Text'])

rfc = RandomForestClassifier()
rfc.fit(X_train_c_v, y_train)
y_pred = rfc.predict(X_test_c_v)
print(accuracy_score(y_test, y_pred))

0.5420560747663551


In [213]:
rfc = RandomForestClassifier()
rfc.fit(X_train.drop(columns=['Id', 'Hotel_name', 'Review_Title', 'Review_Text', 'Review_Title_f', 'Review_Text_f', 'Review_Title_fl', 'Review_Text_fl', 'Text_pos', 'Text_neg', 'Review_Title_Text', 'Review_Title_Text_pos', 'Review_Title_Text_neg']), y_train)
y_pred = rfc.predict(X_test.drop(columns=['Id', 'Hotel_name', 'Review_Title', 'Review_Text', 'Review_Title_f', 'Review_Text_f', 'Review_Title_fl', 'Review_Text_fl', 'Text_pos', 'Text_neg', 'Review_Title_Text', 'Review_Title_Text_pos', 'Review_Title_Text_neg']))
print(accuracy_score(y_test, y_pred))

0.26401869158878505


**TfidfVectorizer**

In [219]:
tv = TfidfVectorizer(max_df=0.75)
X_train_tv= tv.fit_transform(X_train['Review_Title_Text'])
X_test_tv = tv.transform(X_test['Review_Title_Text'])

rfc = RandomForestClassifier()
rfc.fit(X_train_tv, y_train)
y_pred_tv = rfc.predict(X_test_tv)
print(accuracy_score(y_test, y_pred_tv))

0.530373831775701


**CountVectorizer лучше**

**Переберем параметры RandomForestClassifier**

In [220]:
c_v = CountVectorizer(max_df=0.75)
X_train_c_v = c_v.fit_transform(X_train['Review_Title_Text'])
X_test_c_v = c_v.transform(X_test['Review_Title_Text'])

In [221]:
param = {'max_depth':range(1, 16,2), 'min_samples_leaf':range(10, 51, 10), 'n_estimators':range(10, 101, 10)}

hgs_forest = HalvingGridSearchCV(RandomForestClassifier(), param, scoring = 'r2', cv = 5)
hgs_forest.fit(X_train_c_v, y_train)
print(hgs_forest.best_estimator_)
print(hgs_forest.best_score_)

RandomForestClassifier(max_depth=7, min_samples_leaf=50, n_estimators=70)
-0.40448616885001626


In [227]:
rfc = RandomForestClassifier(max_depth=7, min_samples_leaf=50, n_estimators=70)
rfc.fit(X_train_c_v, y_train)
y_pred = rfc.predict(X_test_c_v)
print(accuracy_score(y_test, y_pred))

0.24065420560747663


**Лучший вариант**

In [236]:
c_v = CountVectorizer(max_df=0.75)
X_train_c_v = c_v.fit_transform(X_train['Review_Title_Text'])
X_test_c_v = c_v.transform(X_test['Review_Title_Text'])

rfc = RandomForestClassifier()
rfc.fit(X_train_c_v, y_train)
y_pred = rfc.predict(X_test_c_v)
print(accuracy_score(y_test, y_pred))

0.5537383177570093


**Делаем на test**

In [247]:
test = test.dropna()

In [248]:
test['Review_Title_f'] = test['Review_Title'].apply(f)
test['Review_Text_f'] = test['Review_Text'].apply(f)

In [249]:
test['Review_Title_fl'] = test['Review_Title_f'].apply(my_lemmatizer)
test['Review_Text_fl'] = test['Review_Text_f'].apply(my_lemmatizer)

In [251]:
test['Review_Title_Text'] = test['Review_Title_fl'] + ' ' + test['Review_Text_fl']

In [254]:
c_v = CountVectorizer(max_df=0.75)
X_train_c_v = c_v.fit_transform(X_train['Review_Title_Text'])
X_test_c_v = c_v.transform(test['Review_Title_Text'])

rfc = RandomForestClassifier()
rfc.fit(X_train_c_v, y_train)
y_pred = rfc.predict(X_test_c_v)

In [261]:
s = pd.DataFrame.from_dict({'Id':test['Id'], 'Rating': y_pred})
s.head()

Unnamed: 0,Id,Rating
0,2351,100.0
2,2353,86.0
3,2354,40.0
4,2355,86.0
5,2356,100.0
