In [16]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk import wordnet, pos_tag, WordNetLemmatizer, ngrams
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
df = pd.read_csv('nlp/train.csv', encoding='unicode_escape')
df

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0
...,...,...,...,...,...
2346,2346,Hyatt Regency Chennai,,Most impressive service by staff in all areas....,80.0
2347,2347,New Woodlands,Homely villa,New woodlands chennai which gave me a homely e...,71.0
2348,2348,Samudra Residency,Nice accommodation and facilities,Awesome I liked the neatness and maintenance. ...,100.0
2349,2349,The Residency Chennai,The Residency Good Centrally located Hotel,The overall experience was good. However the w...,80.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2351 non-null   int64  
 1   Hotel_name    2351 non-null   object 
 2   Review_Title  2136 non-null   object 
 3   Review_Text   2351 non-null   object 
 4   Rating        2351 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 92.0+ KB


In [19]:
df.isna().sum()

Id                0
Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

Удалим все нерелевантные символы и слова из stopwords, переведем текст в нижний регистр,

In [20]:
def change_text(text):
    text = str(text)
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text).lower()
    if text[-1] == ' ' and len(text) > 1:
        text = text[:len(text) - 1]
    sw_eng = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if not word in sw_eng])
    return text


df['changed_title'] = df['Review_Title'].apply(change_text)
df['changed_text'] = df['Review_Text'].apply(change_text)
df

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,changed_title,changed_text
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating shown websites pricing ok,worth rating shown service good room well main...
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,needs improvement,overall ambience hotel good room facilities ne...
...,...,...,...,...,...,...,...
2346,2346,Hyatt Regency Chennai,,Most impressive service by staff in all areas....,80.0,,impressive service staff areas good restaurant...
2347,2347,New Woodlands,Homely villa,New woodlands chennai which gave me a homely e...,71.0,homely villa,new woodlands chennai gave homely experience l...
2348,2348,Samudra Residency,Nice accommodation and facilities,Awesome I liked the neatness and maintenance. ...,100.0,nice accommodation facilities,awesome liked neatness maintenance facilities ...
2349,2349,The Residency Chennai,The Residency Good Centrally located Hotel,The overall experience was good. However the w...,80.0,residency good centrally located hotel,overall experience good however wi fi getting ...


Проведем лемматизацию текста

In [21]:
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

def my_lemmatizer(sent):
  if sent!='nan':
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])  
  else:
    return '' 
  



In [22]:
df['changed_text'] = df['changed_text'].apply(my_lemmatizer)
df['changed_title'] = df['changed_title'].apply(my_lemmatizer)
df

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,changed_title,changed_text
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating show website price ok,worth rating show service good room well maint...
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,need improvement,overall ambience hotel good room facility need...
...,...,...,...,...,...,...,...
2346,2346,Hyatt Regency Chennai,,Most impressive service by staff in all areas....,80.0,,impressive service staff area good restaurant ...
2347,2347,New Woodlands,Homely villa,New woodlands chennai which gave me a homely e...,71.0,homely villa,new woodland chennai give homely experience lu...
2348,2348,Samudra Residency,Nice accommodation and facilities,Awesome I liked the neatness and maintenance. ...,100.0,nice accommodation facility,awesome like neatness maintenance facility rea...
2349,2349,The Residency Chennai,The Residency Good Centrally located Hotel,The overall experience was good. However the w...,80.0,residency good centrally locate hotel,overall experience good however wi fi get disc...


Посмотрим кол-во позитивных и негативных слов в title и text

In [26]:
f = open('nlp/positive-words.txt','r')
positive_words = f.read().split('\n')
f.close()
f = open('nlp/negative-words.txt','r',encoding= 'unicode_escape')
negative_words = f.read().split('\n')
f.close()

def positive_count(text):
  return len([word for word in text.split() if word in positive_words])

def negative_count(text):
  return len([word for word in text.split() if word in negative_words])

In [27]:
df['positive word'] = df['changed_text'].apply(positive_count) + df['changed_title'].apply(positive_count)
df['negative word'] = df['changed_text'].apply(negative_count) + df['changed_title'].apply(negative_count)
df


Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating,changed_title,changed_text,positive word,negative word
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0,refuge chennai,excellent room exercise facility around atmosp...,4,0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0,hilton chennai,comfortable felt safe staff helpful respectful...,4,0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0,worth rating show website price ok,worth rating show service good room well maint...,9,1
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0,good stay,first nice courteous staff one con stay time c...,5,2
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0,need improvement,overall ambience hotel good room facility need...,3,0
...,...,...,...,...,...,...,...,...,...
2346,2346,Hyatt Regency Chennai,,Most impressive service by staff in all areas....,80.0,,impressive service staff area good restaurant ...,9,0
2347,2347,New Woodlands,Homely villa,New woodlands chennai which gave me a homely e...,71.0,homely villa,new woodland chennai give homely experience lu...,6,0
2348,2348,Samudra Residency,Nice accommodation and facilities,Awesome I liked the neatness and maintenance. ...,100.0,nice accommodation facility,awesome like neatness maintenance facility rea...,7,0
2349,2349,The Residency Chennai,The Residency Good Centrally located Hotel,The overall experience was good. However the w...,80.0,residency good centrally locate hotel,overall experience good however wi fi get disc...,2,1


посмотриим на ngrams

In [31]:
from collections import Counter
title = ' '.join(df['changed_title']).split()
ngrams_title = list(ngrams(title,n=2))
Counter(ngrams_title).most_common()[:20]


[(('good', 'hotel'), 140),
 (('stay', 'good'), 88),
 (('value', 'money'), 82),
 (('hotel', 'good'), 78),
 (('nice', 'hotel'), 77),
 (('good', 'stay'), 73),
 (('hotel', 'stay'), 70),
 (('place', 'stay'), 59),
 (('good', 'location'), 54),
 (('good', 'service'), 37),
 (('conversation', 'much'), 33),
 (('nice', 'stay'), 32),
 (('experience', 'good'), 31),
 (('money', 'good'), 30),
 (('stay', 'nice'), 30),
 (('service', 'good'), 29),
 (('good', 'experience'), 29),
 (('location', 'good'), 29),
 (('hotel', 'nice'), 28),
 (('great', 'hotel'), 26)]

In [32]:
title = ' '.join(df['changed_text']).split()
ngrams_text = list(ngrams(title,n=2))
Counter(ngrams_text).most_common()[:20]

[(('hotel', 'good'), 222),
 (('good', 'hotel'), 221),
 (('room', 'service'), 180),
 (('value', 'money'), 152),
 (('service', 'good'), 139),
 (('room', 'clean'), 126),
 (('stay', 'hotel'), 114),
 (('hotel', 'staff'), 110),
 (('good', 'location'), 105),
 (('food', 'good'), 100),
 (('good', 'room'), 99),
 (('room', 'good'), 92),
 (('hotel', 'room'), 92),
 (('good', 'service'), 92),
 (('location', 'good'), 91),
 (('good', 'food'), 89),
 (('also', 'good'), 83),
 (('staff', 'good'), 82),
 (('location', 'hotel'), 75),
 (('clean', 'room'), 71)]

разобьем на выборки

In [45]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings

In [107]:
df['text title'] = df['changed_text'] + df['changed_title']

In [108]:
X = df.drop(columns=['Review_Title', 'Review_Text','Id','Rating','Hotel_name'])
Y = df['Rating']

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [110]:
# посмотрим метрики на changed_title
count_vect_title = CountVectorizer()
X_train_title = count_vect_title.fit_transform(X_train['changed_title'])
X_test_title = count_vect_title.transform(X_test['changed_title'])

svm_title = SGDClassifier()
svm_title.fit(X_train_title, y_train)
predicted = svm_title.predict(X_test_title)
print('metrics on changed_title')
print(metrics.classification_report(y_test, predicted))



metrics on changed_title
              precision    recall  f1-score   support

        20.0       0.25      0.24      0.24        17
        29.0       0.18      0.17      0.17        12
        40.0       0.67      0.17      0.27        24
        43.0       0.67      0.10      0.17        21
        57.0       0.36      0.18      0.24        28
        60.0       0.15      0.45      0.22        31
        65.0       0.29      0.21      0.24        29
        71.0       0.33      0.51      0.40        63
        80.0       0.49      0.30      0.37        61
        86.0       0.52      0.60      0.55       102
       100.0       0.58      0.37      0.46        83

    accuracy                           0.38       471
   macro avg       0.41      0.30      0.30       471
weighted avg       0.45      0.38      0.38       471



In [111]:
# посмотрим метрики на changed_text
count_vect_text = CountVectorizer()
X_train_text = count_vect_text.fit_transform(X_train['changed_text'])
X_test_text = count_vect_text.transform(X_test['changed_text'])

svm_text = SGDClassifier()
svm_text.fit(X_train_text, y_train)
predicted = svm_text.predict(X_test_text)
print('metrics on changed_text')
print(metrics.classification_report(y_test, predicted))

metrics on changed_text
              precision    recall  f1-score   support

        20.0       0.43      0.35      0.39        17
        29.0       0.28      0.42      0.33        12
        40.0       0.56      0.42      0.48        24
        43.0       0.43      0.29      0.34        21
        57.0       0.42      0.29      0.34        28
        60.0       0.35      0.39      0.37        31
        65.0       0.37      0.38      0.37        29
        71.0       0.61      0.43      0.50        63
        80.0       0.55      0.67      0.61        61
        86.0       0.61      0.65      0.63       102
       100.0       0.63      0.73      0.68        83

    accuracy                           0.54       471
   macro avg       0.48      0.46      0.46       471
weighted avg       0.54      0.54      0.53       471



In [115]:
# посмотрим метрики на text + title
count_vect_title_text = CountVectorizer(max_df=0.7)
X_train_text_title = count_vect_title_text.fit_transform(X_train['text title'])
X_test_text_title = count_vect_title_text.transform(X_test['text title'])

svm_text_title = SGDClassifier()
svm_text_title.fit(X_train_text_title, y_train)
predicted = svm_text_title.predict(X_test_text_title)
print('Title и Text')
print(metrics.classification_report(y_test, predicted))

Title и Text
              precision    recall  f1-score   support

        20.0       0.33      0.35      0.34        17
        29.0       0.57      0.33      0.42        12
        40.0       0.50      0.46      0.48        24
        43.0       0.86      0.29      0.43        21
        57.0       0.30      0.29      0.29        28
        60.0       0.48      0.45      0.47        31
        65.0       0.34      0.34      0.34        29
        71.0       0.49      0.63      0.56        63
        80.0       0.60      0.69      0.64        61
        86.0       0.65      0.59      0.62       102
       100.0       0.70      0.75      0.73        83

    accuracy                           0.56       471
   macro avg       0.53      0.47      0.48       471
weighted avg       0.57      0.56      0.55       471



метрики улучшились (positive,negative, different не успел обработать)

In [116]:
idf_vectorizer = TfidfVectorizer()
X_train_idf_text_title = idf_vectorizer.fit_transform(X_train['text title'])
X_test_idf_text_title = idf_vectorizer.transform(X_test['text title'])

svm_title_text_title = SGDClassifier()
svm_title_text_title.fit(X_train_idf_text_title, y_train)
predicted = svm_title_text_title.predict(X_test_idf_text_title)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

        20.0       0.44      0.41      0.42        17
        29.0       0.36      0.33      0.35        12
        40.0       0.64      0.29      0.40        24
        43.0       0.75      0.43      0.55        21
        57.0       0.30      0.25      0.27        28
        60.0       0.57      0.39      0.46        31
        65.0       0.37      0.34      0.36        29
        71.0       0.58      0.63      0.61        63
        80.0       0.55      0.59      0.57        61
        86.0       0.56      0.68      0.61       102
       100.0       0.65      0.72      0.68        83

    accuracy                           0.55       471
   macro avg       0.52      0.46      0.48       471
weighted avg       0.55      0.55      0.55       471



Лучшие показатели получили при помощи веторизации данных