In [2]:
import collections
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import re
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import warnings

In [3]:
# Load the review dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Datafiniti_Hotel_Reviews.csv')

In [None]:
def clean_text(text):
    """Strip every character that is not an ASCII letter or whitespace.

    The input is coerced with ``str()``, so non-string values are accepted.
    Missing values (``None`` or a float NaN) yield an empty string rather than
    the literal words ``"None"`` / ``"nan"``, which would otherwise survive the
    cleaning step and show up as ordinary tokens later on.

    Args:
        text: Raw review text, or any value that ``str()`` can render.

    Returns:
        str: The text with digits, punctuation and other symbols removed.
        Whitespace is left untouched, so consecutive spaces may remain where
        symbols were deleted.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # The pattern has no leading space: a symbol must be dropped wherever it
    # occurs, not only when it directly follows a space.
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return cleaned_text

# Run clean_text over each raw review and store the result in a new column.
df['cleaned_text'] = df['reviews.text'].map(clean_text)

# Preview the first rows of the cleaned column, then of the raw column.
for column_name in ('cleaned_text', 'reviews.text'):
    print(df[column_name].head())

In [None]:
# Fetch the NLTK resources used by this notebook.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases; 'punkt' is still requested so older installations keep working.
# On an NLTK version that does not know 'punkt_tab', download() is expected to
# report the unknown id and return False without raising (TODO confirm on the
# oldest supported version).
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# English stop-word list from the NLTK corpus, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# Split every cleaned review into a list of word tokens.
df['tokenized_reviews'] = df['cleaned_text'].map(word_tokenize)

# Show the input text followed by the resulting token lists.
for column_name in ('cleaned_text', 'tokenized_reviews'):
    print(df[column_name].head())

In [None]:
nlp = spacy.load("en_core_web_sm")


def _lemmatize_tokens(tokens):
    """Join the tokens with spaces, parse the text with spaCy and return the lemmas as one string."""
    doc = nlp(' '.join(tokens))
    return ' '.join(token.lemma_ for token in doc)


# One lemmatized string per review, built from its token list.
df['lemmatized_text'] = df['tokenized_reviews'].map(_lemmatize_tokens)

# Show the token lists followed by their lemmatized form.
for column_name in ('tokenized_reviews', 'lemmatized_text'):
    print(df[column_name].head())

In [None]:
def _drop_stop_words(text):
    """Tokenize `text` with spaCy and keep the tokens whose lower-cased form is not in STOP_WORDS."""
    kept_tokens = []
    for token in nlp(text):
        if token.text.lower() in STOP_WORDS:
            continue
        kept_tokens.append(token.text)
    return ' '.join(kept_tokens)


# Remove stop words from every lemmatized review.
df['stopwords_removed'] = df['lemmatized_text'].map(_drop_stop_words)

# Show the lemmatized text followed by the version without stop words.
for column_name in ('lemmatized_text', 'stopwords_removed'):
    print(df[column_name].head())

In [None]:
# Download the NLTK opinion lexicon, which provides the positive and negative word lists.
nltk.download('opinion_lexicon')

# Sets give constant-time membership tests and drop duplicate entries.
positive_words = {*opinion_lexicon.positive()}
negative_words = {*opinion_lexicon.negative()}

# Replace any missing values in the preprocessed column with an empty string.
df['stopwords_removed'] = df['stopwords_removed'].fillna(value='')

def classify_sentiment(review):
    """Label a review by comparing how many positive and negative lexicon words it contains.

    The review is lower-cased and split into word tokens. A lexicon entry is
    counted (at most once) only when it occurs as a whole token. Testing
    ``word in review`` on the raw string would also fire on fragments of
    unrelated words (e.g. a short entry inside a longer word) and would miss
    capitalised words entirely.

    Relies on the module-level ``positive_words`` and ``negative_words``
    collections.

    Args:
        review: Review text; coerced with ``str()``.

    Returns:
        str: ``'positive'`` or ``'negative'`` for whichever count is larger,
        ``'neutral'`` when the counts are equal (including when nothing matches).
    """
    # A token is a run of letters/digits; '-', "'" and '*' are kept when they sit
    # between such runs so that compound lexicon entries can still match as a unit.
    tokens = set(re.findall(r"[^\W_]+(?:[-'*]+[^\W_]+)*", str(review).lower()))

    positive_count = sum(1 for token in tokens if token in positive_words)
    negative_count = sum(1 for token in tokens if token in negative_words)

    if positive_count > negative_count:
        return 'positive'
    elif positive_count < negative_count:
        return 'negative'
    else:
        return 'neutral'

# Label every preprocessed review with the lexicon-based classifier.
df['predicted_sentiment'] = df['stopwords_removed'].map(classify_sentiment)

# Show the classifier input followed by the assigned labels.
for column_name in ('stopwords_removed', 'predicted_sentiment'):
    print(df[column_name].head())

In [None]:
# Ignore FutureWarning messages from here on (the filter is process-wide).
warnings.filterwarnings("ignore", category=FutureWarning)

# Number of reviews per predicted label, most frequent first.
sentiment_counts = df['predicted_sentiment'].value_counts()
labels, counts = sentiment_counts.index, sentiment_counts.values

# One bar per sentiment label; bar height is the number of reviews with that label.
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=labels, y=counts, palette='viridis')
ax.set_title('Соотношение Нейтральных, Положительных и Отрицательных отзывов')
ax.set_xlabel('Тональность')
ax.set_ylabel('Количество отзывов')
plt.show()

In [None]:
# Number of reviews per predicted label, most frequent first.
sentiment_counts = df['predicted_sentiment'].value_counts()

labels = sentiment_counts.index
sizes = sentiment_counts.values

# value_counts() orders the labels by frequency, so a positional colour list
# would tie each colour to a frequency rank rather than to a sentiment.
# Looking the colour up per label keeps the wedges consistent across datasets.
# NOTE(review): green/blue/red are presumed to mean positive/neutral/negative,
# following the order of the original colour list - confirm.
sentiment_colors = {
    'positive': '#99ff99',
    'neutral': '#66b3ff',
    'negative': '#ff9999',
}
# Grey is the fallback for any label outside the three expected ones.
colors = [sentiment_colors.get(label, '#cccccc') for label in labels]

plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)

# A white disc drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Equal axis scaling keeps the chart circular.
plt.axis('equal')

plt.show()

In [None]:
reviews = df['stopwords_removed']

# Flatten all reviews into a single list of whitespace-separated words.
words = [word for review in reviews for word in str(review).split()]

# Tally how often each word occurs across the whole corpus.
word_counts = collections.Counter(words)

# The ten most frequent words together with their counts.
print(word_counts.most_common(10))

In [14]:
# Features are the preprocessed review strings; targets are the labels in 'predicted_sentiment'.
X, y = df['stopwords_removed'], df['predicted_sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Bag-of-words counts limited to 1000 features.
# The vocabulary is learned from the training split only and reused for the test split.
count_vectorizer = CountVectorizer(max_features=1000)
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)

# TF-IDF weights with the same 1000-feature cap, again fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Train logistic regression on the TF-IDF features (fit() returns the fitted estimator).
model_lr = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict on the held-out split and report the share of matching labels.
y_pred = model_lr.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Точность: {accuracy}')

In [None]:
# Train multinomial naive Bayes on the TF-IDF features (fit() returns the fitted estimator).
model_nb = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict on the held-out split and report the share of matching labels.
y_pred_nb = model_nb.predict(X_test_tfidf)
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print(f'Точность (Naive Bayes): {accuracy_nb}')

In [None]:
# Train a 100-tree random forest on the TF-IDF features; the seed makes the run repeatable.
model_rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_tfidf, y_train)

# Predict on the held-out split and report the share of matching labels.
y_pred_rf = model_rf.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f'Точность: {accuracy_rf}')

In [None]:
# Run the lexicon-based classifier on one hand-written review.
# The raw sentence is passed directly, without the preprocessing steps applied to the dataset.
review_to_classify = "Everything is good, except bathroom. Bathroom is bad. But other things are good."
predicted_sentiment = classify_sentiment(review_to_classify)

# Print the review text first, then the label assigned to it.
for output_line in (
    f"Текст отзыва:\n{review_to_classify}\n",
    f"Предсказанная тональность: {predicted_sentiment}",
):
    print(output_line)