### Импорты

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from catboost import CatBoostClassifier

### Загрузка данных

In [2]:
data = pd.read_csv("../data/singapore_airlines_reviews.csv")
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   published_date      10000 non-null  object
 1   published_platform  10000 non-null  object
 2   rating              10000 non-null  int64 
 3   type                10000 non-null  object
 4   text                10000 non-null  object
 5   title               9999 non-null   object
 6   helpful_votes       10000 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 547.0+ KB


In [4]:
data.rating.value_counts()

rating
5    5424
4    1967
1    1057
3    1009
2     543
Name: count, dtype: int64

Заметим, что у нас присутсвует дисбаланс классов. В качестве метрики выберем balanced_accuracy, так как нас не очень сильно интересует обязательно классифицировать именно плохие отзывы или вроде того. Мы просто хотим чтобы в среднем мы отвечали довольно неплохо.

## Task 1 (easy)

In [5]:
X_train_text, X_test_text, y_train, y_test = train_test_split(data.text, data.rating, random_state=42)

In [6]:
bow = CountVectorizer()

X_train = bow.fit_transform(X_train_text)
X_test = bow.transform(X_test_text)

X_train.shape

(7500, 17128)

In [7]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
None

In [8]:
y_pred_train = model.predict(X_train)
score_train = balanced_accuracy_score(y_train, y_pred_train) 
y_pred_test = model.predict(X_test)
score_test = balanced_accuracy_score(y_test, y_pred_test)
print(f"Train score: {score_train}\nTest score: {score_test}")

Train score: 0.976958579361305
Test score: 0.47252291989862644


Получили довольно большой скор на трейне и низкий на тесте... Модель переобучилась и плохо научилась находить закономерности

## Task 2 (medium)

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [10]:
# Удаляем стоп-слова и применяем стемминг

def preprocess_sentence_eng(text):
    stemmer = PorterStemmer()
    out = ' '.join(map(stemmer.stem, re.sub(r"[^\w\s]+", '', text).lower().split()))
    
    try:
        stop_words = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stop_words = stopwords.words('english')
    
    for word in stop_words:
        out = out.replace(" " + word + " ", " ")

    return out

In [11]:
data['text_stemmed'] = data['text'].apply(preprocess_sentence_eng)

In [12]:
X_train_text, X_test_text, y_train, y_test = train_test_split(data.text_stemmed, data.rating, random_state=42)

In [13]:
bow = CountVectorizer()

X_train = bow.fit_transform(X_train_text)
X_test = bow.transform(X_test_text)

X_train.shape

(7500, 14919)

In [14]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000), 
    "RandomForestClassifier": RandomForestClassifier(),
    "SVC": SVC(kernel="linear"),
    "CatBoostClassifier": CatBoostClassifier()
    }

out = []

for name, model in models.items():

    model.fit(X_train, y_train)
    
    y_pred_train = model.predict(X_train)
    score_train = balanced_accuracy_score(y_train, y_pred_train) 
    y_pred_test = model.predict(X_test)
    score_test = balanced_accuracy_score(y_test, y_pred_test)
    

    out.append(f"Model: {name}\nTrain score: {score_train}\nTest score: {score_test}\n")

Learning rate set to 0.08769
0:	learn: 1.5354449	total: 108ms	remaining: 1m 47s
1:	learn: 1.4776074	total: 159ms	remaining: 1m 19s
2:	learn: 1.4256476	total: 211ms	remaining: 1m 10s
3:	learn: 1.3877982	total: 261ms	remaining: 1m 4s
4:	learn: 1.3516078	total: 310ms	remaining: 1m 1s
5:	learn: 1.3227072	total: 361ms	remaining: 59.7s
6:	learn: 1.2967126	total: 414ms	remaining: 58.7s
7:	learn: 1.2750932	total: 465ms	remaining: 57.7s
8:	learn: 1.2555081	total: 516ms	remaining: 56.9s
9:	learn: 1.2372766	total: 568ms	remaining: 56.3s
10:	learn: 1.2218017	total: 621ms	remaining: 55.8s
11:	learn: 1.2066576	total: 672ms	remaining: 55.3s
12:	learn: 1.1927228	total: 727ms	remaining: 55.2s
13:	learn: 1.1788831	total: 777ms	remaining: 54.7s
14:	learn: 1.1684052	total: 829ms	remaining: 54.5s
15:	learn: 1.1578505	total: 884ms	remaining: 54.3s
16:	learn: 1.1471807	total: 936ms	remaining: 54.1s
17:	learn: 1.1371706	total: 989ms	remaining: 54s
18:	learn: 1.1276860	total: 1.04s	remaining: 53.9s
19:	learn: 

In [15]:
for result in out:
    print(result)

Model: LogisticRegression
Train score: 0.963515478558483
Test score: 0.4707866584987199

Model: RandomForestClassifier
Train score: 1.0
Test score: 0.3216668664487396

Model: SVC
Train score: 0.9808286445400138
Test score: 0.4650777492110711

Model: CatBoostClassifier
Train score: 0.6257175907188113
Test score: 0.4466263188098831



Взяли четыре различные модели, лучший скор у логистической регрессии. Catboost и SVC дали примерно такие же результаты. Случайный лес с дефолтными параметрами просто переобучился

Так как логистическая регрессия показала лучший скор, подберем ей параметры с помощью кросс-валидации

In [16]:
grid = {
    "solver": ["lbfgs", "liblinear", "saga"],
    'penalty': ["l2", "elasticnet"],
}

In [17]:
model = GridSearchCV(LogisticRegression(max_iter=300), grid)
model.fit(X_train, y_train)
None

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mitchell/different/hse-ml-course/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mitchell/different/hse-ml-course/venv/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/mitchell/different/hse-ml-course/venv/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/mi

In [18]:
y_pred_train = model.predict(X_train)
score_train = balanced_accuracy_score(y_train, y_pred_train) 
y_pred_test = model.predict(X_test)
score_test = balanced_accuracy_score(y_test, y_pred_test)

print(f"Model: LogisticRegression GridSearchCV\nTrain score: {score_train}\nTest score: {score_test}\n")

Model: LogisticRegression GridSearchCV
Train score: 0.7074320171765425
Test score: 0.48168552326452796



Получилось еще улучшить скор

In [19]:
good_text = "One of the best airline! Fantastic and great!"
bad_text = "One of the worst airline! I am disappointed!"

series = pd.Series([preprocess_sentence_eng(good_text), preprocess_sentence_eng(bad_text)])

model.predict(bow.transform(series))

array([5, 1])

Ура, наша лучшая модель работает на простых отзывах!