# Рубежный контроль №2

### Тема: Методы обработки текстов.
### Решение задачи классификации текстов.

Необходимо решить задачу классификации текстов на основе любого выбранного Вами датасета (кроме примера, который рассматривался в лекции). Классификация может быть бинарной или многоклассовой. Целевой признак из выбранного Вами датасета может иметь любой физический смысл, примером является задача анализа тональности текста.

Необходимо сформировать два варианта векторизации признаков - на основе CountVectorizer и на основе TfidfVectorizer.

ИУ5-23М: LinearSVC, LogisticRegression

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import time

In [32]:
# Загрузка данных
df = pd.read_csv('cherry_blossom_forecasts.csv')

In [33]:
df.head(10)

Unnamed: 0,place_code,date,mankai_date,kaika_date,meter,tavg,tmin,tmax,prcp
0,1370053,2024-02-01,2024-05-12,2024-05-07,12,-7.8,-11.4,-5.3,4.1
1,1370022,2024-02-01,2024-05-03,2024-04-29,12,-4.2,-7.6,0.5,0.3
2,1370024,2024-02-01,2024-05-09,2024-05-04,10,-4.2,-7.6,0.5,0.3
3,1370028,2024-02-01,2024-05-10,2024-05-04,13,-5.6,-6.8,-4.5,0.0
4,1370029,2024-02-01,2024-05-09,2024-05-06,13,,,,
5,1370030,2024-02-01,2024-05-15,2024-05-09,12,-7.1,-7.8,-1.1,3.0
6,1370032,2024-02-01,2024-05-10,2024-05-05,11,-5.2,-8.0,0.0,2.6
7,1370041,2024-02-01,2024-05-13,2024-05-05,12,-2.6,-4.9,4.0,5.0
8,1370046,2024-02-01,2024-05-12,2024-05-07,11,,,,
9,1370055,2024-02-01,2024-05-09,2024-05-03,11,,,,


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85171 entries, 0 to 85170
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   place_code   85171 non-null  int64  
 1   date         85171 non-null  object 
 2   mankai_date  85171 non-null  object 
 3   kaika_date   85171 non-null  object 
 4   meter        85171 non-null  int64  
 5   tavg         75964 non-null  float64
 6   tmin         75964 non-null  float64
 7   tmax         75964 non-null  float64
 8   prcp         73766 non-null  float64
dtypes: float64(4), int64(2), object(3)
memory usage: 5.8+ MB


In [35]:
# проверим пропуски в данных и устраним их
na_mask = df.isna()
na_counts = na_mask.sum()
na_counts

place_code         0
date               0
mankai_date        0
kaika_date         0
meter              0
tavg            9207
tmin            9207
tmax            9207
prcp           11405
dtype: int64

In [36]:
df.dropna(inplace=True)
na_mask = df.isna()
na_counts = na_mask.sum()
na_counts

place_code     0
date           0
mankai_date    0
kaika_date     0
meter          0
tavg           0
tmin           0
tmax           0
prcp           0
dtype: int64

In [38]:
# Разделим набор данных на обучающую и тестувую выборки
X, Y = df['mankai_date'], df['kaika_date']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

time_arr = []

In [39]:
# векторизация признаков с помощью CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

In [40]:
# векторизация признаков с помощью TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [41]:
# Произведем обучения вдух классификаторов (по варианту) для CountVectorizer

# LinearSVC
gbc = LinearSVC()
start_time = time.time()
gbc.fit(X_train_counts, y_train)
train_time = time.time() - start_time
time_arr.append(train_time)
pred_gbc_counts = gbc.predict(X_test_counts)
print("Точность (CountVectorizer + LinearSVC):", accuracy_score(y_test, pred_gbc_counts))

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
start_time = time.time()
lr.fit(X_train_counts, y_train)
train_time = time.time() - start_time
time_arr.append(train_time)
pred_lr_counts = lr.predict(X_test_counts)
print("Точность (CountVectorizer + LogisticRegression):", accuracy_score(y_test, pred_lr_counts))

Точность (CountVectorizer + LinearSVC): 0.27104514030093535
Точность (CountVectorizer + LogisticRegression): 0.2721295919750576


In [42]:
# Произведем обучения вдух классификаторов (по варианту) для TfidfVectorizer

# LinearSVC
gbc = LinearSVC()
start_time = time.time()
gbc.fit(X_train_tfidf, y_train)
train_time = time.time() - start_time
time_arr.append(train_time)
pred_gbc_tfidf = gbc.predict(X_test_tfidf)
print("Точность (TfidfVectorizer + LinearSVC):", accuracy_score(y_test, pred_gbc_tfidf))

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
start_time = time.time()
lr.fit(X_train_tfidf, y_train)
train_time = time.time() - start_time
time_arr.append(train_time)
pred_lr_tfidf = lr.predict(X_test_tfidf)
print("Точность (TfidfVectorizer + LogisticRegression):", accuracy_score(y_test, pred_lr_tfidf))

Точность (TfidfVectorizer + LinearSVC): 0.27104514030093535
Точность (TfidfVectorizer + LogisticRegression): 0.27057069269350686


In [43]:
from tabulate import tabulate

data = [
    ["(CountVectorizer + LogisticRegression)", accuracy_score(y_test, pred_lr_counts), time_arr[0]],
    ["(CountVectorizer + LinearSVC)", accuracy_score(y_test, pred_gbc_counts), time_arr[1]],
    ["(TfidfVectorizer + LogisticRegression)", accuracy_score(y_test, pred_lr_tfidf), time_arr[2]],
    ["(TfidfVectorizer + LinearSVC)", accuracy_score(y_test, pred_gbc_tfidf), time_arr[3]]
]

sorted_data = sorted(data, key=lambda x: x[1], reverse=True)

# Вывод отсортированных данных в виде таблицы
print(tabulate(sorted_data, ['Связка','Точность валидации', 'Время обучения'], tablefmt="grid"))

+----------------------------------------+----------------------+------------------+
| Связка                                 |   Точность валидации |   Время обучения |
| (CountVectorizer + LogisticRegression) |             0.27213  |         15.4454  |
+----------------------------------------+----------------------+------------------+
| (CountVectorizer + LinearSVC)          |             0.271045 |         80.2609  |
+----------------------------------------+----------------------+------------------+
| (TfidfVectorizer + LinearSVC)          |             0.271045 |         44.3085  |
+----------------------------------------+----------------------+------------------+
| (TfidfVectorizer + LogisticRegression) |             0.270571 |          6.57759 |
+----------------------------------------+----------------------+------------------+
