# Варіант 11. Файл news.csv. В якості текстової моделі використати модель «Сумка слів». Виконати класифікацію за допомогою алгоритмів наївний байєсів класифікатор та випадкові ліси, порівняти їх точність. Спробувати покращити модель випадкові ліси за допомогою GridSearchCV.

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_csv('news.csv')
data

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0
...,...,...
16985,KfW credit line for Uniper could be raised to ...,3
16986,KfW credit line for Uniper could be raised to ...,3
16987,Russian https://t.co/R0iPhyo5p7 sells 1 bln r...,3
16988,Global ESG bond issuance posts H1 dip as supra...,3


In [4]:
class_counts = data['label'].value_counts()
min_samples = min(class_counts)
data_subset = data.groupby('label').apply(lambda x: x.sample(min_samples)).reset_index(drop=True)

In [5]:
def preprocess_text(text):
    text = text.lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [6]:
preprocessed_text = [preprocess_text(document) for document in data_subset['text']]

In [7]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_text)
y = data_subset['label']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
nb_classifier = MultinomialNB(alpha=1)
nb_cv_scores = cross_val_score(nb_classifier, X_train, y_train, cv=5)
nb_cv_mean_score = np.mean(nb_cv_scores)
nb_classifier.fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)

In [10]:
nb_results = pd.DataFrame({'Actual': y_test, 'Predicted': nb_predictions})
print("Naive Bayes Results: ")
print(nb_results)

Naive Bayes Results: 
     Actual  Predicted
150       3          3
406       9          8
513      11         11
101       2          2
535      12         12
..      ...        ...
319       7          7
362       8          8
367       8          8
264       6         14
320       7          7

[176 rows x 2 columns]


In [11]:
print("Naive Bayes Accuracy: ", nb_accuracy)

Naive Bayes Accuracy:  0.6647727272727273


In [12]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

In [13]:
print("Random Forest Accuracy (without GridSearchCV): ", rf_accuracy)

Random Forest Accuracy (without GridSearchCV):  0.5795454545454546


In [14]:
param_grid = {
    'n_estimators': [100,200,300,400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1, 2, 4],
}

In [15]:
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
grid_rf_predictions = grid_search.predict(X_test)
grid_rf_accuracy = accuracy_score(y_test, grid_rf_predictions)

In [16]:
print("Random Forest Accuracy (with GridSearchCV): ", grid_rf_accuracy)

Random Forest Accuracy (with GridSearchCV):  0.6193181818181818


In [17]:
best_params = grid_search.best_params_
print("Best Parameters: ", best_params)

Best Parameters:  {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


In [18]:
grid_rf_results = pd.DataFrame({'Actual': y_test, 'Predicted': grid_rf_predictions})
print("Random Forest Results (with GridSearchCV): ")
print(grid_rf_results)

Random Forest Results (with GridSearchCV): 
     Actual  Predicted
150       3          3
406       9         18
513      11         11
101       2         12
535      12         18
..      ...        ...
319       7          7
362       8          8
367       8          8
264       6          6
320       7          7

[176 rows x 2 columns]


In [19]:
best_score = grid_search.best_score_
print("Best Score: ", best_score)

Best Score:  0.6435258358662613
