In [12]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [13]:
data = pd.read_csv('news.csv')
data

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0
...,...,...
16985,KfW credit line for Uniper could be raised to ...,3
16986,KfW credit line for Uniper could be raised to ...,3
16987,Russian https://t.co/R0iPhyo5p7 sells 1 bln r...,3
16988,Global ESG bond issuance posts H1 dip as supra...,3


In [14]:
class_counts = data['label'].value_counts()
min_samples = min(class_counts)
data_subset = data.groupby('label').apply(lambda x: x.sample(min_samples)).reset_index(drop=True)

In [15]:
def preprocess_text(text):
    text = text.lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [16]:
preprocessed_text = [preprocess_text(document) for document in data_subset['text']]

In [17]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_text)
y = data_subset['label']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
nb_classifier = MultinomialNB(alpha=1)
nb_cv_scores = cross_val_score(nb_classifier, X_train, y_train, cv=5)
nb_cv_mean_score = np.mean(nb_cv_scores)
nb_classifier.fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)

In [20]:
nb_results = pd.DataFrame({'Actual': y_test, 'Predicted': nb_predictions})
print("Naive Bayes Results: ")
print(nb_results)

Naive Bayes Results: 
     Actual  Predicted
150       3          3
406       9          9
513      11         11
101       2         17
535      12         12
..      ...        ...
319       7          7
362       8          8
367       8          8
264       6          6
320       7          7

[176 rows x 2 columns]


In [21]:
print("Naive Bayes Accuracy: ", nb_accuracy)

Naive Bayes Accuracy:  0.7215909090909091


In [23]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

In [24]:
print("Random Forest Accuracy (without GridSearchCV): ", rf_accuracy)

Random Forest Accuracy (without GridSearchCV):  0.7045454545454546


In [25]:
param_grid = {
    'n_estimators': [100,200,300,400],
    'max_depth': [None, 10, 20, 30],
    
}

NameError: name 'best_params' is not defined

In [None]:
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
grid_rf_predictions = grid_search.predict(X_test)
grid_rf_accuracy = accuracy_score(y_test, grid_rf_predictions)

In [None]:
print("Best Parameters: ", best_params)

In [None]:
grid_rf_results = pd.DataFrame({'Actual': y_test, 'Predicted': grid_rf_predictions})
print("Random Forest Results (with GridSearchCV): ")
print(grid_rf_results)

In [None]:
best_score = grid_search.best_score_
print("Best Score: ", best_score)