In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

# Read data from the .csv file
data = pd.read_csv('bbc-news-data.csv', sep=r'\t', on_bad_lines='skip')

# Split the data into features (X) and target variable (y)
X = data['content']
y = data['category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Tokenization of text before building the Word2Vec model
tokenized_X_train = [text.split() for text in X_train]

# Building the Word2Vec model
w2v_model = Word2Vec(X_train, vector_size=300, window=5, min_count=1, workers=4)

def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]
    return np.array(features)

# Creating vectors for training and test sets
X_train_word_average = document_vectorizer(X_train, w2v_model, num_features=300)
X_test_word_average = document_vectorizer(X_test, w2v_model, num_features=300)

# Building a Random Forest model
rfc = RandomForestClassifier()
rfc.fit(X_train_word_average, y_train)
rfc_predictions = rfc.predict(X_test_word_average)
rfc_accuracy = accuracy_score(y_test, rfc_predictions)
print("Random Forest Classifier Accuracy:", rfc_accuracy)

# Building a Support Vector Machine model
svm = SVC()
svm.fit(X_train_word_average, y_train)
svm_predictions = svm.predict(X_test_word_average)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("Support Vector Machine Accuracy:", svm_accuracy)

# Tuning models using GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'linear']}
grid_search_svm = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid_search_svm.fit(X_train_word_average, y_train)
best_svm = grid_search_svm.best_estimator_
best_svm_predictions = best_svm.predict(X_test_word_average)
best_svm_accuracy = accuracy_score(y_test, best_svm_predictions)
print("Best SVM Accuracy after GridSearchCV:", best_svm_accuracy)

# Comparing accuracy before and after tuning
print("SVM Accuracy Improvement:", best_svm_accuracy - svm_accuracy)

# Tuning RandomForest model using GridSearchCV
param_grid = {'n_estimators': [10, 50, 100], 'bootstrap': [True, False]}
grid_search_rfc = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, verbose=2)
grid_search_rfc.fit(X_train_word_average, y_train)
best_rfc = grid_search_rfc.best_estimator_
best_rfc_predictions = best_rfc.predict(X_test_word_average)
best_rfc_accuracy = accuracy_score(y_test, best_rfc_predictions)
print("Best Random Forest Classifier Accuracy after GridSearchCV:", best_rfc_accuracy)

# Comparing accuracy before and after tuning
print("Random Forest Classifier Accuracy Improvement:", best_rfc_accuracy - rfc_accuracy)


  data = pd.read_csv('bbc-news-data.csv', sep=r'\t', on_bad_lines='skip')


Random Forest Classifier Accuracy: 0.7410179640718563
Support Vector Machine Accuracy: 0.5778443113772455
Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.3s
[CV] END .............