In [19]:
import time
import csv
import re
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

class SoftmaxRegression():
    def __init__(self, class_list):
        self.weight = np.array([], dtype=float) # k * n + 1 
        self.k = len(class_list)
        self.index_to_label = {}
        self.label_to_index = {}
        for i, label in enumerate(class_list):
            self.index_to_label[i] = label
            self.label_to_index[label] = i

    def softmax(self, X):
        matrix = np.exp(np.dot(self.weight, X.T)) #softmax activation function for the input data matrix X
        return matrix / np.sum(matrix, axis=0)

    #make predictions using the trained softmax regression model.
    def predict(self, X):
        X = np.array(X) #converts the input data X into an array which is the desired format for further processing
        sample_nums = X.shape[0]
        X = np.column_stack((np.ones(sample_nums), X)) # to incorporate the bias term into the input data
        index_result = np.argmax(self.softmax(X), axis=0) #determines the index of the class with the highest probability for each sample 
        return [self.index_to_label[index] for index in index_result]#maps these indices back to the corresponding class labels


    # trains the softmax regression model by iteratively updating the weights using batch gradient descent, to minimize the loss function and improve prediction.
    def fit_BGD(self, X, y, alpha=0.01, reg=0.1, max_iter=1000, epsilon=1e-10):#gradient descent
        X = np.array(X)
        sample_nums, feature_nums = X.shape[0], X.shape[1] + 1#bias
        Y = np.zeros((self.k, sample_nums))
        for i, label in enumerate(y):
            Y[self.label_to_index[label], i] = 1
        X = np.column_stack((np.ones(sample_nums), X))
        self.weight = np.zeros((self.k, feature_nums), dtype=float)
        for i in range(max_iter):
            batch_gradient = np.dot((Y - self.softmax(X)), X)  / sample_nums
            self.weight += (alpha * batch_gradient - reg * self.weight)# Updates the weight matrix using the computed gradient. The weights are adjusted in the direction that minimizes the loss function, scaled by the learning rate (alpha) and the regularization term (reg). This step is the heart of the Batch Gradient Descent algorithm.
        return self

    def score(self, X, y_true):
        return accuracy_score(y_true, self.predict(X))


TRAINING_NEWS_FILE = 'bbc_news_train.csv'
TEST_FILE = 'bbc_news_test.csv'
NON_WORD_CHAR = re.compile(r'\W+')

def get_csv_data(path):
    data_list = []
    with open(path) as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            data_list.append((tokenize(row[0]), row[1]))
    return data_list

def tokenize(string): 
    true_tokens = []
    stop_words = set(stopwords.words('english'))
    possible_tokens = [NON_WORD_CHAR.sub('', word) for word in string.lower().strip().split() if word not in stop_words]
    for possible in possible_tokens:
        if len(possible) > 0:
            true_tokens.append(possible)
    return true_tokens

def get_feature_words(training_set, size = 1000):
    feature_words_dict = {}
    for element in training_set:
        for word in element[0]:
            if word in feature_words_dict:
                feature_words_dict[word] += 1
            else:
                feature_words_dict[word] = 1
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return feature_words[:size] if len(feature_words) > size else feature_words

def train_test_extract(training_set, test_data, feature_words):
    X_train = [[1 if word in element[0] else 0 for word in feature_words] for element in training_set]
    y_train = [element[1] for element in training_set]
    X_test = [[1 if word in element[0] else 0 for word in feature_words] for element in test_data]
    y_test = [element[1] for element in test_data]
    return X_train, y_train, X_test, y_test

def get_category_list(training_set):
    local_category_list = set()
    for row in training_set:
        local_category_list.add(row[1])
    category_list = list(local_category_list)
    category_list.sort()
    return category_list

if __name__ == '__main__':
    
    print("Softmax Regression Starts!")
    
    start_time = time.time()
    train_data = get_csv_data(TRAINING_NEWS_FILE)
    test_data = get_csv_data(TEST_FILE)
    feature_words = get_feature_words(train_data, size=1000)
    X_train, y_train, X_test, y_test = train_test_extract(train_data, test_data, feature_words)
    category_list = get_category_list(train_data)
    print("Loading Time: %ss." % str(time.time()-start_time))

    start_time = time.time()
    clf_BGD = SoftmaxRegression(category_list).fit_BGD(X_train, y_train, alpha=0.1, reg=0.01, max_iter=2000, epsilon=0.0)
    print("Training Time:  %ss" % (str(time.time()-start_time)))

    start_time = time.time()
    test_accuracy = clf_BGD.score(X_test, y_test)
    print("Testing Time: %ss" % (str(time.time()-start_time)))

    print("Accuraccy: %s" % str(test_accuracy))
    
    print("Softmax Regression Ends!")

Softmax Regression Starts!
Loading Time: 4.706471920013428s.
Training Time:  37.34835600852966s
Testing Time: 0.11655998229980469s
Accuraccy: 0.9730337078651685
Softmax Regression Ends!
