# Naive Bayes for text classification in Python

https://www.annytab.com/naive-bayes-for-text-classification-in-python/

### Preprocess data

In [93]:
# Import Libraries
import re
import string
import nltk.stem

# Download WordNetLemmatizer
#nltk.download()


# Variables
QUOTES = re.compile(r'(writes in|writes:|wrote:|says:|said:|^In article|^Quoted from|^\||^>)')

# Preprocess data

def preprocess_data(data):
    """Clean a list of raw newsgroup posts in place and return it.

    Each document goes through the following steps, in order:

    1. Header removal: everything up to and including the first blank
       line is dropped. If the text contains no blank line at all, the
       whole text counts as header and the document becomes empty.
    2. Footer removal: scanning from the end, the text after the last
       line that is empty or consists only of dashes is dropped.
    3. Lines matching the module-level ``QUOTES`` pattern are removed.
    4. Punctuation and digits are removed.
    5. Every remaining whitespace-separated word is stemmed with the
       English Snowball stemmer and the words are re-joined with single
       spaces.

    Args:
        data: Mutable sequence of strings. Its elements are overwritten
            with the cleaned text.

    Returns:
        The same ``data`` object that was passed in.
    """

    # Create a stemmer (nltk.stem.WordNetLemmatizer is a possible alternative)
    stemmer = nltk.stem.SnowballStemmer('english')

    # The translation table is loop-invariant, so build it only once
    # (removes !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)
    punctuation_table = str.maketrans('', '', string.punctuation)

    for i, text in enumerate(data):

        # Remove header
        _, _, text = text.partition('\n\n')

        # Remove footer: walk backwards to the last blank/dashes-only line.
        # str.split always yields at least one element, so line_num is
        # always bound after the loop; it ends at 0 when nothing matched.
        lines = text.strip().split('\n')
        for line_num in range(len(lines) - 1, -1, -1):
            if lines[line_num].strip().strip('-') == '':
                break
        if line_num > 0:
            text = '\n'.join(lines[:line_num])

        # Remove quotes
        text = '\n'.join(line for line in text.split('\n') if not QUOTES.search(line))

        # Remove punctuation
        text = text.translate(punctuation_table)

        # Remove digits (raw string: '\d' is an invalid escape in a plain literal)
        text = re.sub(r'\d', '', text)

        # Stem words
        data[i] = ' '.join(stemmer.stem(word) for word in text.split())

    # Return data
    return data
        

### Training Module

In [135]:

# Import Libraries
import joblib
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.naive_bayes
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline

import common


# Visualize dataset

def visualize_dataset(ds):
    """Print summary statistics for a dataset and plot its class balance.

    Prints the number of articles and categories, then the number of
    articles per category, and finally draws a bar chart of the class
    distribution which is saved to disk and shown.

    Args:
        ds: Dataset object exposing ``data`` (the documents), ``target``
            (one non-negative integer label per document) and
            ``target_names`` (label index -> category name).
    """

    print('--- Information ---')
    print('Number of articles: ' + str(len(ds.data)))
    print('Number of categories: ' + str(len(ds.target_names)))

    # Count number of articles in each category. The number of categories
    # is taken from the dataset instead of being hard-coded.
    n_categories = len(ds.target_names)
    plot_X = np.arange(n_categories)
    plot_Y = np.bincount(ds.target, minlength=n_categories)

    print('\n--- Class Distribution ---')
    for name, count in zip(ds.target_names, plot_Y):
        print('{0}: {1}'.format(name, int(count)))

    # Plot the balance of the dataset
    figure = plt.figure(figsize=(16, 10))
    figure.suptitle('Balance of data set', fontsize=16)
    # An explicit list of colours: passing the string 'rgbkymc' as a colour
    # sequence is not accepted by current matplotlib releases.
    plt.bar(plot_X, plot_Y, align='center', color=list('rgbkymc'))
    plt.xticks(plot_X, ds.target_names, rotation=25, horizontalalignment='right')

    # Save before showing: once plt.show() returns the figure may already
    # have been released, in which case savefig would write an empty image.
    figure.savefig('C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\20NewsGroupsBalance.png')
    plt.show()



# Perform a grid search to find the best hyperparameters

def grid_search(train):
    """Run a grid search over the text-classification pipeline and print the best setup.

    The pipeline is CountVectorizer -> TfidfTransformer -> MultinomialNB.
    The search uses 5-fold cross-validation scored on accuracy and prints
    the best score together with the best value of every searched parameter.

    Args:
        train: Dataset object exposing ``data`` (the documents) and
            ``target`` (one label per document).
    """

    # Create a pipeline
    clf_pipeline = sklearn.pipeline.Pipeline([
        ('v', sklearn.feature_extraction.text.CountVectorizer(strip_accents='ascii', stop_words='english')),
        ('t', sklearn.feature_extraction.text.TfidfTransformer()),
        ('c', sklearn.naive_bayes.MultinomialNB(fit_prior=True, class_prior=None))
    ])

    # Set parameters (name in pipeline + '__' + name of parameter)
    parameters = {
        'v__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
        'v__lowercase': (True, False),
        't__use_idf': (True, False),
        'c__alpha': (0.3, 0.6, 1.0)}

    # Create a grid search classifier.
    # The former ``iid=False`` argument is intentionally not passed: it was
    # removed from GridSearchCV in scikit-learn 0.24 and passing it raises a
    # TypeError there.
    gs_classifier = sklearn.model_selection.GridSearchCV(
        clf_pipeline, parameters, cv=5, n_jobs=2, scoring='accuracy', verbose=1)

    # Start a search (Warning: takes a long time if the whole dataset is used)
    # To try a subset, slice: (train.data[:4000], train.target[:4000])
    gs_classifier = gs_classifier.fit(train.data, train.target)

    # Print results
    print('--- Results---')
    print('Best Score: ' + str(gs_classifier.best_score_))
    for name in sorted(parameters):
        print('{0}: {1}'.format(name, gs_classifier.best_params_[name]))
    

    

# Train and evaluate a model

def train_and_evaluate(train):
    """Train the final Naive Bayes model, persist it and report its accuracy.

    The function fits a CountVectorizer, a TfidfTransformer and a
    MultinomialNB model on the full training set, dumps all three with
    joblib, and prints accuracy plus a classification report for (a) the
    training data itself and (b) 10-fold cross-validated predictions.

    Args:
        train: Dataset object exposing ``data`` (the documents), ``target``
            (one label per document) and ``target_names``.
    """

    # Shared settings so the saved model and the cross-validated pipeline
    # are guaranteed to use the same configuration.
    vectorizer_params = dict(strip_accents='ascii', stop_words='english', lowercase=True, ngram_range=(1, 1))
    model_params = dict(alpha=0.3, fit_prior=True, class_prior=None)

    # Convert to bag of words
    count_vect = sklearn.feature_extraction.text.CountVectorizer(**vectorizer_params)
    X = count_vect.fit_transform(train.data)

    # Convert from occurrences to frequencies.
    # Raw counts favour longer documents, so counts are re-weighted with
    # TfidfTransformer (default settings) before training.
    transformer = sklearn.feature_extraction.text.TfidfTransformer()
    X = transformer.fit_transform(X)

    # Create and train the model
    model = sklearn.naive_bayes.MultinomialNB(**model_params)
    model.fit(X, train.target)

    # Save models
    joblib.dump(count_vect, 'C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\vectorizer.jbl')
    joblib.dump(transformer, 'C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\transformer.jbl')
    joblib.dump(model, 'C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\model.jbl')

    # Evaluate on training data
    print('-- Training data --')
    predictions = model.predict(X)

    accuracy = sklearn.metrics.accuracy_score(train.target, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))

    print('Classification Report:')
    print(sklearn.metrics.classification_report(train.target, predictions, target_names=train.target_names))
    print('')

    # Evaluate with 10-fold CV.
    # The whole pipeline is cross-validated on the raw text so that the
    # vocabulary and IDF weights are re-fitted on each training fold only;
    # cross-validating on the pre-computed matrix X would let the held-out
    # fold influence feature extraction.
    print('-- 10-fold CV --')
    cv_pipeline = sklearn.pipeline.Pipeline([
        ('v', sklearn.feature_extraction.text.CountVectorizer(**vectorizer_params)),
        ('t', sklearn.feature_extraction.text.TfidfTransformer()),
        ('c', sklearn.naive_bayes.MultinomialNB(**model_params)),
    ])
    predictions = sklearn.model_selection.cross_val_predict(cv_pipeline, train.data, train.target, cv=10)

    accuracy = sklearn.metrics.accuracy_score(train.target, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(sklearn.metrics.classification_report(train.target, predictions, target_names=train.target_names))

    
    
# The main entry point for this module

def main():
    """Entry point of the training module.

    Loads the training split, cleans the documents, runs the
    hyperparameter grid search and finally trains and evaluates the model.
    """

    # Load train dataset.
    # load_files reads text files from a two-level folder structure: the
    # sub-folder names become the category labels, and the individual file
    # names are not important.
    train_dir = 'C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\20news-bydate\\20news-bydate-train'
    train = sklearn.datasets.load_files(
        train_dir,
        shuffle=False,
        load_content=True,
        encoding='latin1',
    )

    # Optional: call visualize_dataset(train) here to inspect the raw dataset

    # Preprocess data
    cleaned_documents = preprocess_data(train.data)
    train.data = cleaned_documents

    # Optional: print(train.data[0]) here to inspect a cleaned document

    # Grid search
    grid_search(train)

    # Train and evaluate
    train_and_evaluate(train)
    

# Tell python to run main method
if __name__ == "__main__": main()

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  2.1min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  9.7min
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed: 12.0min finished


--- Results---
Best Score: 0.7083264666317374
c__alpha: 0.3
t__use_idf: True
v__lowercase: True
v__ngram_range: (1, 1)
-- Training data --
Accuracy: 89.37
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.95      0.74      0.83       480
           comp.graphics       0.93      0.89      0.91       584
 comp.os.ms-windows.misc       0.92      0.87      0.89       591
comp.sys.ibm.pc.hardware       0.83      0.93      0.88       590
   comp.sys.mac.hardware       0.96      0.89      0.92       578
          comp.windows.x       0.94      0.96      0.95       593
            misc.forsale       0.96      0.88      0.91       585
               rec.autos       0.95      0.88      0.92       594
         rec.motorcycles       0.98      0.93      0.96       598
      rec.sport.baseball       0.99      0.93      0.96       597
        rec.sport.hockey       0.65      0.97      0.78       600
               sci.crypt     

### Evaluation Module

In [138]:
# Import libraries
import joblib
import numpy as np
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.naive_bayes
import sklearn.metrics

import common



# Test and evaluate a model

def test_and_evaluate(test):
    """Evaluate the persisted model on a dataset and print the results.

    Loads the vectorizer, the tf-idf transformer and the classifier that
    were dumped with joblib, applies them to ``test.data`` and prints the
    accuracy and a classification report.

    Args:
        test: Dataset object exposing ``data`` (the documents), ``target``
            (one label per document) and ``target_names``.
    """

    # Load the persisted models
    vectorizer = joblib.load('C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\vectorizer.jbl')
    transformer = joblib.load('C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\transformer.jbl')
    model = joblib.load('C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\model.jbl')

    # Bag of words first, then occurrences -> frequencies
    features = transformer.transform(vectorizer.transform(test.data))

    # Make predictions
    predicted = model.predict(features)

    # Print results
    print('-- Results --')
    print('Accuracy: {0:.2f}'.format(sklearn.metrics.accuracy_score(test.target, predicted) * 100.0))
    print('Classification Report:')
    report = sklearn.metrics.classification_report(test.target, predicted, target_names=test.target_names)
    print(report)



# The main entry point for this module
def main():
    """Entry point of the evaluation module.

    Loads the test split, cleans the documents with the same
    preprocessing used for training, and evaluates the persisted model.
    """

    # Load test dataset.
    # load_files reads text files from a two-level folder structure: the
    # sub-folder names become the category labels, and the individual file
    # names are not important.
    #
    # The path is written as a regular string with escaped backslashes. The
    # previous raw string combined the r-prefix with doubled backslashes, so
    # the resulting path contained literal double separators.
    test = sklearn.datasets.load_files('C:\\Users\\User\\Jupyter Notebooks\\NBNewsGroups\\20news-bydate\\20news-bydate-test', shuffle=False, load_content=True, encoding='latin1')

    # Preprocess data
    test.data = preprocess_data(test.data)

    # Optional: print(test.data[0]) here to inspect a cleaned document

    # Test and evaluate
    test_and_evaluate(test)
    
# Tell python to run main method
if __name__ == "__main__": main()
 

-- Results --
Accuracy: 67.83
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.75      0.24      0.36       319
           comp.graphics       0.66      0.66      0.66       389
 comp.os.ms-windows.misc       0.72      0.54      0.62       394
comp.sys.ibm.pc.hardware       0.59      0.72      0.65       392
   comp.sys.mac.hardware       0.75      0.68      0.71       385
          comp.windows.x       0.80      0.76      0.78       395
            misc.forsale       0.82      0.68      0.74       390
               rec.autos       0.83      0.74      0.78       396
         rec.motorcycles       0.83      0.73      0.78       398
      rec.sport.baseball       0.94      0.81      0.87       397
        rec.sport.hockey       0.59      0.94      0.72       399
               sci.crypt       0.60      0.80      0.69       396
         sci.electronics       0.69      0.55      0.61       393
                 sci.m