**20 newsgroups' dataset**

In [None]:
from sklearn.datasets import fetch_20newsgroups
X_test, y_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), return_X_y=True)
X_train, y_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), return_X_y=True)

**Reuter's dataset**


*   Loading the data and parsing it
*   All the numbers/dates can be removed, since they are not important
*   The punctuation is also being filtered out by regex






In [2]:
!wget -N http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
!tar zxf reuters21578.tar.gz
!ls

--2021-05-04 20:07:24--  http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘reuters21578.tar.gz’ not modified on server. Omitting download.

all-exchanges-strings.lc.txt	    reut2-002.sgm  reut2-013.sgm
all-orgs-strings.lc.txt		    reut2-003.sgm  reut2-014.sgm
all-people-strings.lc.txt	    reut2-004.sgm  reut2-015.sgm
all-places-strings.lc.txt	    reut2-005.sgm  reut2-016.sgm
all-topics-strings.lc.txt	    reut2-006.sgm  reut2-017.sgm
cat-descriptions_120396.txt	    reut2-007.sgm  reut2-018.sgm
feldman-cia-worldfactbook-data.txt  reut2-008.sgm  reut2-019.sgm
lewis.dtd			    reut2-009.sgm  reut2-020.sgm
README.txt			    reut2-010.sgm  reut2-021.sgm
reut2-000.sgm			    reut2-011.sgm  reuters21578.tar.gz
reut2-001.sgm			    reut2-012.sgm  sample_data


In [3]:
#loading the data
import re
import os
from bs4 import BeautifulSoup

cwd = '/content/'
files = os.listdir(cwd)

raw_y = []
raw_X = []

for file in files:
  if file.startswith("reut2-") and file.endswith(".sgm"):

    with open(file, 'rb') as f:

      print('Reading file: %s' % file)

      soup = BeautifulSoup(f, "html.parser")  #"lxml"
      articles = soup.find_all('reuters')

      for article in articles:
        topics = article.findAll('topics')
        bodies = article.findAll('text')

        if len(topics[0]) == 0:
          topics_list = ''  #no topic
        else:
          topics_list = []
          for topic in topics[0]:  #getting rid of the tags 
            topics_list.append((topic.text).lower())
        raw_y.append(topics_list)
        
        if len(bodies) == 0:
          raw_X.append('')
        else:
          body = (bodies[0].text).lower()
          body = re.sub('[^A-Za-z0-9]+', ' ', body) #getting rid of special char
          body = re.sub(r'\d+','nn', body)  #getting rid of numbers
          raw_X.append(body)

Reading file: reut2-014.sgm
Reading file: reut2-011.sgm
Reading file: reut2-019.sgm
Reading file: reut2-020.sgm
Reading file: reut2-015.sgm
Reading file: reut2-006.sgm
Reading file: reut2-017.sgm
Reading file: reut2-009.sgm
Reading file: reut2-001.sgm
Reading file: reut2-021.sgm
Reading file: reut2-012.sgm
Reading file: reut2-008.sgm
Reading file: reut2-005.sgm
Reading file: reut2-004.sgm
Reading file: reut2-000.sgm
Reading file: reut2-007.sgm
Reading file: reut2-016.sgm
Reading file: reut2-010.sgm
Reading file: reut2-018.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-013.sgm


Vectorizer 


*   stop words are being removed by the vectorizer
*   3-gram tokens

We also need a Multi-Label attribute, since an article can have more than one topic. Therefore I'm taking multiLabelBinarizer for our topics.



In [4]:
import numpy as np
import time
import sklearn
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB

In [5]:
# Splitting the data X and y into training and testing sets with the testing size
X_train, X_test, y_train, y_test = train_test_split(raw_X, raw_y, test_size = 0.2)

In [6]:
#vectorizer = CountVectorizer(analyzer='word',stop_words="english", ngram_range=(1, 3))
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 3))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
#print(vectorizer.get_feature_names()[0:100])

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer  
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train) 
y_test = mlb.transform(y_test) 

In [8]:
assert(X_train.shape[0]==y_train.shape[0])    #to make sure dataset is loaded correctly
assert(X_test.shape[0]==y_test.shape[0])
print(f'Number of Training Samples: {y_train.shape[0]}')
print(f'Number of Test Samples: {y_test.shape[0]}')

Number of Training Samples: 17262
Number of Test Samples: 4316


# Non-probabilistic classifier



1.   Trying to find the best hyperparameters (before running the first kernel, take into consideration that the code will run for at least 40 minutes).
2.   Training and testing with the best hyperparameters (parameters are taken from the sklearn documentation).





In [9]:
import pandas as pd
import warnings

In [10]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier


#parameters
loss_parameters = ['hinge', 'modified_huber', 'squared_hinge', 'perceptron']
penalty_parameters = ['l2', 'l1', 'elasticnet']
alpha_parameters = [1e-5, 0.0001, 0.001, 0.01]

#dataframe
results = []

#testing the best parameters 
for l in loss_parameters:
  for p in penalty_parameters:
    for a in alpha_parameters: 
      with warnings.catch_warnings():
        warnings.simplefilter("ignore")  #ignoring UserWarning 

        # Classifier
        classifier = OneVsRestClassifier(SGDClassifier(alpha=a, penalty=p, loss=l)) 

        # train
        classifier.fit(X_train, y_train)

        print(f'Training with hyperparameters: alpha [{a}], loss [{l}], penalty [{p}]')

        #predict
        predictions = classifier.predict(X_test)

        #accuracy
        accuracy = accuracy_score(y_test, predictions)*100


        #dataframe
        results.append({
            'Alpha': a,
            'Penalty': p,
            'Loss': l,
            'Accuracy': round(accuracy, 3)
              })

Training with hyperparameters: alpha [1e-05], loss [hinge], penalty [l2]
Training with hyperparameters: alpha [0.0001], loss [hinge], penalty [l2]
Training with hyperparameters: alpha [0.001], loss [hinge], penalty [l2]
Training with hyperparameters: alpha [0.01], loss [hinge], penalty [l2]
Training with hyperparameters: alpha [1e-05], loss [hinge], penalty [l1]
Training with hyperparameters: alpha [0.0001], loss [hinge], penalty [l1]
Training with hyperparameters: alpha [0.001], loss [hinge], penalty [l1]
Training with hyperparameters: alpha [0.01], loss [hinge], penalty [l1]
Training with hyperparameters: alpha [1e-05], loss [hinge], penalty [elasticnet]
Training with hyperparameters: alpha [0.0001], loss [hinge], penalty [elasticnet]
Training with hyperparameters: alpha [0.001], loss [hinge], penalty [elasticnet]
Training with hyperparameters: alpha [0.01], loss [hinge], penalty [elasticnet]
Training with hyperparameters: alpha [1e-05], loss [modified_huber], penalty [l2]
Training w

In [12]:
#creating a pandas dataframe and sorting by accuracy      
results = pd.DataFrame(results)
results = results.sort_values(by='Accuracy', ascending=False)
print(results)

      Alpha     Penalty            Loss  Accuracy
8   0.00001  elasticnet           hinge    84.615
0   0.00001          l2           hinge    84.222
20  0.00001  elasticnet  modified_huber    83.851
4   0.00001          l1           hinge    83.318
12  0.00001          l2  modified_huber    83.040
13  0.00010          l2  modified_huber    82.901
21  0.00010  elasticnet  modified_huber    82.298
36  0.00001          l2      perceptron    81.974
17  0.00010          l1  modified_huber    81.163
1   0.00010          l2           hinge    80.561
37  0.00010          l2      perceptron    80.329
38  0.00100          l2      perceptron    79.819
16  0.00001          l1  modified_huber    79.518
25  0.00010          l2   squared_hinge    79.147
33  0.00010  elasticnet   squared_hinge    78.313
44  0.00001  elasticnet      perceptron    78.035
9   0.00010  elasticnet           hinge    77.386
5   0.00010          l1           hinge    77.317
29  0.00010          l1   squared_hinge    77.132


In [13]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
start_time = time.time()


# Classifier
classifier = OneVsRestClassifier(SGDClassifier(alpha=results.iloc[0]['Alpha'], penalty=results.iloc[0]['Penalty'], loss=results.iloc[0]['Loss'])) 

# train
classifier.fit(X_train, y_train)
#predict
predictions = classifier.predict(X_test)

#accuracy & time
print("Total runtime %.2f seconds." % (time.time() - start_time))
print(f"Test accuracy is {results.iloc[0]['Accuracy']}.")

Total runtime 75.26 seconds.
Test accuracy is 84.615.


**Evaluation**

In [14]:
#Evaluating on the trained data
predictions2 = classifier.predict(X_train)
y_pred = mlb.inverse_transform(predictions2)
print(round(accuracy_score(y_train,predictions2), 3)*100)

98.9


In [15]:
#Evaluating on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print(f"Test accuracy is {results.iloc[0]['Accuracy']}.")
print("==========")

precision_micro = precision_score(y_test, predictions,
                            average='micro', labels=np.unique(predictions)) 
recall_micro = recall_score(y_test, predictions,
                      average='micro', labels=np.unique(predictions))
f1_micro = f1_score(y_test, predictions, average='micro', labels=np.unique(predictions))
 
print("Micro-average:")
print(f'Precision: {round(precision_micro,3)}, Recall: {round(recall_micro,3)},\
 F1-measure: {round(f1_micro,3)}')
print("==========")
 
precision_macro = precision_score(y_test, predictions,
                             average='macro',labels=np.unique(predictions))
recall_macro = recall_score(y_test, predictions,
                      average='macro', labels=np.unique(predictions))
f1_macro = f1_score(y_test, predictions, average='macro', labels=np.unique(predictions))
 
print("Macro-average:")
print(f'Precision: {round(precision_macro,3)}, Recall: {round(recall_macro,3)},F1-measure: {round(f1_macro,3)}')



Test accuracy is 84.615.
Micro-average:
Precision: 0.865, Recall: 0.879, F1-measure: 0.871
Macro-average:
Precision: 0.87, Recall: 0.852,F1-measure: 0.86


# Probabilistic classifier

In [16]:
import pandas as pd
import warnings

In [17]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

#looking for the best hyperparameters
alpha_parameters = [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]

results = []

for alpha in alpha_parameters:
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    classifier = OneVsRestClassifier(MultinomialNB(alpha=alpha))

    # train
    classifier.fit(X_train, y_train)
    print(f'Training with hyperparameters: alpha [{alpha}]')
    
    #predict
    predictions = classifier.predict(X_test)

    #accuracy
    accuracy = accuracy_score(y_test, predictions)*100

    #dataframe
    results.append({
            'Alpha': alpha,
            'Accuracy': round(accuracy, 3)
              })


Training with hyperparameters: alpha [1e-05]
Training with hyperparameters: alpha [0.0001]
Training with hyperparameters: alpha [0.001]
Training with hyperparameters: alpha [0.01]
Training with hyperparameters: alpha [0.1]
Training with hyperparameters: alpha [1]


In [18]:
#creating a pandas dataframe and sorting by accuracy      
results = pd.DataFrame(results)
results = results.sort_values(by='Accuracy', ascending=False)
print(results)

     Alpha  Accuracy
2  0.00100    77.363
1  0.00010    77.294
0  0.00001    77.201
3  0.01000    76.622
4  0.10000    66.311
5  1.00000    61.098


In [20]:
#training and testing with the best alpha

classifier = OneVsRestClassifier(MultinomialNB(alpha=results.iloc[0]['Alpha']))

# train
classifier.fit(X_train, y_train)
#predict
predictions = classifier.predict(X_test)

# accuracy
print ("Total runtime %.2f seconds." % (time.time() - start_time))
print(f"Test accuracy is {results.iloc[0]['Accuracy']}")

Total runtime 434.72 seconds.
Test accuracy is 77.363


**Evaluation**

In [21]:
#Evaluating on the trained data
predictions2 = classifier.predict(X_train)
y_pred = mlb.inverse_transform(predictions2)
print(round(accuracy_score(y_train,predictions2), 3)*100)

94.8


In [23]:
#Evaluating on the test data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print(f"Test accuracy is {results.iloc[0]['Accuracy']}.")
print("==========")

precision_micro = precision_score(y_test, predictions,
                            average='micro', labels=np.unique(predictions)) 
recall_micro = recall_score(y_test, predictions,
                      average='micro', labels=np.unique(predictions))
f1_micro = f1_score(y_test, predictions, average='micro', labels=np.unique(predictions))
 
print("Micro-average:")
print(f'Precision: {round(precision_micro,3)}, Recall: {round(recall_micro,3)},\
 F1-measure: {round(f1_micro,3)}')
print("==========")
 
precision_macro = precision_score(y_test, predictions,
                             average='macro',labels=np.unique(predictions))
recall_macro = recall_score(y_test, predictions,
                      average='macro', labels=np.unique(predictions))
f1_macro = f1_score(y_test, predictions, average='macro', labels=np.unique(predictions))
 
print("Macro-average:")
print(f'Precision: {round(precision_macro,3)}, Recall: {round(recall_macro,3)},F1-measure: {round(f1_macro,3)}')

Test accuracy is 77.363.
Micro-average:
Precision: 0.795, Recall: 0.777, F1-measure: 0.786
Macro-average:
Precision: 0.854, Recall: 0.715,F1-measure: 0.773
