In [1]:
#Training and testing models on 20newsgroups dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups #Fetching 20newsgroups

X_train, y_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), return_X_y=True)
X_test, y_test  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), return_X_y=True)

In [3]:
#PROBABILISTIC MODEL

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

#Vectorizing for Multinominal model
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 3))

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

#training and predicting labels with probablistic model
classifier = OneVsRestClassifier(MultinomialNB(alpha=0.1))
print("Training the model...")
classifier.fit(X_train, y_train)
print("Predicting labels")
predictions = classifier.predict(X_test)

print('F1 micro:', f1_score(y_test, predictions, average='micro'))
print('Hamming loss:', hamming_loss(y_test, predictions))
print('Accuracy: %s ' %(accuracy_score(predictions, y_test)))

Training the model...
Predicting labels
F1 micro: 0.6771109930961232
Hamming loss: 0.3228890069038768
Accuracy: 0.6771109930961232 


In [4]:
from sklearn.datasets import fetch_20newsgroups #Fetching 20newsgroups

X_train, y_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), return_X_y=True)
X_test, y_test  = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), return_X_y=True)

In [5]:
# NON-PROBABILISTIC MODEL

from sklearn.feature_extraction.text import TfidfVectorizer  #vectorizing for SVC model
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 3))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

#training and predicting labels with non-probablistic model
classifier = OneVsRestClassifier(LinearSVC())

print("Training the model...")
classifier.fit(X_train, y_train)
print("Predicting labels")
predictions = classifier.predict(X_test)

print('F1 micro:', f1_score(y_test, predictions, average='micro', labels=np.unique(predictions)))
print('Hamming loss:', hamming_loss(y_test, predictions))
print('Accuracy: %s ' %(accuracy_score(predictions, y_test)))

Training the model...
Predicting labels
F1 micro: 0.7030005310674455
Hamming loss: 0.2969994689325544
Accuracy: 0.7030005310674455 


In [6]:
#Training and testing models on Reuters dataset

In [7]:
!wget -N https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz #Fetching reuters21578
dataset = !tar xvzf reuters21578.tar.gz

--2021-06-11 15:28:31--  https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘reuters21578.tar.gz’ not modified on server. Omitting download.



In [8]:
topicc = []   #Reading the files and creating two lists - with body/titles and topics
contt = []
import re
from bs4 import BeautifulSoup
for f in dataset:
  tags = []
  if f.endswith('sgm'):
    print(f'Processing: {f}')
    lines = []
    for line in open(f, 'rb').readlines():
      line = line.decode('utf-8','ignore')
      lines.append(line)
    data = ' '.join(lines)
    soup = BeautifulSoup(data, 'html.parser')
    contents = soup.findAll('text')
    labels = soup.findAll('topics')      

    cont = []  #cleaning the dataset
    for content in contents:
        content = re.sub(r'\d+',' ', content.text)
        content = re.sub(r'[^\w\s]', ' ', content)
        content = content.replace('Reuter',' ').replace('\x02',' ').replace('\n',' ')
        cont.append(content)  

    topics = []
    for topic in labels:
        topics.append(str(topic))

    top = [] #splitting topics 
    for topic in topics:
        topic = re.findall(r'<topics>(.*?)</topics>', str(topic).strip('\n'))
        if len(topic[0]) != 0:
          topic = re.findall(r'<d>(.*?)</d>', str(topic))
        top.append(topic)

    # conti = cont.copy() #removing articles with no topics  
    # topi = top.copy()
    # for i, topic in enumerate(topi):
    #     if len(topic[0]) == 0:
    #       top.remove(topic)
    #       cont.remove(conti[i])

    topicc += top
    contt += cont          


print(len(contt), len(topicc))

Processing: reut2-000.sgm
Processing: reut2-001.sgm
Processing: reut2-002.sgm
Processing: reut2-003.sgm
Processing: reut2-004.sgm
Processing: reut2-005.sgm
Processing: reut2-006.sgm
Processing: reut2-007.sgm
Processing: reut2-008.sgm
Processing: reut2-009.sgm
Processing: reut2-010.sgm
Processing: reut2-011.sgm
Processing: reut2-012.sgm
Processing: reut2-013.sgm
Processing: reut2-014.sgm
Processing: reut2-015.sgm
Processing: reut2-016.sgm
Processing: reut2-017.sgm
Processing: reut2-018.sgm
Processing: reut2-019.sgm
Processing: reut2-020.sgm
Processing: reut2-021.sgm
21578 21578


In [9]:
from collections import defaultdict #creating dictionary with topics and corresponding articles (indexes)
d = defaultdict(list)
conti = contt.copy() 
topi = topicc.copy()
for i, topic in enumerate(topi):
    if len(topic) > 1:
      for item in topic:
        d[item].append(i)
    else:
      d[topic[0]].append(i)

for t, a in d.items():
        freq = len(d[t])
        d[t] = (a, freq)

rare_topics = d.copy()  #dictionary with rare topics
rest_topics = d.copy()  #dictionary with the rest of the topics
for word, v in d.items():
    if d[word][1] >= 5:   
      del rare_topics[word]
    else:
      del rest_topics[word]

In [11]:
l = []    #removing topics with fewer than 5 articles
for topic, v in rare_topics.items():
  for elem in v[0]:
    if elem not in l:
      l.append(elem)
    else:
       pass

for number in l:
      topicc.remove(topi[number])
      contt.remove(conti[number])       

In [12]:
print(len(contt)) #length of dataset after removing rare topics
print(len(topicc))

21499
21499


In [13]:
# Searching for the best hyperparameters for the probabilistic model
import random
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(contt, topicc, test_size=0.2)

vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 3))

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

m = MultiLabelBinarizer()
y_train = m.fit_transform(y_train)
y_test = m.transform(y_test)

best_hyperparameters = None
print("Learning rate:\tTraining set accuracy:")

params = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
for value in params:
    model = OneVsRestClassifier(MultinomialNB(alpha = value))
    model.fit(X_train, y_train)
    
    training_accuracy = np.sum(model.predict(X_train)==y_train)/len(y_train)

    if best_hyperparameters is None or best_hyperparameters[1] < training_accuracy:
        best_hyperparameters = (value, training_accuracy)
    print(value, training_accuracy)

best_param = best_hyperparameters[0]
print('Best parameter: ', (best_param))
    

Learning rate:	Training set accuracy:
100 78.31775103203675
10 78.42194313622885
1 78.87516716088145
0.1 78.84051398337112
0.01 78.86063143206
0.001 78.92924007209722
0.0001 78.95703238560381
Best parameter:  0.0001


In [14]:
#PROBABILISTIC MODEL

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

#Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(contt, topicc, test_size=0.2)
#Vectorizing for Multinominal model
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 3))
#vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 3))

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

m = MultiLabelBinarizer()
y_train = m.fit_transform(y_train)
y_test = m.transform(y_test)

#training and predicting labels with probablistic model
classifier = OneVsRestClassifier(MultinomialNB(alpha=best_param))
print("Training the model...")
classifier.fit(X_train, y_train)
print("Predicting labels")
y_pred = classifier.predict(X_test)


print('F1 micro:', f1_score(y_test, y_pred, average='micro', labels=np.unique(y_pred)))
print('Hamming loss:', hamming_loss(y_test, y_pred))
print('Accuracy: %s ' %(accuracy_score(y_pred, y_test)*100))


  .format(sorted(unknown, key=str)))


Training the model...
Predicting labels
F1 micro: 0.8370279146141214
Hamming loss: 0.005793082886106142
Accuracy: 72.72093023255813 


In [15]:
# Searching for the best hyperparameters for the non-probabilistic model
import random
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(contt, topicc, test_size=0.2)

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 3))

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

m = MultiLabelBinarizer()
y_train = m.fit_transform(y_train)
y_test = m.transform(y_test)

best_hyperparameters = None
print("Learning rate:\tTraining set accuracy:")

param = [20, 10, 5, 1, 0.1, 0.01, 0.001]
for value in param:
    model = OneVsRestClassifier(LinearSVC(C = value))
    model.fit(X_train, y_train)
    
    training_accuracy = np.sum(model.predict(X_train)==y_train)/len(y_train)

    if best_hyperparameters is None or best_hyperparameters[1] < training_accuracy:
        best_hyperparameters = (value, training_accuracy)
    print(value, training_accuracy)

best_paramNP = best_hyperparameters[0]
print('Best parameter: ', (best_paramNP))
    

Learning rate:	Training set accuracy:
20 78.99465085179371
10 78.99453456596314
5 78.99412756555614
1 78.98912727484156
0.1 78.63963021105879
0.01 78.32234432234432
0.001 78.25018896447467
Best parameter:  20


In [16]:
# NON-PROBABILISTIC MODEL

from sklearn.feature_extraction.text import TfidfVectorizer  #vectorizing for SVC model
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.svm import LinearSVC


#Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(contt, topicc, test_size=0.2)

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1, 3))
#vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 3))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

m = MultiLabelBinarizer()
y_train = m.fit_transform(y_train)
y_test = m.transform(y_test)

#training and predicting labels with non-probablistic model
classifier = OneVsRestClassifier(LinearSVC(C = best_paramNP))

print("Training the model...")
classifier.fit(X_train, y_train)
print("Predicting labels")
y_pred = classifier.predict(X_test)
print('F1 micro:', f1_score(y_test, y_pred, average='micro', labels=np.unique(y_pred)))
print('Hamming loss:', hamming_loss(y_test, y_pred))
print('Accuracy: %s ' %(accuracy_score(y_pred, y_test)*100))


Training the model...
Predicting labels
F1 micro: 0.8841166936790924
Hamming loss: 0.004038857815719753
Accuracy: 80.48837209302326 
