# **Document classifier based on Latent Semantic Analysis**

## Preprocessing and loading the data

In [None]:
'''Downloading Reuter's dataset'''
!wget -N http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
!tar zxf reuters21578.tar.gz
!ls

--2021-05-27 18:47:29--  http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘reuters21578.tar.gz’ not modified on server. Omitting download.

all-exchanges-strings.lc.txt	    reut2-002.sgm  reut2-013.sgm
all-orgs-strings.lc.txt		    reut2-003.sgm  reut2-014.sgm
all-people-strings.lc.txt	    reut2-004.sgm  reut2-015.sgm
all-places-strings.lc.txt	    reut2-005.sgm  reut2-016.sgm
all-topics-strings.lc.txt	    reut2-006.sgm  reut2-017.sgm
cat-descriptions_120396.txt	    reut2-007.sgm  reut2-018.sgm
feldman-cia-worldfactbook-data.txt  reut2-008.sgm  reut2-019.sgm
lewis.dtd			    reut2-009.sgm  reut2-020.sgm
README.txt			    reut2-010.sgm  reut2-021.sgm
reut2-000.sgm			    reut2-011.sgm  reuters21578.tar.gz
reut2-001.sgm			    reut2-012.sgm  sample_data


In [None]:
#loading the data
import re
import os
from bs4 import BeautifulSoup

cwd = '/content/'
files = os.listdir(cwd)

raw_y = []  #labels
raw_X = []  #data

for file in files:
  if file.startswith("reut2-") and file.endswith(".sgm"): 
    with open(file, 'rb') as f:

      print('Reading file: %s' % file)

      soup = BeautifulSoup(f, "html.parser")  
      articles = soup.find_all('reuters')

      for article in articles:
        topics = article.findAll('topics')
        bodies = article.findAll('text')

        if len(topics[0]) == 0:
          topics_list = ''  #no topic
        else:
          topics_list = []
          for topic in topics[0]:  #getting rid of the tags 
            topics_list.append((topic.text).lower())
          topics_list = topics_list[0] #taking only the first topic
        raw_y.append(topics_list)
        
        if len(bodies) == 0:
          raw_X.append('')
        else:
          body = (bodies[0].text).lower()
          body = re.sub('[^A-Za-z0-9]+', ' ', body) #getting rid of special char
          body = re.sub(r'\d+','nn', body)  #getting rid of numbers
          raw_X.append(body)

Reading file: reut2-012.sgm
Reading file: reut2-021.sgm
Reading file: reut2-011.sgm
Reading file: reut2-010.sgm
Reading file: reut2-016.sgm
Reading file: reut2-019.sgm
Reading file: reut2-006.sgm
Reading file: reut2-017.sgm
Reading file: reut2-008.sgm


In [None]:
print(raw_y[:5])
print(raw_X[:5])

['earn', 'wheat', 'sugar', '', 'earn']
[' republic savings and loan rsla sets dividend milwaukee wis april nn qtly div nn cts vs nn cts prior pay april nn record april nn note company s full name is republic savings and loan association of wisconsin reuter ', ' shultz ussr trip fuels talk of eep wheat offer by nelson graves reuters washington april nn speculation the united states will offer subsidized wheat to the soviet union appears to have reached a new level of intensity in the run up to secretary of state george shultz visit later this month to moscow rumors of an impending deal have coursed through wheat markets since officials from the two countries held their customary semi annual grain talks in february moscow s decision at that time to reenter the u s corn market strengthened the perception of warming farm trade prospects shultz is set to arrive in moscow april nn shultz statement two weeks ago that he would not stand in the way of a wheat subsidy offer under the export enha

In [None]:
assert len(raw_y) == len(raw_X)

## Topic Modeling

### Document Term Matrix

Applying Tf-idf to create Document-Term Matrix

In [None]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(analyzer='word',
                             stop_words='english',  #filtering out stop words
                             min_df=5, #ignore terms that have a document frequency lower than 5
                             max_df=0.95, #ignore terms that appear in 95% of documents
                             max_features=10000, #10000 most frequently appearing words
                             strip_accents='ascii')

X = vectorizer.fit_transform(raw_X) 

In [None]:
print(vectorizer.get_feature_names()[100:200]) 
print(X.shape)

['acted', 'acting', 'action', 'actions', 'active', 'actively', 'actives', 'activities', 'activity', 'acton', 'acts', 'actual', 'actually', 'acute', 'ad', 'ada', 'adams', 'adapt', 'adb', 'add', 'added', 'adding', 'addition', 'additional', 'additionally', 'additions', 'address', 'addressed', 'addressing', 'adds', 'adelaide', 'adequacy', 'adequate', 'adequately', 'adhere', 'adherence', 'adhering', 'adhesives', 'adjacent', 'adjourned', 'adjust', 'adjustable', 'adjusted', 'adjusting', 'adjustment', 'adjustments', 'adjusts', 'adm', 'administered', 'administration', 'administrative', 'administrator', 'admission', 'admit', 'admitted', 'admitting', 'adobe', 'adopt', 'adopted', 'adopting', 'adoption', 'adopts', 'adr', 'ads', 'advance', 'advanced', 'advances', 'advancing', 'advantage', 'advantageous', 'advantages', 'adventure', 'adverse', 'adversely', 'advertisement', 'advertising', 'advest', 'advice', 'advise', 'advised', 'adviser', 'advisers', 'advisor', 'advisors', 'advisory', 'advocate', 'adv

### Singular Value Decomposition 

In [None]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=130, random_state=1) 
lsa = svd.fit(X)

In [None]:
terms = vectorizer.get_feature_names()

for i, component in enumerate(lsa.components_):
  terms_components = zip(terms, component)
  sorted_terms = sorted(terms_components, key=lambda x:x[1], reverse=True)[:5]
  temp =[t[0] for t in sorted_terms]
  print(f'Topic {i+1}: {temp}\n')

Topic 1: ['vs', 'cts', 'mln', 'net', 'loss']

Topic 2: ['blah', 'pct', 'says', 'said', 'dlrs']

Topic 3: ['said', 'pct', 'dlrs', 'mln', 'billion']

Topic 4: ['cts', 'qtly', 'div', 'record', 'prior']

Topic 5: ['loss', 'profit', 'said', 'company', 'shares']

Topic 6: ['shares', 'mln', 'stock', 'common', 'company']

Topic 7: ['pct', 'mln', 'dlrs', 'loss', 'february']

Topic 8: ['billion', 'dlrs', 'quarter', 'div', 'qtly']

Topic 9: ['bank', 'billion', 'stg', 'loss', 'debt']

Topic 10: ['mln', 'stg', 'tonnes', 'bank', 'div']

Topic 11: ['shares', 'stg', 'stock', 'billion', 'tonnes']

Topic 12: ['tonnes', 'billion', 'wheat', 'bond', 'trade']

Topic 13: ['oper', 'dlrs', 'tonnes', 'share', 'stock']

Topic 14: ['japan', 'trade', 'yen', 'mln', 'japanese']

Topic 15: ['mln', 'debt', 'offering', 'february', 'debentures']

Topic 16: ['oil', 'crude', 'prices', 'opec', 'gas']

Topic 17: ['maynn', 'julnn', 'sepnn', 'sales', 'decnn']

Topic 18: ['maynn', 'julnn', 'sepnn', 'untrd', 'decnn']

Topic 19:

In [None]:
len(lsa.components_)

130

## Classification

Grid Search for best parameters

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

In [None]:
# Splitting the data X and y into training and testing sets with the testing size
X_train, X_test, y_train, y_test = train_test_split(raw_X, raw_y, test_size = 0.2)

In [None]:
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [None]:
#Grid search

#Hyperparameters 
components = [10, 20, 50, 100, 120, 150, 170, 200, 220, 250]
value_neighbors = [3, 5, 7, 10]

#dataframe
results = []

#vectorizer 
vectorizer = TfidfVectorizer(analyzer='word',
                             stop_words='english', #filtering out stop words
                             min_df=5, #ignore terms that have a document frequency lower than 5
                             max_df=0.95, #ignore terms that appear in 95% of documents
                             max_features=10000, #10000 most frequently appearing words
                             strip_accents='ascii')

#grid search
for value in components:
  for v in value_neighbors:

    print(f'Training with hyperparameters: number of components [{value}], number of neigbors [{v}].')

    svd = TruncatedSVD(n_components=value, random_state=1)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    #X_test_vectorized = vectorizer.transform(X_test)

    X_train_lsa = svd.fit_transform(X_train_vectorized)
    #X_test_lsa = svd.transform(X_test_vectorized)

    knn_lsa = KNeighborsClassifier(n_neighbors=v)
    knn_lsa.fit(X_train_lsa, y_train)


    predicted = knn_lsa.predict(X_train_lsa)

    acc = (np.sum(predicted==y_train)/len(y_train)) *100
           
    #dataframe
    results.append({
                  'number of components': value,
                  'number of neigbors': v,
                  'correct labels': f'{right_labels}/{len(y_train)}',
                  'Accuracy': ("%.2f%%" % (acc))
                    })


Training with hyperparameters: number of components [10], number of neigbors [3].
Training with hyperparameters: number of components [10], number of neigbors [5].
Training with hyperparameters: number of components [10], number of neigbors [7].
Training with hyperparameters: number of components [10], number of neigbors [10].
Training with hyperparameters: number of components [20], number of neigbors [3].
Training with hyperparameters: number of components [20], number of neigbors [5].
Training with hyperparameters: number of components [20], number of neigbors [7].
Training with hyperparameters: number of components [20], number of neigbors [10].
Training with hyperparameters: number of components [50], number of neigbors [3].
Training with hyperparameters: number of components [50], number of neigbors [5].
Training with hyperparameters: number of components [50], number of neigbors [7].
Training with hyperparameters: number of components [50], number of neigbors [10].
Training with

In [None]:
import pandas as pd
results = pd.DataFrame(results)
results = results.sort_values(by='Accuracy', ascending=False)

#table of hyperparameters
#sorted by accuracy
print('------------------------------------------------------------------------')
print('Results of training hyperparameters')
print('------------------------------------------------------------------------')
print(results)
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Results of training hyperparameters
------------------------------------------------------------------------
    number of components  number of neigbors correct labels Accuracy
20                   150                   3     3450/17262   88.93%
32                   220                   3     3450/17262   88.83%
28                   200                   3     3450/17262   88.80%
24                   170                   3     3450/17262   88.79%
16                   120                   3     3450/17262   88.74%
36                   250                   3     3450/17262   88.73%
12                   100                   3     3450/17262   88.66%
8                     50                   3     3450/17262   88.01%
29                   200                   5     3450/17262   86.62%
25                   170                   5     3450/17262   86.55%
21                   150                   5     3450/17262

In [None]:
#Final model with the best hyperparameters

svd = TruncatedSVD(n_components=150, random_state=1) 
vectorizer = TfidfVectorizer(analyzer='word',
                             stop_words='english', #filtering out stop words
                             min_df=5, #ignore terms that have a document frequency lower than 5
                             max_df=0.95, #ignore terms that appear in 95% of documents
                             max_features=10000, #10000 most frequently appearing words
                             strip_accents='ascii')

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

X_train_lsa = svd.fit_transform(X_train_vectorized)
X_test_lsa = svd.transform(X_test_vectorized)


knn_lsa = KNeighborsClassifier(n_neighbors=3)
knn_lsa.fit(X_train_lsa, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
print('Predicting labels...')
predicted_labels = knn_lsa.predict(X_test_lsa)

test_accuracy = (np.sum(predicted_labels==y_test)/len(y_test)) *100
print('----------------------------------------------------------------------')
print(f'Test accuracy: {("%.4f%%" % (test_accuracy))}')

Predicting labels...
----------------------------------------------------------------------
Test accuracy: 80.8387%


## Evaluation

In [None]:
import warnings
from sklearn.model_selection import cross_val_score

with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  
  X_vectorized = vectorizer.fit_transform(raw_X)

  X_lsa = svd.fit_transform(X_vectorized)
  
  scores = cross_val_score(knn_lsa, X_lsa, raw_y, cv=5)

In [None]:
#Crossvalidation score
print('----------------------------------------------------------------------')
print('Crossvalidation:')
print(scores)
print('----------------------------------------------------------------------')

----------------------------------------------------------------------
Crossvalidation:
[0.79170528 0.78568119 0.78081557 0.801854   0.80440324]
----------------------------------------------------------------------


In [None]:
from sklearn.metrics import classification_report

with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  report = classification_report(y_test, predicted_labels)

  
print('----------------------------------------------------------------------')
print('Classification report:')
print('----------------------------------------------------------------------')
print(report)

----------------------------------------------------------------------
Classification report:
----------------------------------------------------------------------
                 precision    recall  f1-score   support

                      0.82      0.90      0.85      2058
            acq       0.75      0.72      0.73       442
           alum       1.00      0.27      0.43        11
            bop       0.50      0.40      0.44        10
        carcass       0.25      0.20      0.22         5
          cocoa       0.92      1.00      0.96        12
         coffee       0.88      1.00      0.94        23
         copper       1.00      0.60      0.75        10
           corn       0.00      0.00      0.00         1
         cotton       0.00      0.00      0.00         7
            cpi       0.62      0.48      0.54        21
            cpu       0.00      0.00      0.00         3
          crude       0.83      0.79      0.81       108
            dlr       0.50      0.17

## Baseline model


In [None]:
#simple baseline model

vectorizer = TfidfVectorizer()

X_train_baseline= vectorizer.fit_transform(X_train)
X_test_baseline= vectorizer.transform(X_test)

knn_baseline = KNeighborsClassifier()
knn_baseline.fit(X_train_baseline, y_train)

#predict

baseline_preds = knn_baseline.predict(X_test_baseline)

#accuracy

baseline_accuracy = (np.sum(baseline_preds==y_test)/len(y_test)) *100

In [None]:
print('----------------------------------------------------------------------')
print(f'Test accuracy: {("%.2f%%" % (baseline_accuracy))}')
print('----------------------------------------------------------------------')

----------------------------------------------------------------------
Test accuracy: 77.94%
----------------------------------------------------------------------
