In [17]:
import numpy as np

# to time models
from timeit import default_timer as timer

#sklearn essentials
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, plot_confusion_matrix, plot_precision_recall_curve

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning

# Fetch_20newsgroups

In [22]:
# Loading Data!

from sklearn.datasets import fetch_20newsgroups

# Splitting Data
newsgroup_info = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
ng_x_train, ng_x_test, ng_y_train, ng_y_test = train_test_split(newsgroup_info.data, newsgroup_info.target, 
                                                        test_size=0.2, random_state=0)

## Probabilistic Model - Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

rfc_ng_model = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True, norm='l2')),
    ('classifier', RandomForestClassifier(max_depth=10)) # to reduce time...
                    ], verbose=True)

In [20]:
print("Random Forest Classifier")
print()
start = timer()
rfc_ng_model.fit(ng_x_train, ng_y_train)
end = timer()
print()
print("Time taken in seconds to fit the model!", round(end-start, 5))

Random Forest Classifier

[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   9.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   1.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.2s

Time taken in seconds to fit the model! 14.40766


In [21]:
rfc_ng_y_pred = rfc_ng_model.predict(ng_x_test)
print(classification_report(ng_y_test, rfc_ng_y_pred, target_names=newsgroup_info.target_names, zero_division='warn'))
print('\n\n')
print("Using random forest classifier yieled an accuracy of --->", rfc_ng_model.score(ng_x_test, ng_y_test))
print("Using random forest classifier yieled a micro f1-score of --->", f1_score(ng_y_test, rfc_ng_y_pred, average='micro'))
print("Using random forest classifier yieled a macro f1-score of --->", f1_score(ng_y_test, rfc_ng_y_pred, average='macro'))

  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

             alt.atheism       1.00      0.07      0.14       163
           comp.graphics       0.55      0.54      0.55       190
 comp.os.ms-windows.misc       0.60      0.41      0.49       200
comp.sys.ibm.pc.hardware       0.57      0.60      0.58       196
   comp.sys.mac.hardware       0.77      0.58      0.66       201
          comp.windows.x       0.66      0.63      0.64       198
            misc.forsale       0.78      0.61      0.68       206
               rec.autos       0.14      0.66      0.23       177
         rec.motorcycles       0.62      0.58      0.60       189
      rec.sport.baseball       0.38      0.70      0.50       171
        rec.sport.hockey       0.86      0.67      0.75       233
               sci.crypt       0.71      0.66      0.68       190
         sci.electronics       0.60      0.23      0.33       207
                 sci.med       0.82      0.57      0.67       203
         

### Random Forest Classifier - Grid Search

In [None]:
parameters_rfc_ng = {
    # 'vectorizer__stop_words' : ('english', None),
    # 'vectorizer__ngram_range' : ((1, 1), (1, 2)), 
    # 'vectorizer__max_df' : (0.5, 0.75, 1.0),
    # 'vectorizer__max_features' : (5000, 10000, 50000),
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm' : ('l1', 'l2'),
    'classifier__n_estimators' : (100, 200, 300),
    'classifier__max_depth' : (25, 50, 100),
    'classifier__bootstrap' : (True, False)
}

In [None]:
grid_search_rfc_ng = GridSearchCV(rfc_ng_model, parameters_rfc_ng, cv=2, n_jobs=1)
start = timer()
grid_search_rfc_ng = grid_search_rfc_ng.fit(ng_x_train, ng_y_train)
end = timer()
print("Time taken to execute Grid search for Random Forest Classifieri!", round(end-start)) 

[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.7s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   7.7s
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   7.8s
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.5s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  15.3s
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  16.3s
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.6s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipel

In [None]:
print("Best Score for random forest classifier: ", grid_search_rfc_ng.best_score_)
print()
print("Best parameters:", grid_search_rfc_ng.best_params_)

Best Score for Multinomial Naive Bayes:  0.6671530910055719

Best parameters: {'classifier__bootstrap': False, 'classifier__max_depth': 100, 'classifier__n_estimators': 300}


In [None]:
grid_search_rfc_ng_y_pred = grid_search_rfc_ng.predict(ng_x_test)
print('SCORES WITH RANDOM FOREST CLASSIFIER AFTER GRID SEARCH')
print('\n\n')
print(classification_report(ng_y_test, grid_search_rfc_ng_y_pred, target_names=newsgroup_info.target_names, zero_division='warn'))
print('\n\n')
print("Using random forest classifier yieled an accuracy of --->", grid_search_rfc_ng.score(ng_x_test, ng_y_test))
print("Using random forest classifier a micro f1-score of --->", f1_score(ng_y_test, grid_search_rfc_ng_y_pred, average='micro'))
print("Using random forest classifier a macro f1-score of --->", f1_score(ng_y_test, grid_search_rfc_ng_y_pred, average='macro'))

SCORES WITH RANDOM FOREST CLASSIFIER AFTER GRID SEARCH



                          precision    recall  f1-score   support

             alt.atheism       0.73      0.42      0.53       163
           comp.graphics       0.62      0.64      0.63       190
 comp.os.ms-windows.misc       0.64      0.65      0.64       200
comp.sys.ibm.pc.hardware       0.65      0.65      0.65       196
   comp.sys.mac.hardware       0.82      0.67      0.74       201
          comp.windows.x       0.80      0.77      0.78       198
            misc.forsale       0.79      0.77      0.78       206
               rec.autos       0.56      0.72      0.63       177
         rec.motorcycles       0.32      0.78      0.45       189
      rec.sport.baseball       0.83      0.81      0.82       171
        rec.sport.hockey       0.85      0.85      0.85       233
               sci.crypt       0.83      0.72      0.77       190
         sci.electronics       0.78      0.57      0.66       207
                 

## Non-Probabilistic Model - Linear Support Vector Classification

In [3]:
from sklearn.svm import LinearSVC

# Linear Support Vector Classification - this should have worked better....right?

lsvc_ng_model = Pipeline([
                    ('vectorizer', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
                    ('tfidf', TfidfTransformer(use_idf=True, norm='l2')),
                    ('classifier', LinearSVC())  
                    ], verbose=True)

In [10]:
print("Linear Support Vector Classifier")
print()
start = timer()
lsvc_ng_model.fit(ng_x_train, ng_y_train)
end = timer()
print()
print('Time taken in seconds to fit the model!', round(end-start, 5))

Linear Support Vector Classifier

[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   8.6s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   1.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  10.8s

Time taken in seconds to fit the model! 20.41869


In [11]:
lsvc_ng_y_pred = lsvc_ng_model.predict(ng_x_test)
print(classification_report(ng_y_test, lsvc_ng_y_pred, target_names=newsgroup_info.target_names, zero_division='warn'))
print('\n\n')
print("Using Linear Support Vector Classification yieled an accuracy of --->", lsvc_ng_model.score(ng_x_test, ng_y_test))
print("Using support vector classification yielded a micro f1-score of --->", f1_score(ng_y_test, lsvc_ng_y_pred, average='micro'))
print("Using support vector classification yielded a micro f1-score of --->", f1_score(ng_y_test, lsvc_ng_y_pred, average='macro'))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.64      0.68       163
           comp.graphics       0.71      0.75      0.73       190
 comp.os.ms-windows.misc       0.73      0.69      0.71       200
comp.sys.ibm.pc.hardware       0.72      0.69      0.71       196
   comp.sys.mac.hardware       0.81      0.74      0.78       201
          comp.windows.x       0.83      0.83      0.83       198
            misc.forsale       0.80      0.83      0.82       206
               rec.autos       0.49      0.83      0.62       177
         rec.motorcycles       0.86      0.80      0.83       189
      rec.sport.baseball       0.87      0.87      0.87       171
        rec.sport.hockey       0.95      0.89      0.92       233
               sci.crypt       0.87      0.79      0.83       190
         sci.electronics       0.77      0.76      0.77       207
                 sci.med       0.86      0.85      0.85       203
         

### Linear Support Vector Classification - Grid Search

In [12]:
parameters_lsvc_ng = {
    # 'vectorizer__stop_words' : ('english', None),
    # 'vectorizer__ngram_range' : ((1, 1), (1, 2)),
    # 'vectorizer__max_df' : (0.5, 0.75, 1.0),
    # 'vectorizer__max_features' : (None, 5000, 10000, 50000),
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm' : ('l1', 'l2'),
    # 'classifier__penalty' : ('l1', 'l2'),
    'classifier__loss' : ('hinge', 'squared_hinge'),
    # 'classifier__dual' : (False, True),
    'classifier__multi_class' : ('ovr', 'crammer_singer'),
    'classifier__max_iter' : (1000, 2000, 5000)
}

In [14]:
grid_search_lsvc_ng = GridSearchCV(lsvc_ng_model, parameters_lsvc_ng, cv=2, n_jobs=1, verbose=3)

start = timer()
grid_search_lsvc_ng = grid_search_lsvc_ng.fit(ng_x_train, ng_y_train)
end = timer()
print("Time taken to execute Grid search for Support Vector Classification!", round(end-start))

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=ovr 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total=  19.7s
[CV]  classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=ovr, score=0.736, total=  26.6s
[CV] classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=ovr 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.6s remaining:    0.0s


[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total=  16.6s
[CV]  classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=ovr, score=0.738, total=  23.1s
[CV] classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=crammer_singer 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   49.8s remaining:    0.0s


[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.3s




[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.2min
[CV]  classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=crammer_singer, score=0.743, total= 1.3min
[CV] classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=crammer_singer 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.3min
[CV]  classifier__loss=hinge, classifier__max_iter=1000, classifier__multi_class=crammer_singer, score=0.739, total= 2.4min
[CV] classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  20.7s
[CV]  classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=ovr, score=0.734, total=  27.4s
[CV] classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total=  16.4s
[CV]  classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=ovr, score=0.738, total=  22.9s
[CV] classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=crammer_singer 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.3s




[Pipeline] ........ (step 3 of 3) Processing classifier, total=  53.1s
[CV]  classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=crammer_singer, score=0.742, total=  59.9s
[CV] classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=crammer_singer 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.2min
[CV]  classifier__loss=hinge, classifier__max_iter=2000, classifier__multi_class=crammer_singer, score=0.739, total= 2.3min
[CV] classifier__loss=hinge, classifier__max_iter=5000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  23.6s
[CV]  classifier__loss=hinge, classifier__max_iter=5000, classifier__multi_class=ovr, score=0.736, total=  30.3s
[CV] classifier__loss=hinge, classifier__max_iter=5000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  16.8s
[CV]  classifier__loss=hinge, classifier__max_iter=5000, classifier__multi_class=ovr, score=0.74



[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.4min
[CV]  classifier__loss=hinge, classifier__max_iter=5000, classifier__multi_class=crammer_singer, score=0.743, total= 1.5min
[CV] classifier__loss=hinge, classifier__max_iter=5000, classifier__multi_class=crammer_singer 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.9min
[CV]  classifier__loss=hinge, classifier__max_iter=5000, classifier__multi_class=crammer_singer, score=0.739, total= 3.0min
[CV] classifier__loss=squared_hinge, classifier__max_iter=1000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.5s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.4s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=1000, classifier__multi_class=ovr, score=0.742, total=  11.2s
[CV] classifier__loss=squared_hinge, classifier__max_iter=1000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.5s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=1000, classif



[Pipeline] ........ (step 3 of 3) Processing classifier, total= 1.2min
[CV]  classifier__loss=squared_hinge, classifier__max_iter=1000, classifier__multi_class=crammer_singer, score=0.743, total= 1.4min
[CV] classifier__loss=squared_hinge, classifier__max_iter=1000, classifier__multi_class=crammer_singer 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.2min
[CV]  classifier__loss=squared_hinge, classifier__max_iter=1000, classifier__multi_class=crammer_singer, score=0.738, total= 2.3min
[CV] classifier__loss=squared_hinge, classifier__max_iter=2000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.4s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=2000, classifier__multi_class=ovr, score=0.742, total=  11.2s
[CV] classifier__loss=squared_hinge, classifier__max_iter=2000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.3s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.5s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=2000,



[Pipeline] ........ (step 3 of 3) Processing classifier, total=  58.0s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=2000, classifier__multi_class=crammer_singer, score=0.742, total= 1.1min
[CV] classifier__loss=squared_hinge, classifier__max_iter=2000, classifier__multi_class=crammer_singer 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.0min
[CV]  classifier__loss=squared_hinge, classifier__max_iter=2000, classifier__multi_class=crammer_singer, score=0.739, total= 2.1min
[CV] classifier__loss=squared_hinge, classifier__max_iter=5000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.5s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=5000, classifier__multi_class=ovr, score=0.742, total=  11.3s
[CV] classifier__loss=squared_hinge, classifier__max_iter=5000, classifier__multi_class=ovr 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   4.3s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=5000,



[Pipeline] ........ (step 3 of 3) Processing classifier, total=  42.9s
[CV]  classifier__loss=squared_hinge, classifier__max_iter=5000, classifier__multi_class=crammer_singer, score=0.743, total=  49.7s
[CV] classifier__loss=squared_hinge, classifier__max_iter=5000, classifier__multi_class=crammer_singer 
[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   4.2s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.4s




[Pipeline] ........ (step 3 of 3) Processing classifier, total= 2.8min
[CV]  classifier__loss=squared_hinge, classifier__max_iter=5000, classifier__multi_class=crammer_singer, score=0.739, total= 2.9min


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 25.7min finished


[Pipeline] ........ (step 1 of 3) Processing vectorizer, total=   8.4s
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 5.8min
Time taken to execute Grid search for Support Vector Classification! 1897




In [15]:
print("Best Score:", grid_search_lsvc_ng.best_score_)
print()
print("Best Parameters", grid_search_lsvc_ng.best_params_)

Best Score: 0.7409127089413637

Best Parameters {'classifier__loss': 'hinge', 'classifier__max_iter': 1000, 'classifier__multi_class': 'crammer_singer'}


In [16]:
grid_search_lsvc_ng_y_pred = grid_search_lsvc_ng.predict(ng_x_test)
print('SCORES WITH LINEAR SUPPORT VECTOR CLASSIFICATION AFTER GRID SEARCH')
print('\n\n')
print(classification_report(ng_y_test, grid_search_lsvc_ng_y_pred, target_names=newsgroup_info.target_names, zero_division='warn'))
print('\n\n')
print("Using linear support vector classification yieled an accuracy of --->", grid_search_lsvc_ng.score(ng_x_test, ng_y_test))
print("Using linear support vector classification a micro f1-score of --->", f1_score(ng_y_test, grid_search_lsvc_ng_y_pred, average='micro'))
print("Using linear support vector classification a macro f1-score of --->", f1_score(ng_y_test, grid_search_lsvc_ng_y_pred, average='macro'))

SCORES WITH RANDOM FOREST CLASSIFIER AFTER GRID SEARCH



                          precision    recall  f1-score   support

             alt.atheism       0.73      0.66      0.69       163
           comp.graphics       0.71      0.75      0.73       190
 comp.os.ms-windows.misc       0.72      0.68      0.70       200
comp.sys.ibm.pc.hardware       0.74      0.69      0.71       196
   comp.sys.mac.hardware       0.81      0.74      0.78       201
          comp.windows.x       0.83      0.84      0.84       198
            misc.forsale       0.81      0.83      0.82       206
               rec.autos       0.78      0.80      0.79       177
         rec.motorcycles       0.85      0.80      0.83       189
      rec.sport.baseball       0.84      0.87      0.86       171
        rec.sport.hockey       0.95      0.88      0.91       233
               sci.crypt       0.89      0.79      0.84       190
         sci.electronics       0.79      0.78      0.79       207
                 

# Reatures Dataset

## Loading and Parsing Data

In [6]:
# Loading Data

!wget -N http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz

--2021-06-13 17:49:29--  http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8150596 (7.8M) [application/x-gzip]
Saving to: ‘reuters21578.tar.gz’


2021-06-13 17:49:31 (3.02 MB/s) - ‘reuters21578.tar.gz’ saved [8150596/8150596]



In [7]:
# Parsing Data
from bs4 import BeautifulSoup as bs
from collections import defaultdict 

xml_tags = [ 'title', 'date', 'dateline']

attribute_values = ['cgisplit', 'lewissplit', 'newid', 'oldid', 'topics']

d_xml_tags = ['topics', 'places', 'orgs', 'exchanges', 'companies']

info_dict = defaultdict(lambda: defaultdict(lambda: '') )

# counter used to name different documents (title can be used instead, working 
# with this for now)

counter = 0 

files = !tar xvzf reuters21578.tar.gz

for file in files[9:]:
    with open(file, 'rb') as f2:
        if file.endswith('.sgm'):
            # file = file[6:9]
            filecontent = f2.readlines()
            filecontent = b"".join(filecontent)

            # BS whole xml file
            soup = bs(filecontent, 'lxml')

            # Reuters are used to delimit documents....
            documents = soup.find_all('reuters')

            # Three different XML extracts...
            # (i) parsing date tags... e.g. <date> </date>
            for doc in documents:
                counter += 1
                for tag1 in xml_tags:
                    try:
                        # (doc.find('title').text)
                        to_add = doc.find(tag1).text.strip()
                        info_dict[counter][tag1] += to_add

                    except AttributeError:
                        info_dict[counter][tag1] += str(None)

            # # (ii) parsing attribute values e.g. cgisplit='example'
                # for attribute in attribute_values:
                #     try:
                #         to_add = doc.get(attribute)
                #         info_dict[counter][attribute] += (to_add)
                #     except:
                #         info_dict[counter][attribute] += str(None)

            # (iii) parsing child tags, i.e. find in find_all.. (hierarchical tags)
                for tag2 in d_xml_tags:
                    to_concatenate = [elem.text for elem in doc.find(tag2).findAll('d')]
                    if len(to_concatenate) > 0:
                        info_dict[counter][tag2] += ' '.join(to_concatenate)
                    else:
                        if tag2 != 'topics':
                            info_dict[counter][tag2] += str(None)

            # finally extracting texts..
                text_tag = doc.find('text')
                text = text_tag.text.strip()
                
                try:
                    text = text.replace(text_tag.title.text, '')
                except:
                    pass
                try:
                    text = text.replace(text_tag.dateline.text, '')
                except:
                    pass
                text = text.replace('\n', ' ')
                info_dict[counter]['text'] += text

In [9]:
# Appending the raw texts and labels (places) to lists
X_raw = [] 
labels = []

for key, value in info_dict.items():
    labels.append(value['places'].split())
    X_raw.append(value['text'])

## Probabilsitc Model - Random Forest Classifier

In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_raw)

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

rfc_reut_model = RandomForestClassifier(max_depth=10)

reut_x_train, reut_x_test, reut_y_train, reut_y_test = train_test_split(X, y, 
                                                        test_size=0.33, random_state=0)

# https://github.com/davidsbatista/text-classification/issues/1

In [7]:
print("Random Forest Classifier")
print()
start = timer()
rfc_reut_model.fit(reut_x_train, reut_y_train)
end = timer()
print()
print("Time taken in seconds to fit the model!", round(end-start, 5))

Random Forest Classifier


Time taken in seconds to fit the model! 30.61041


In [8]:
rfc_reut_y_pred = rfc_reut_model.predict(reut_x_test)
print(classification_report(reut_y_test, rfc_reut_y_pred, target_names=mlb.classes_, zero_division='warn'))
print('\n\n')
print("Usireut random forest classifier yieled an accuracy of --->", rfc_reut_model.score(reut_x_test, reut_y_test))
print("Usireut random forest classifier yieled a micro f1-score of --->", f1_score(reut_y_test, rfc_reut_y_pred, average='micro'))
print("Usireut random forest classifier yieled a macro f1-score of --->", f1_score(reut_y_test, rfc_reut_y_pred, average='macro'))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

               None       1.00      0.01      0.02       884
        afghanistan       0.00      0.00      0.00         1
            algeria       0.00      0.00      0.00        11
             angola       0.00      0.00      0.00         1
            antigua       0.00      0.00      0.00         0
          argentina       0.00      0.00      0.00        28
              aruba       0.00      0.00      0.00         1
          australia       0.00      0.00      0.00        85
            austria       0.00      0.00      0.00         6
            bahamas       0.00      0.00      0.00         0
            bahrain       0.00      0.00      0.00        13
         bangladesh       0.00      0.00      0.00        11
           barbados       0.00      0.00      0.00         0
            belgium       0.00      0.00      0.00        69
              benin       0.00      0.00      0.00         0
            bermuda    

  average, "true nor predicted", 'F-score is', len(true_sum)


### Random Forest Classifier - Grid Search

In [10]:
print('Available parameters to use for grid search----> ', rfc_reut_model.get_params().keys())
parameters_rfc_reut = {
    'bootstrap' : (True, False),
    'n_estimators' : (100, 200, 500),
    'max_depth' : (25, 50, 100)
}

Available parameters to use for grid search---->  dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])


In [14]:
grid_search_rfc_reut = GridSearchCV(rfc_reut_model, parameters_rfc_reut, cv=2, n_jobs=1, verbose=3)
start = timer()
grid_search_rfc_reut = grid_search_rfc_reut.fit(reut_x_train, reut_y_train)
end = timer()
print("Time taken to execute Grid search for Random Forest Classifier!", round(end-start)) 

Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] bootstrap=True, max_depth=25, n_estimators=100 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, max_depth=25, n_estimators=100, score=0.582, total=  35.1s
[CV] bootstrap=True, max_depth=25, n_estimators=100 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.1s remaining:    0.0s


[CV]  bootstrap=True, max_depth=25, n_estimators=100, score=0.592, total=  35.1s
[CV] bootstrap=True, max_depth=25, n_estimators=200 ..................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  bootstrap=True, max_depth=25, n_estimators=200, score=0.579, total= 1.2min
[CV] bootstrap=True, max_depth=25, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=25, n_estimators=200, score=0.590, total= 1.1min
[CV] bootstrap=True, max_depth=25, n_estimators=500 ..................
[CV]  bootstrap=True, max_depth=25, n_estimators=500, score=0.581, total= 2.7min
[CV] bootstrap=True, max_depth=25, n_estimators=500 ..................
[CV]  bootstrap=True, max_depth=25, n_estimators=500, score=0.590, total= 2.8min
[CV] bootstrap=True, max_depth=50, n_estimators=100 ..................
[CV]  bootstrap=True, max_depth=50, n_estimators=100, score=0.601, total=  48.9s
[CV] bootstrap=True, max_depth=50, n_estimators=100 ..................
[CV]  bootstrap=True, max_depth=50, n_estimators=100, score=0.614, total=  48.5s
[CV] bootstrap=True, max_depth=50, n_estimators=200 ..................
[CV]  bootstrap=True, max_depth=50, n_estimators=200, score=0.603, total= 1.6min
[CV] bo

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 86.7min finished


Time taken to execute Grid search for Random Forest Classifier! 5528


In [15]:
grid_search_rfc_reut_y_pred = grid_search_rfc_reut.predict(reut_x_test)
print('SCORES WITH RANDOM FOREST CLASSIFIER AFTER GRID SEARCH')
print('\n\n')
print(classification_report(reut_y_test, grid_search_rfc_reut_y_pred, zero_division='warn'))
print('\n\n')
print("Using random forest classifier yieled an accuracy of --->", grid_search_rfc_reut.score(reut_x_test, reut_y_test))
print("Using random forest classifier0 a micro f1-score of --->", f1_score(reut_y_test, grid_search_rfc_reut_y_pred, average='micro'))
print("Using random forest classifier a macro f1-score of --->", f1_score(reut_y_test, grid_search_rfc_reut_y_pred, average='macro'))

SCORES WITH RANDOM FOREST CLASSIFIER AFTER GRID SEARCH



              precision    recall  f1-score   support

           0       0.87      0.81      0.84       884
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00        11
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         0
           5       1.00      0.07      0.13        28
           6       0.00      0.00      0.00         1
           7       1.00      0.08      0.15        85
           8       0.00      0.00      0.00         6
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00        13
          11       1.00      0.09      0.17        11
          12       0.00      0.00      0.00         0
          13       0.83      0.07      0.13        69
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         2
          16       0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Using multinomial naive bayes yieled an accuracy of ---> 0.6476618452464542
Using multinomial naive bayes a micro f1-score of ---> 0.7222612853325833
Using multinomial naive bayes a macro f1-score of ---> 0.08251650316175263


  average, "true nor predicted", 'F-score is', len(true_sum)


## Non-Probabilistic Model - Linear Support Vector Classification

In [32]:
lsvc_reut_model = LinearSVC()

tf_vectorizer = TfidfVectorizer() # stop_words='english', ngram_range=(1, 2), norm='l2')
X = tf_vectorizer.fit_transform(X_raw)

le = LabelEncoder()
string_labels = [', '.join(ele) for ele in labels]
y = le.fit_transform(string_labels)


reut_x_train, reut_x_test, reut_y_train, reut_y_test = train_test_split(X, y, 
                                                        test_size=0.33, random_state=0)

# https://github.com/davidsbatista/text-classification/issues/1

In [33]:
print("Linear Support Vector Classifier")
print()
start = timer()
lsvc_reut_model.fit(reut_x_train, reut_y_train)
end = timer()
print()
print('Time taken in seconds to fit the model!', round(end-start, 5))

Linear Support Vector Classifier


Time taken in seconds to fit the model! 43.57174


In [34]:
lsvc_reut_y_pred = lsvc_reut_model.predict(reut_x_test)
print(classification_report(reut_y_test, lsvc_reut_y_pred,zero_division='warn'))
print('\n\n')
print("Using Linear Support Vector Classification yieled an accuracy of --->", lsvc_reut_model.score(reut_x_test, reut_y_test))
print("Using support vector classification yielded a micro f1-score of --->", f1_score(reut_y_test, lsvc_reut_y_pred, average='micro'))
print("Using support vector classification yielded a micro f1-score of --->", f1_score(reut_y_test, lsvc_reut_y_pred, average='macro'))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.83      0.87      0.85       884
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         0
           7       0.88      0.58      0.70        12
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         1
          13       1.00      1.00      1.00         1
          14       0.00      0.00      0.00         1
          15       0.87      0.87      0.87        60
          16       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         0
          21       0.00      0.00      0.00         1
          22       0.00    

### Linear Support Vector Classification - Grid Search

In [35]:
print('Available parameters to use for grid search----> ', lsvc_reut_model.get_params().keys())

Available parameters to use for grid search---->  dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'])


In [36]:
parameters_lsvc_reut = {
    'C' : (1, 2),
    # 'penalty' : ('l1', 'l2'),
    'multi_class' : ('ovr', 'crammer_singer'),
    'loss' : ('hinge', 'squared_hinge')
}

In [37]:
grid_search_lsvc_reut = GridSearchCV(lsvc_reut_model, parameters_lsvc_reut, cv=2, n_jobs=1, verbose=3)

start = timer()
grid_search_lsvc_reut = grid_search_lsvc_reut.fit(reut_x_train, reut_y_train)
end = timer()
print("Time taken to execute Grid search for Support Vector Classification!", round(end-start))

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] C=1, loss=hinge, multi_class=ovr ................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... C=1, loss=hinge, multi_class=ovr, score=0.780, total=  18.6s
[CV] C=1, loss=hinge, multi_class=ovr ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.6s remaining:    0.0s


[CV] .... C=1, loss=hinge, multi_class=ovr, score=0.773, total=  14.9s
[CV] C=1, loss=hinge, multi_class=crammer_singer .....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   33.5s remaining:    0.0s


[CV]  C=1, loss=hinge, multi_class=crammer_singer, score=0.789, total=  43.6s
[CV] C=1, loss=hinge, multi_class=crammer_singer .....................
[CV]  C=1, loss=hinge, multi_class=crammer_singer, score=0.783, total=  44.7s
[CV] C=1, loss=squared_hinge, multi_class=ovr ........................
[CV]  C=1, loss=squared_hinge, multi_class=ovr, score=0.784, total=  12.0s
[CV] C=1, loss=squared_hinge, multi_class=ovr ........................
[CV]  C=1, loss=squared_hinge, multi_class=ovr, score=0.777, total=  12.0s
[CV] C=1, loss=squared_hinge, multi_class=crammer_singer .............
[CV]  C=1, loss=squared_hinge, multi_class=crammer_singer, score=0.789, total=  50.5s
[CV] C=1, loss=squared_hinge, multi_class=crammer_singer .............
[CV]  C=1, loss=squared_hinge, multi_class=crammer_singer, score=0.783, total= 1.2min
[CV] C=2, loss=hinge, multi_class=ovr ................................
[CV] .... C=2, loss=hinge, multi_class=ovr, score=0.788, total=  15.6s
[CV] C=2, loss=hinge, mul

[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed: 11.0min finished


Time taken to execute Grid search for Support Vector Classification! 897


In [38]:
print("Best Score:", grid_search_lsvc_reut.best_score_)
print()
print("Best Parameters", grid_search_lsvc_reut.best_params_)

Best Score: 0.7860549818442488

Best Parameters {'C': 1, 'loss': 'hinge', 'multi_class': 'crammer_singer'}


In [39]:
grid_search_lsvc_reut_y_pred = grid_search_lsvc_reut.predict(reut_x_test)
print('SCORES WITH RANDOM FOREST CLASSIFIER AFTER GRID SEARCH')
print('\n\n')
print(classification_report(reut_y_test, grid_search_lsvc_reut_y_pred, zero_division='warn'))
print('\n\n')
print("Using random forest classifier yieled an accuracy of --->", grid_search_lsvc_reut.score(reut_x_test, reut_y_test))
print("Using random forest classifier a micro f1-score of --->", f1_score(reut_y_test, grid_search_lsvc_reut_y_pred, average='micro'))
print("Using random forest classifier a macro f1-score of --->", f1_score(reut_y_test, grid_search_lsvc_reut_y_pred, average='macro'))

SCORES WITH RANDOM FOREST CLASSIFIER AFTER GRID SEARCH



              precision    recall  f1-score   support

           0       0.84      0.88      0.86       884
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         0
           7       0.88      0.58      0.70        12
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         1
          13       1.00      1.00      1.00         1
          14       0.00      0.00      0.00         1
          15       0.78      0.87      0.82        60
          16       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         0
          21       0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Usireut multinomial naive bayes yieled an accuracy of ---> 0.8171605111641623
Usireut multinomial naive bayes a micro f1-score of ---> 0.8171605111641623
Usireut multinomial naive bayes a macro f1-score of ---> 0.15974942196021472


# Design Choices and Qualitative Analysis 

The two datasets used for this machine learning model were parsed quite differently. The newsgroup dataset was easily parsed through sklearn's method whereas the Reuters required the use of regular expressions and beautiful soup.

The Random Forest Classifieri was utilized as a probabilistic model whilst the Linear Support Vector Classifier was used as a non-probablistic model, for both datasets. Given the larger size of the Reuters dataset, stopwords and the n-gram were removed from the encoding when working with said dataset. This decision was taken after noticing the computational time needed to grid search for the best hyperparameters of the model. On the other hand, stopwords, n-grams, idf and the norm parameter were implemented on the encoding for the newsgroup dataset. From this point forward, both of these structures will be referred to as 'base model'. Furhermore, the maximum depth of the tree for the RFC was set to 10 to reduce the computational time needed to fit the model. In any scenario, expanding the nodes until all leaves are pure would yield the best possible result however this needed to be sacrified to balance out the time needed.

It is worth noting that when working with the Reuters dataset, the train/test split ration of 33/66 was chosen to avoid getting such errors in the metrics: 'ValueError: The least populated class in y has only 1 member, which is too few'. This suggestion was taken from the following github page: https://github.com/davidsbatista/text-classification/issues/1. 

The base model of the Random Forest Classifier yieled an accuracy of 51% for both datasets. The hyperparameter grid search improved the accuracy for the Newsgroup (NG)dataset by 17% and for the Reuters dataset by 13%. The base model of the LinearSVC yieled an accuracy of 77% and of 81% for the NG dataset and the Reuters dataset respectively. The gridsearch had no signifcant on the first dataset (an increase of 0.004%) whereas in increased the accuracy by 4% for the Reuters Dataset.

After comparing the SVC and LinearSVC on the newsgroup dataset (on a different notebook), the latter was chosen given how slow the former is train on such large datasets. However, using the the SVC with a non-linear kernel, intuitively and based upon my limited general knowledge of the machine learning, would have yieled better results due to the nature of the classification problem. The non-linear kernel function would generally yield better classification when a linear seperation cannot be found. 

Encoding and tokenzing (the reuters text) in a more detailed and efficient manner would have been another possible way to improve the results. Any possible comments/feedback on such matter would be greatly appreciated.



# General Comments

All in all, this assignment served well to experiment with different types of encoding and models. Personally, I would really like to dig deeper into what types of pre-processing, encoding and models would work best with specific datasets. I think being adapative in the field of a machine learning is one important skill to have, one that I really hope to acquire in the future. 