In [15]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [16]:
# Training split of the 20 Newsgroups corpus, shuffled on load
twenty_train = fetch_20newsgroups(shuffle=True, subset='train')

In [17]:
# Learn the vocabulary and IDF weights from the training posts and build the
# TF-IDF training matrix in a single pass
tfidf_Vect = TfidfVectorizer()
X_train_tfidf = tfidf_Vect.fit_transform(raw_documents=twenty_train.data)

# Baseline model: multinomial naive Bayes trained on the TF-IDF features.
# The fit call stays as the last expression so the cell echoes the estimator.
clf = MultinomialNB()
clf.fit(X=X_train_tfidf, y=twenty_train.target)

MultinomialNB()

In [18]:
# Held-out test split of the 20 Newsgroups corpus, shuffled on load
twenty_test = fetch_20newsgroups(shuffle=True, subset='test')

In [19]:
# Vectorize the test posts with the vectorizer already fitted on the training
# posts (transform only, no re-fitting on test data)
X_test_tfidf = tfidf_Vect.transform(raw_documents=twenty_test.data)

In [20]:
# Predicted newsgroup label for every test post (naive Bayes baseline)
predicted = clf.predict(X=X_test_tfidf)

In [21]:
# Fraction of test posts whose predicted label matches the true label
score = metrics.accuracy_score(y_true=twenty_test.target, y_pred=predicted)
print(score)

0.7738980350504514


Martin Yap
Hoyun Yoon
ICP 7

In [22]:
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

In [23]:
# `vec` is the support vector classifier (despite the name, not a vectorizer);
# `knn` is a 5-nearest-neighbors classifier
vec, knn = svm.SVC(), KNeighborsClassifier(n_neighbors=5)

a) Apply the SVM algorithm to the data

In [24]:
# Train the SVM on the TF-IDF training matrix and its newsgroup labels
# NOTE(review): SVC is used with its default settings here; this cell may be
# slow on the full training matrix, so consider timing it with %%time
vec.fit(X=X_train_tfidf, y=twenty_train.target)

SVC()

In [None]:
# Predicted newsgroup label for every test post, from the trained SVM
vec_pred = vec.predict(X=X_test_tfidf)

In [None]:
# Share of test posts the SVM labels correctly
vec_score = metrics.accuracy_score(y_true=twenty_test.target, y_pred=vec_pred)
print(vec_score)

b) Apply KNN classifier to the data

In [None]:
# Train the KNN classifier on the TF-IDF training matrix and its labels
knn.fit(X=X_train_tfidf, y=twenty_train.target)

In [None]:
# Predicted newsgroup label for every test post, from the trained KNN model
knn_pred = knn.predict(X=X_test_tfidf)

In [None]:
# Share of test posts the KNN model labels correctly
knn_score = metrics.accuracy_score(y_true=twenty_test.target, y_pred=knn_pred)
print(knn_score)

c) Print classification report for each classifier

In [21]:
# Per-class precision, recall, F1 and support for the multinomial naive Bayes
# baseline predictions
print(metrics.classification_report(y_true=twenty_test.target, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.80      0.52      0.63       319
           1       0.81      0.65      0.72       389
           2       0.82      0.65      0.73       394
           3       0.67      0.78      0.72       392
           4       0.86      0.77      0.81       385
           5       0.89      0.75      0.82       395
           6       0.93      0.69      0.80       390
           7       0.85      0.92      0.88       396
           8       0.94      0.93      0.93       398
           9       0.92      0.90      0.91       397
          10       0.89      0.97      0.93       399
          11       0.59      0.97      0.74       396
          12       0.84      0.60      0.70       393
          13       0.92      0.74      0.82       396
          14       0.84      0.89      0.87       394
          15       0.44      0.98      0.61       398
          16       0.64      0.94      0.76       364
          17       0.93    

In [22]:
# Per-class precision, recall, F1 and support for the SVM predictions
print(metrics.classification_report(y_true=twenty_test.target, y_pred=vec_pred))

              precision    recall  f1-score   support

           0       0.83      0.71      0.76       319
           1       0.62      0.82      0.71       389
           2       0.80      0.70      0.75       394
           3       0.73      0.78      0.76       392
           4       0.82      0.83      0.82       385
           5       0.83      0.73      0.77       395
           6       0.73      0.91      0.81       390
           7       0.90      0.87      0.89       396
           8       0.96      0.93      0.95       398
           9       0.88      0.91      0.90       397
          10       0.97      0.91      0.94       399
          11       0.96      0.85      0.90       396
          12       0.65      0.85      0.74       393
          13       0.88      0.78      0.82       396
          14       0.93      0.88      0.90       394
          15       0.79      0.92      0.85       398
          16       0.75      0.88      0.81       364
          17       0.97    

In [23]:
# Per-class precision, recall, F1 and support for the KNN predictions
print(metrics.classification_report(y_true=twenty_test.target, y_pred=knn_pred))

              precision    recall  f1-score   support

           0       0.43      0.76      0.55       319
           1       0.50      0.61      0.55       389
           2       0.56      0.57      0.57       394
           3       0.53      0.58      0.56       392
           4       0.59      0.56      0.57       385
           5       0.69      0.60      0.64       395
           6       0.58      0.45      0.51       390
           7       0.75      0.69      0.72       396
           8       0.84      0.81      0.82       398
           9       0.77      0.72      0.74       397
          10       0.85      0.84      0.84       399
          11       0.76      0.84      0.80       396
          12       0.70      0.50      0.58       393
          13       0.82      0.49      0.62       396
          14       0.79      0.76      0.78       394
          15       0.75      0.76      0.76       398
          16       0.70      0.73      0.72       364
          17       0.62    

Comparing all the classifiers, the SVM model has the highest accuracy at 0.82. The Multinomial Naive Bayes classifier is next at 0.77, followed by the KNN classifier at 0.66. The accuracy of the KNN model could likely be improved by tuning k, the number of nearest neighbors that vote on each prediction (k is a neighbor count, not a number of clusters).  

Looking more closely at the classification reports, the KNN model had low precision and recall, resulting in low F1 scores. The SVM model had relatively high precision and recall, with only a few low recall scores. The Multinomial Naive Bayes model had relatively high precision; however, its recall scores were lower than those of the SVM model, which lowered the F1 scores for some of the classes.

d) Use bigram with the tfidf vectorizer

In [10]:
# Restrict the vectorizer to bigrams only (ngram_range=(2, 2)), then build the
# training and test matrices from that single fitted vectorizer
bi_tfidf_Vect = TfidfVectorizer(ngram_range=(2, 2))
X_train_bi_tfidf = bi_tfidf_Vect.fit_transform(raw_documents=twenty_train.data)
X_test_bi_tfidf = bi_tfidf_Vect.transform(raw_documents=twenty_test.data)

Use the Multinomial Naive Bayes model with the data

In [11]:
# Train the naive Bayes classifier on the bigram TF-IDF features.
# Note: this refits the same `clf` object that was fitted earlier, so after
# this cell `clf` must be given bigram-vectorized input when predicting.
clf.fit(X=X_train_bi_tfidf, y=twenty_train.target)

MultinomialNB()

In [12]:
# Predicted newsgroup label for every test post, using the bigram features
bi_predicted = clf.predict(X=X_test_bi_tfidf)

In [13]:
# Share of test posts labelled correctly by naive Bayes on bigram features
bi_score = metrics.accuracy_score(y_true=twenty_test.target, y_pred=bi_predicted)
print(bi_score)

0.7327403080191184


e) Use stop words with the tfidf vectorizer

In [6]:
# Drop English stop words during vectorization, then build the training and
# test matrices from that single fitted vectorizer
stop_tfidf_Vect = TfidfVectorizer(stop_words='english')
X_train_stop_tfidf = stop_tfidf_Vect.fit_transform(raw_documents=twenty_train.data)
X_test_stop_tfidf = stop_tfidf_Vect.transform(raw_documents=twenty_test.data)

Applying these datasets to the Multinomial Naive Bayes classifier

In [7]:
# Train the naive Bayes classifier on the stop-word-filtered TF-IDF features.
# Note: this refits the same `clf` object and overwrites its previously fitted
# state, so after this cell `clf` must be given stop-word-filtered input.
clf.fit(X=X_train_stop_tfidf, y=twenty_train.target)

MultinomialNB()

In [8]:
# Predicted newsgroup label for every test post, using the stop-word-filtered
# features
stop_predicted = clf.predict(X=X_test_stop_tfidf)

In [9]:
# Share of test posts labelled correctly by naive Bayes once stop words are
# removed
stop_score = metrics.accuracy_score(y_true=twenty_test.target, y_pred=stop_predicted)
print(stop_score)

0.8169144981412639


Comparing the two accuracy scores obtained in parts d and e with the original score of the Multinomial Naive Bayes classifier (about 0.77), removing English stop words produced a higher accuracy (about 0.82), while using only bigrams produced a lower accuracy (about 0.73). Based on these results, we can improve the classifier's performance by removing stop words from the datasets.