# Question 8
Multiclass Classification

In [1]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)
from sklearn import datasets
import matplotlib.pyplot as plt
from nltk import WordNetLemmatizer
from nltk import pos_tag
import nltk
from nltk.corpus import wordnet
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#implement new dataset
categories_computer_sys_ibm=['comp.sys.ibm.pc.hardware']
categories_computer_sys_mac=['comp.sys.mac.hardware']
categories_misc_forsale=['misc.forsale']
categories_soc_re_chris=['soc.religion.christian']
computer_sys_ibm_train=datasets.fetch_20newsgroups(subset = 'train', categories = categories_computer_sys_ibm, shuffle = True, random_state = None).data
computer_sys_ibm_test=datasets.fetch_20newsgroups(subset = 'test', categories = categories_computer_sys_ibm, shuffle = True, random_state = None).data
computer_sys_mac_train=datasets.fetch_20newsgroups(subset = 'train', categories = categories_computer_sys_mac, shuffle = True, random_state = None).data
computer_sys_mac_test=datasets.fetch_20newsgroups(subset = 'test', categories = categories_computer_sys_mac, shuffle = True, random_state = None).data
misc_forsale_train=datasets.fetch_20newsgroups(subset = 'train', categories = categories_misc_forsale, shuffle = True, random_state = None).data
misc_forsale_test=datasets.fetch_20newsgroups(subset = 'test', categories = categories_misc_forsale, shuffle = True, random_state = None).data
soc_re_chris_train=datasets.fetch_20newsgroups(subset = 'train', categories = categories_soc_re_chris, shuffle = True, random_state = None).data
soc_re_chris_test=datasets.fetch_20newsgroups(subset = 'test', categories = categories_soc_re_chris, shuffle = True, random_state = None).data

In [3]:
X_train=computer_sys_ibm_train+computer_sys_mac_train+misc_forsale_train+soc_re_chris_train
X_test=computer_sys_ibm_test+computer_sys_mac_test+misc_forsale_test+soc_re_chris_test
X_overall=X_train+X_test
Y_train=[0]*len(computer_sys_ibm_train)+[1]*len(computer_sys_mac_train)+[2]*len(misc_forsale_train)+[3]*len(soc_re_chris_train)
Y_test=[0]*len(computer_sys_ibm_test)+[1]*len(computer_sys_mac_test)+[2]*len(misc_forsale_test)+[3]*len(soc_re_chris_test)
print(np.shape(X_overall))

(3917,)


In [4]:
#Lemmatization
lemmatizer=WordNetLemmatizer()
#Define lemmatizer pos_tag to deal with adj, verb, noun and adv separately
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

#Preprocessing: exclude some symbols and numbers in each sentence
exclude_sign="~#$%^&*(){}[]<>|+=1234567890"
replace_sign="@,.?!-;"
def preprocessing(data):
    processed_sentence=[]
    for i in range(len(data)):
        sentence=data[i]
        for c in exclude_sign:
            sentence=sentence.replace(c,"")
        for c in replace_sign:
            sentence=sentence.replace(c," ")
        processed_token=[lemmatizer.lemmatize(w,get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)]
        processed_sentence.append(" ".join(processed_token))
    return processed_sentence

#operation of lemmaization
processed_data=preprocessing(X_overall)


In [5]:
#Vectorization
vectorizer=CountVectorizer(stop_words='english',min_df=3)
data_vec=vectorizer.fit_transform(processed_data)

#TdIdf
from sklearn.feature_extraction.text import TfidfTransformer
TdT=TfidfTransformer()
data_vec_ti=TdT.fit_transform(data_vec)

#LSI
from sklearn.decomposition import TruncatedSVD
transformer=TruncatedSVD(n_components=50)
data_svd=transformer.fit_transform(data_vec_ti)
print(np.shape(data_svd))

#Divide train rows and test rows
X_train_tf=data_svd[0:len(X_train)]
print(np.shape(X_train_tf))
X_test_tf=data_svd[len(X_train):]
print(np.shape(X_test_tf))




(3917, 50)
(2352, 50)
(1565, 50)


# Naive Bayes (GaussianNB)

### One vs One Classifier

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.multiclass import OneVsOneClassifier
def GaussiainNB_ovo (X_train,Y_train, X_test, Y_test):
    ono_classifier=OneVsOneClassifier(GaussianNB())
    ono_classifier.fit(X_train, Y_train)
    predict_target=ono_classifier.predict(X_test)
    confusion_matrix=metrics.confusion_matrix(Y_test, predict_target)
    accuracy=metrics.accuracy_score(Y_test, predict_target)
    recall=metrics.recall_score(Y_test, predict_target,average = None)
    precision=metrics.precision_score(Y_test, predict_target,average = None)
    f1_score=metrics.f1_score(Y_test, predict_target,average = None)
    print('Naive Bayes---One vs One \n')
    print(confusion_matrix)
    print("accuracy: {}\nrecall: {}\nprecision: {}\nf1: {}".format(accuracy, recall, precision, f1_score))


In [7]:
GaussiainNB_ovo(X_train_tf,Y_train, X_test_tf, Y_test)

Naive Bayes---One vs One 

[[247  58  70  17]
 [ 72 248  57   8]
 [ 56  40 282  12]
 [  2   2   3 391]]
accuracy: 0.7463258785942491
recall: [0.63010204 0.64415584 0.72307692 0.98241206]
precision: [0.65517241 0.71264368 0.68446602 0.9135514 ]
f1: [0.64239272 0.67667121 0.7032419  0.94673123]


### One vs Rest Classifier

In [8]:
from sklearn.multiclass import OneVsRestClassifier
def GaussiainNB_ovr (X_train,Y_train, X_test, Y_test):
    onr_classifier=OneVsRestClassifier(GaussianNB())
    onr_classifier.fit(X_train, Y_train)
    predict_target=onr_classifier.predict(X_test)
    confusion_matrix=metrics.confusion_matrix(Y_test, predict_target)
    accuracy=metrics.accuracy_score(Y_test, predict_target)
    recall=metrics.recall_score(Y_test, predict_target,average = None)
    precision=metrics.precision_score(Y_test, predict_target,average = None)
    f1_score=metrics.f1_score(Y_test, predict_target,average = None)
    print('Naive Bayes---One vs Rest \n')
    print(confusion_matrix)
    print("accuracy: {}\nrecall: {}\nprecision: {}\nf1: {}".format(accuracy, recall, precision, f1_score))


In [9]:
GaussiainNB_ovr(X_train_tf,Y_train, X_test_tf, Y_test)

Naive Bayes---One vs Rest 

[[246  62  74  10]
 [ 66 250  62   7]
 [ 55  40 285  10]
 [  0   2   5 391]]
accuracy: 0.7488817891373802
recall: [0.62755102 0.64935065 0.73076923 0.98241206]
precision: [0.67029973 0.70621469 0.66901408 0.9354067 ]
f1: [0.64822134 0.67658999 0.69852941 0.95833333]


In [24]:
#predict_target= naive_Gaussian_model(X_train_tf, Y_train, X_test_tf, Y_test)
#evaluation(predict_target, Y_test)

Confusion matrix is: 
[[247  58  70  17]
 [ 72 248  57   8]
 [ 56  40 282  12]
 [  2   2   3 391]]
Accuracy: 0.7463258785942491
Recall: [0.63010204 0.64415584 0.72307692 0.98241206]
Precision: [0.65517241 0.71264368 0.68446602 0.9135514 ]
f1: [0.64239272 0.67667121 0.7032419  0.94673123]


# Multiclass SVM

### One vs One Classifier

In [12]:
from sklearn import svm
def SVM_ovo (X_train,Y_train, X_test, Y_test):
    ono_classifier=OneVsOneClassifier(svm.SVC(C=1000,gamma='auto'))
    ono_classifier.fit(X_train, Y_train)
    predict_target=ono_classifier.predict(X_test)
    confusion_matrix=metrics.confusion_matrix(Y_test, predict_target)
    accuracy=metrics.accuracy_score(Y_test, predict_target)
    recall=metrics.recall_score(Y_test, predict_target,average = None)
    precision=metrics.precision_score(Y_test, predict_target,average = None)
    f1_score=metrics.f1_score(Y_test, predict_target,average = None)
    print('SVM---One vs One \n')
    print(confusion_matrix)
    print("accuracy: {}\nrecall: {}\nprecision: {}\nf1: {}".format(accuracy, recall, precision, f1_score))

In [13]:
SVM_ovo(X_train_tf,Y_train, X_test_tf, Y_test)

SVM---One vs One 

[[331  35  26   0]
 [ 33 331  21   0]
 [ 30  21 336   3]
 [  3   0   2 393]]
accuracy: 0.888817891373802
recall: [0.84438776 0.85974026 0.86153846 0.98743719]
precision: [0.83375315 0.85529716 0.87272727 0.99242424]
f1: [0.83903676 0.85751295 0.86709677 0.98992443]


### One vs Rest Classifier

In [18]:
def SVM_ovr (X_train,Y_train, X_test, Y_test):
    onr_classifier=OneVsRestClassifier(svm.SVC(C=1000,gamma='auto'))
    onr_classifier.fit(X_train, Y_train)
    predict_target=onr_classifier.predict(X_test)
    confusion_matrix=metrics.confusion_matrix(Y_test, predict_target)
    accuracy=metrics.accuracy_score(Y_test, predict_target)
    recall=metrics.recall_score(Y_test, predict_target,average = None)
    precision=metrics.precision_score(Y_test, predict_target,average = None)
    f1_score=metrics.f1_score(Y_test, predict_target,average = None)
    print('SVM---One vs Rest \n')
    print(confusion_matrix)
    print("accuracy: {}\nrecall: {}\nprecision: {}\nf1: {}".format(accuracy, recall, precision, f1_score))

In [19]:
SVM_ovr(X_train_tf,Y_train, X_test_tf, Y_test)

SVM---One vs Rest 

[[333  37  20   2]
 [ 36 329  20   0]
 [ 26  23 337   4]
 [  5   1   0 392]]
accuracy: 0.888817891373802
recall: [0.8494898  0.85454545 0.86410256 0.98492462]
precision: [0.8325     0.84358974 0.8938992  0.98492462]
f1: [0.84090909 0.84903226 0.87874837 0.98492462]
