In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.stem import PorterStemmer # for stemming
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import gensim
from gensim.models import Word2Vec
import itertools

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Getting Data

In [None]:
!unzip /content/bbc-fulltext.zip -d /content/

In [None]:
DATA_LOC = "/content/bbc"

bbc_data = load_files(DATA_LOC, encoding="utf-8", decode_error="replace")

In [None]:
bbc_data.target

array([0, 4, 2, ..., 1, 1, 3])

In [None]:
bbc_data.target_names

['business', 'entertainment', 'politics', 'sport', 'tech']

In [None]:
len(bbc_data.target)

2225

In [None]:
# getting count of each news category

labels, counts = np.unique(bbc_data.target, return_counts=True)
dict(zip(labels, counts))

{0: 510, 1: 386, 2: 417, 3: 511, 4: 401}

In [None]:
labels

array([0, 1, 2, 3, 4])

In [None]:
# just converting target_names to array for representing labels with names

labels_name = np.array(bbc_data.target_names)[labels]
dict(zip(labels_name, counts))

{'business': 510,
 'entertainment': 386,
 'politics': 417,
 'sport': 511,
 'tech': 401}

In [None]:
# One row per article: its integer category label next to the raw text.
df = pd.DataFrame({'Target': bbc_data.target, 'Data': bbc_data.data})
df.head()

Unnamed: 0,Target,Data
0,0,Tate & Lyle boss bags top award\n\nTate & Lyle...
1,4,Halo 2 sells five million copies\n\nMicrosoft ...
2,2,MSPs hear renewed climate warning\n\nClimate c...
3,3,Pavey focuses on indoor success\n\nJo Pavey wi...
4,2,Tories reject rethink on axed MP\n\nSacked MP ...


In [None]:
def clean(review):
    """Return `review` lower-cased with every non-ASCII-letter character replaced by a space.

    The replacement is one-for-one, so runs of digits, punctuation or whitespace
    become runs of spaces rather than being collapsed.
    """
    letters_only = re.compile('[^a-zA-Z]').sub(' ', review)
    return letters_only.lower()

In [None]:
# `apply` already returns a Series aligned with `df`, so it can be assigned directly;
# wrapping it in a DataFrame and wrapping `clean` in a lambda added nothing.
df['Cleaned'] = df['Data'].apply(clean)
df.head()

Unnamed: 0,Target,Data,Cleaned
0,0,Tate & Lyle boss bags top award\n\nTate & Lyle...,tate lyle boss bags top award tate lyle s...
1,4,Halo 2 sells five million copies\n\nMicrosoft ...,halo sells five million copies microsoft is...
2,2,MSPs hear renewed climate warning\n\nClimate c...,msps hear renewed climate warning climate cha...
3,3,Pavey focuses on indoor success\n\nJo Pavey wi...,pavey focuses on indoor success jo pavey will...
4,2,Tories reject rethink on axed MP\n\nSacked MP ...,tories reject rethink on axed mp sacked mp ho...


In [None]:
df['Stemmed'] = ''

In [None]:
# stemming

stemmer = PorterStemmer()
# Build the stop-word set once; the original rebuilt it for every single token,
# which dominated the runtime of this cell.
stop_words = set(stopwords.words('english'))


def stem_text(text):
    """Tokenize `text`, drop English stop words and return the Porter stems joined by spaces."""
    tokens = nltk.word_tokenize(text)
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


# Assign the whole column at once instead of writing through chained indexing
# (`df['Stemmed'][i] = ...`), which triggers SettingWithCopyWarning and no longer
# updates `df` when pandas copy-on-write is enabled.
df['Stemmed'] = df['Cleaned'].apply(stem_text)

In [None]:
df.head()

Unnamed: 0,Target,Data,Cleaned,Stemmed
0,0,Tate & Lyle boss bags top award\n\nTate & Lyle...,tate lyle boss bags top award tate lyle s...,tate lyle boss bag top award tate lyle chief e...
1,4,Halo 2 sells five million copies\n\nMicrosoft ...,halo sells five million copies microsoft is...,halo sell five million copi microsoft celebr b...
2,2,MSPs hear renewed climate warning\n\nClimate c...,msps hear renewed climate warning climate cha...,msp hear renew climat warn climat chang could ...
3,3,Pavey focuses on indoor success\n\nJo Pavey wi...,pavey focuses on indoor success jo pavey will...,pavey focus indoor success jo pavey miss janua...
4,2,Tories reject rethink on axed MP\n\nSacked MP ...,tories reject rethink on axed mp sacked mp ho...,tori reject rethink axe mp sack mp howard flig...


## Using LSA

In [None]:
# creating the Bag of Words model
# NOTE(review): the vocabulary is fitted on every document, before the train/test split below.

cv = CountVectorizer()
X = cv.fit_transform(df['Stemmed']).toarray()         # fit_transform returns a sparse matrix; .toarray() converts it to a dense array

In [None]:
len(df['Stemmed'][1])

1178

In [None]:
X[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
len(X[0])

18979

In [None]:
# Stratified 70/30 split so each news category keeps its share in both subsets;
# random_state pins the shuffle so the split is repeatable.
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y, random_state=10)

len(X_train), len(y_train), len(X_test), len(y_test)

(1557, 1557, 668, 668)

In [None]:
# for Latent Semantic Analysis

n_topics = 5        # keeping num of topics same as given classes

# The SVD is fitted on the training rows only; the test rows are projected with the same components.
# NOTE: X_train / X_test are overwritten with their n_topics-dimensional projections, so
# re-running this cell on its own would fit the SVD on the already-reduced arrays.
svd_model = TruncatedSVD(n_components = n_topics, algorithm = 'randomized', n_iter=100, random_state = 10)
X_train = svd_model.fit_transform(X_train)
X_test = svd_model.transform(X_test)

In [None]:
X_train

array([[ 5.34119376, -3.06299749,  1.05704974,  1.53824731, -0.8738024 ],
       [ 8.60648667, -2.70881562,  0.40078339, -4.15775011,  0.22156914],
       [11.78379459, -7.93197316,  2.66187969,  3.71396853, -3.42767301],
       ...,
       [ 5.46836396,  0.58466565,  0.14409532, -0.12750989,  1.08462742],
       [ 6.11838976, -3.00891059,  1.40991213,  2.1984599 ,  0.24034305],
       [13.22369161, -9.18999257,  4.4685587 ,  6.6221757 ,  3.45806217]])

In [None]:
svd_model.components_

array([[ 1.05876714e-04,  1.47205138e-03,  2.12754577e-04, ...,
         0.00000000e+00,  1.02515321e-04,  5.23993662e-05],
       [-6.78818173e-05,  3.17840964e-03,  3.54923085e-04, ...,
         0.00000000e+00,  2.43454996e-04,  1.96999849e-04],
       [-1.98280493e-05,  1.25289525e-03, -9.67741708e-05, ...,
         0.00000000e+00,  3.20066902e-05,  1.68647582e-04],
       [-1.03440820e-04,  6.26891233e-04, -5.30422608e-05, ...,
        -0.00000000e+00,  4.29673736e-05, -3.59820177e-05],
       [ 3.27368341e-05, -1.02254181e-03, -2.67708449e-04, ...,
         0.00000000e+00, -1.09872090e-06, -1.81714264e-04]])

In [None]:
# displaying words under each of the 5 topics

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    terms = cv.get_feature_names_out()
else:
    terms = cv.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    # rank the vocabulary by its weight in this component; show just 4 words per topic
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:4]
    print()
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])


Topic 0: 
said
mr
year
would

Topic 1: 
game
best
song
year

Topic 2: 
song
best
year
mr

Topic 3: 
game
mr
play
parti

Topic 4: 
wage
minimum
increas
pay


In [None]:
# Fit a LogisticRegression with default settings on the LSA components of the training
# articles, then predict a category for every held-out article.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [None]:
y_test

845     3
887     2
1621    3
1912    1
1672    0
       ..
997     1
572     0
1487    0
948     1
370     0
Name: Target, Length: 668, dtype: int64

In [None]:
y_pred

array([3, 0, 3, 3, 0, 4, 2, 0, 1, 4, 3, 1, 2, 0, 1, 3, 2, 0, 3, 4, 1, 4,
       1, 0, 3, 0, 0, 0, 3, 2, 1, 2, 4, 0, 3, 2, 4, 2, 4, 4, 3, 4, 3, 3,
       4, 3, 4, 0, 1, 0, 1, 0, 3, 3, 1, 3, 0, 2, 0, 2, 0, 2, 4, 2, 2, 0,
       3, 0, 2, 4, 0, 0, 0, 1, 0, 4, 3, 3, 1, 0, 3, 1, 0, 0, 1, 0, 3, 2,
       3, 1, 3, 3, 0, 1, 3, 4, 2, 3, 0, 1, 3, 4, 4, 0, 0, 3, 4, 3, 2, 3,
       1, 1, 2, 0, 4, 2, 2, 2, 2, 3, 1, 0, 0, 4, 3, 0, 1, 2, 2, 2, 1, 3,
       3, 4, 0, 2, 2, 0, 4, 2, 2, 2, 0, 3, 4, 4, 3, 3, 3, 1, 0, 0, 0, 3,
       4, 0, 0, 3, 3, 2, 0, 1, 2, 4, 4, 0, 4, 1, 0, 0, 3, 3, 1, 3, 3, 2,
       2, 3, 4, 2, 3, 3, 2, 2, 1, 3, 2, 4, 4, 1, 3, 0, 3, 2, 4, 0, 3, 0,
       3, 2, 0, 4, 0, 4, 2, 0, 4, 0, 1, 4, 2, 3, 2, 3, 1, 1, 2, 3, 1, 4,
       3, 1, 2, 4, 4, 2, 3, 1, 4, 3, 3, 1, 3, 3, 2, 4, 2, 0, 3, 3, 0, 4,
       0, 4, 4, 3, 3, 0, 0, 4, 0, 3, 2, 1, 0, 2, 2, 4, 0, 0, 3, 4, 4, 1,
       0, 1, 4, 4, 4, 3, 0, 4, 0, 1, 4, 1, 1, 1, 0, 0, 0, 1, 0, 4, 2, 0,
       1, 0, 2, 1, 0, 4, 1, 4, 4, 1, 2, 1, 4, 1, 2,

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[128   2  14   3   6]
 [  0  96   2  15   3]
 [ 21   0  96   8   0]
 [  1   7   2 140   4]
 [  6   2   5   6 101]]


In [None]:
print('Accuracy: ', round(accuracy_score(y_test, y_pred)*100,2),'%')

Accuracy:  83.98 %


## Using Word2Vec

In [None]:
df.head()

Unnamed: 0,Target,Data,Cleaned,Stemmed
0,0,Tate & Lyle boss bags top award\n\nTate & Lyle...,tate lyle boss bags top award tate lyle s...,tate lyle boss bag top award tate lyle chief e...
1,4,Halo 2 sells five million copies\n\nMicrosoft ...,halo sells five million copies microsoft is...,halo sell five million copi microsoft celebr b...
2,2,MSPs hear renewed climate warning\n\nClimate c...,msps hear renewed climate warning climate cha...,msp hear renew climat warn climat chang could ...
3,3,Pavey focuses on indoor success\n\nJo Pavey wi...,pavey focuses on indoor success jo pavey will...,pavey focus indoor success jo pavey miss janua...
4,2,Tories reject rethink on axed MP\n\nSacked MP ...,tories reject rethink on axed mp sacked mp ho...,tori reject rethink axe mp sack mp howard flig...


In [None]:
df['Tokenized'] = df['Stemmed'].apply(nltk.word_tokenize)     # generating tokens
df.head()

Unnamed: 0,Target,Data,Cleaned,Stemmed,Tokenized
0,0,Tate & Lyle boss bags top award\n\nTate & Lyle...,tate lyle boss bags top award tate lyle s...,tate lyle boss bag top award tate lyle chief e...,"[tate, lyle, boss, bag, top, award, tate, lyle..."
1,4,Halo 2 sells five million copies\n\nMicrosoft ...,halo sells five million copies microsoft is...,halo sell five million copi microsoft celebr b...,"[halo, sell, five, million, copi, microsoft, c..."
2,2,MSPs hear renewed climate warning\n\nClimate c...,msps hear renewed climate warning climate cha...,msp hear renew climat warn climat chang could ...,"[msp, hear, renew, climat, warn, climat, chang..."
3,3,Pavey focuses on indoor success\n\nJo Pavey wi...,pavey focuses on indoor success jo pavey will...,pavey focus indoor success jo pavey miss janua...,"[pavey, focus, indoor, success, jo, pavey, mis..."
4,2,Tories reject rethink on axed MP\n\nSacked MP ...,tories reject rethink on axed mp sacked mp ho...,tori reject rethink axe mp sack mp howard flig...,"[tori, reject, rethink, axe, mp, sack, mp, how..."


In [None]:
# Same split parameters (test_size, stratify, random_state) as in the LSA section, applied to the token lists.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the LSA arrays defined earlier.
X_train, X_test, y_train, y_test = train_test_split(df['Tokenized'], df['Target'], test_size=0.3, shuffle=True, stratify=df['Target'], random_state=10)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1557,), (668,), (1557,), (668,))

In [None]:
X_train

1188    [brown, visit, slum, africa, trip, chancellor,...
2081    [yuko, lose, us, bankruptci, battl, judg, dism...
1214    [campbel, e, mail, row, silli, fuss, ex, media...
917     [secur, paper, found, street, inquiri, way, fi...
1944    [ultra, fast, wi, fi, near, complet, ultra, hi...
                              ...                        
2165    [kennedi, look, elect, gain, may, know, quit, ...
524     [dvd, copi, protect, strengthen, dvd, harder, ...
2043    [japanes, mogul, arrest, fraud, one, japan, be...
1467    [kennedi, make, templ, address, charl, kennedi...
704     [tori, tax, cut, lift, spirit, michael, howard...
Name: Tokenized, Length: 1557, dtype: object

In [None]:
# min_count = 1 -> keep every word that appears at least once
# size = 500 -> each word is embedded as a 500-dimensional vector
#               (gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`)
# window = 5 -> max distance between current and predicted word within a sentence
# sg = 0 -> CBOW (the default, so it is not passed explicitly)
# seed = 10 -> same seed as the train/test split; gensim only guarantees fully repeatable
#              vectors when workers=1 and PYTHONHASHSEED are fixed as well
#
# The embeddings are learned from the training articles only, so the held-out test
# articles cannot influence the features the classifier is later evaluated on.

wv_model = Word2Vec(X_train, min_count = 1, size = 500, window = 5, seed = 10)

In [None]:
print(wv_model)       # word2vec model

Word2Vec(vocab=18997, size=500, alpha=0.025)


In [None]:
words = list(wv_model.wv.vocab)     # every word in the model's vocabulary
print(words[:50])                   # preview only: printing the full vocabulary floods the notebook output

['tate', 'lyle', 'boss', 'bag', 'top', 'award', 'chief', 'execut', 'name', 'european', 'businessman', 'year', 'lead', 'busi', 'magazin', 'iain', 'ferguson', 'titl', 'us', 'public', 'forb', 'return', 'one', 'uk', 'vener', 'manufactur', 'countri', 'compani', 'sugar', 'group', 'absent', 'ftse', 'seven', 'mr', 'help', 'growth', 'share', 'leapt', 'boost', 'firm', 'price', 'sale', 'artifici', 'sweeten', 'sag', 'stock', 'hiatu', 'britain', 'vaunt', 'index', 'said', 'took', 'helm', 'spend', 'career', 'consum', 'good', 'giant', 'unilev', 'origin', 'member', 'histor', 'ft', 'oper', 'factori', 'addit', 'product', 'facil', 'previou', 'winner', 'includ', 'royal', 'bank', 'scotland', 'fred', 'goodwin', 'former', 'vodafon', 'chri', 'gent', 'halo', 'sell', 'five', 'million', 'copi', 'microsoft', 'celebr', 'bumper', 'xbox', 'sci', 'fi', 'shooter', 'game', 'sold', 'worldwid', 'sinc', 'went', 'mid', 'novemb', 'prove', 'popular', 'onlin', 'gamer', 'notch', 'record', 'hour', 'play', 'live', 'accord', 'nine

In [None]:
# Look the word up through the KeyedVectors (`.wv`); indexing the model object itself
# is deprecated in gensim 3.x and removed in gensim 4.
print(wv_model.wv['award'])            # showing vector result of 'award'

[ 0.07503908 -0.32147467 -0.21514909  0.31517604 -0.21327786  0.0505368
  0.06681369 -0.03563528  0.88697267  0.13598992  0.5502062  -0.34088048
  0.01955201  0.3254728   0.20980768 -0.30755532  0.1071178   0.9841755
 -0.19690432  0.24618976  0.5871445  -0.13746668 -0.286835    0.44310343
  0.22580999  0.39978978  0.6766553  -0.13824409  0.5631352   0.4170839
  0.5328391   0.45542753  0.28664547 -0.32548437  0.02594325  0.08204161
  0.63135135  0.19376266 -0.3785987  -0.28269318  0.29659927 -0.22493699
 -0.01731181 -0.10122898  0.38008156 -0.30703422 -0.4660031  -0.196941
  0.0736649   0.20987345  0.2232421   0.13540857  0.6600943  -0.20682253
 -0.07864965 -0.3474183  -0.68735    -0.22489533  0.2796611  -0.41037703
  0.36923993 -0.36851984  0.42692536  0.2604507  -0.21524638  0.3341367
 -0.61734545  0.40379316 -0.1160107   0.27691662 -0.71064055  0.45351267
  0.3055435  -0.2674344  -1.042059    0.22741526  0.55077577  0.14797533
  0.37382826  0.0510541  -0.3991457  -0.2646165  -0.55810

In [None]:
# Mean over the components of the single 'award' vector (a scalar), shown only to illustrate np.mean.
# The document features built later average across the words of an article (axis=0) instead,
# giving one vector per article.
np.mean(wv_model.wv['award'])

0.018519623

In [None]:
# Query via `.wv`: the model-level most_similar() is a deprecated alias removed in gensim 4.
wv_model.wv.most_similar('supermarket')      # finding most similar words to 'supermarket'

[('driver', 0.9978682994842529),
 ('vehicl', 0.9977596402168274),
 ('plung', 0.9976966381072998),
 ('mainli', 0.997539758682251),
 ('appetit', 0.9974253177642822),
 ('cement', 0.9972644448280334),
 ('region', 0.9972152709960938),
 ('bubbl', 0.9970439672470093),
 ('petrochem', 0.996979832649231),
 ('arabia', 0.9969272613525391)]

In [None]:
# Query via `.wv`: the model-level similar_by_word() is a deprecated alias removed in gensim 4.
wv_model.wv.similar_by_word('award')

[('guild', 0.9899619817733765),
 ('oscar', 0.9866883158683777),
 ('nomin', 0.9856992959976196),
 ('star', 0.9826651811599731),
 ('serna', 0.9819879531860352),
 ('best', 0.9801581501960754),
 ('theatrego', 0.9767152070999146),
 ('actor', 0.9754496812820435),
 ('patsi', 0.9746658802032471),
 ('prize', 0.9722363948822021)]

In [None]:
# Query via `.wv`: the model-level similarity() is a deprecated alias removed in gensim 4.
wv_model.wv.similarity('supermarket','award')      # cosine similarity between 'award' and 'supermarket'

0.56501424

In [None]:
len(wv_model.wv.index2word)

18997

In [None]:
wv_model.wv.index2word[0:5]         # contains all words whose embedding has been done

['said', 'year', 'mr', 'would', 'also']

In [None]:
# `vectors` is the current name of the embedding matrix; `syn0` is its deprecated alias.
wv_model.wv.vectors[0:5]               # embedding rows of the first 5 words in index2word

array([[ 0.07112597,  0.4339686 , -0.26686496, ...,  0.06399513,
         0.12586893,  0.19365853],
       [-0.23375562, -0.45294818, -0.17165804, ..., -0.02260541,
        -0.29067567,  0.9794277 ],
       [ 0.4947634 ,  0.56376994, -0.14264657, ...,  0.06022156,
         0.2220228 ,  0.30649102],
       [-0.06140633,  0.4722354 , -0.00916265, ...,  0.01068647,
        -0.05040403,  0.22962937],
       [ 0.01256272,  0.26805693, -0.2265621 , ...,  0.11942334,
         0.01239043,  0.17052233]], dtype=float32)

In [None]:
# word -> embedding lookup; row i of `vectors` belongs to index2word[i]
# (`vectors` replaces the deprecated `syn0` alias)
vectorized = dict(zip(wv_model.wv.index2word, wv_model.wv.vectors))

In [None]:
dict(itertools.islice(vectorized.items(), 2))       # showing first 2 items from dictionary

{'said': array([ 0.07112597,  0.4339686 , -0.26686496,  0.15510337, -0.30903938,
        -0.08031351, -0.4680401 ,  0.00929801,  0.22784938, -0.28907296,
        -0.06546865,  0.10212998,  0.0852081 ,  0.19408864,  0.2758271 ,
         0.3794512 , -0.01959948, -0.01365622, -0.31975102,  0.30496624,
         0.43598786,  0.33860323,  0.22961085,  0.2208227 ,  0.230135  ,
         0.17108983, -0.27428737, -0.1697538 ,  0.12251393,  0.02811553,
         0.5636225 , -0.08428622, -0.18016179,  0.03086792,  0.2405307 ,
         0.04243381, -0.27726218,  0.6380574 ,  0.04940381, -0.08734284,
         0.1168775 ,  0.06064125, -0.38898298, -0.16034219,  0.2691019 ,
        -0.13112718,  0.13625784, -0.1036145 ,  0.28217494, -0.49584588,
         0.6961837 ,  0.4189717 ,  0.15572195,  0.02929845, -0.11712011,
         0.07411481, -0.46422285, -0.12496691,  0.03390019, -0.15995508,
         0.27104992,  0.03025885,  0.0020361 ,  0.25401008, -0.24413104,
        -0.10402504,  0.11544683, -0.009784

In [None]:
# Length of one word embedding. The original took len(vectorized.values()), which is the
# number of words in the vocabulary rather than the dimensionality of a vector, so the
# zero-vector fallback built from it had the wrong size.
veclen = wv_model.wv.vector_size
veclen

18997

In [None]:
np.zeros(veclen)        # generating vector of 0s -  - this concept will be used for training model

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
len(np.zeros(veclen))

18997

In [None]:
# Turn every training article into one feature vector: the element-wise mean (axis=0) of the
# embeddings of its in-vocabulary tokens. An article without any known token falls back to an
# all-zero vector of the embedding dimension, so every row has the same length and a numeric
# dtype (the original fallback used the vocabulary size and dtype=object).

embedding_dim = wv_model.wv.vector_size
X_train1 = [
    np.mean([vectorized[w] for w in tokens if w in vectorized] or [np.zeros(embedding_dim, dtype=np.float32)], axis=0)
    for tokens in X_train
]

In [None]:
X_train1[0:2]

[array([ 2.90502310e-02,  2.43937492e-01, -1.22020431e-01,  3.95096764e-02,
        -2.11627811e-01, -8.81321654e-02, -4.15482521e-01, -1.33286342e-01,
         3.19586843e-01, -2.71463931e-01,  7.88782686e-02, -6.39215633e-02,
         2.65201256e-02,  1.82513490e-01,  1.88873723e-01,  1.83322921e-01,
         7.01930225e-02,  1.10841185e-01, -1.73782587e-01,  2.80567169e-01,
         3.29006016e-01,  1.07526086e-01,  1.34195119e-01,  1.05937794e-01,
         1.42471254e-01,  1.13040939e-01, -7.30301216e-02, -1.62150756e-01,
         3.12071177e-03,  3.65713462e-02,  3.74599069e-01,  9.38910022e-02,
        -1.18822426e-01, -5.29368035e-02,  1.15344696e-01,  4.58975583e-02,
        -1.88763902e-01,  3.71275544e-01,  1.48655893e-02, -7.38357306e-02,
         2.30919421e-01, -2.03682925e-03, -2.67561704e-01, -1.31778851e-01,
         2.44455621e-01, -1.33829638e-01,  3.06663644e-02, -1.51587099e-01,
         2.86414683e-01, -1.97897300e-01,  4.52413589e-01,  3.95262241e-01,
         1.4

In [None]:
X_train1 = np.array(X_train1)

In [None]:
X_train1

array([[ 0.02905023,  0.24393749, -0.12202043, ...,  0.0690724 ,
         0.01712977,  0.25508857],
       [-0.00045247,  0.25899416, -0.15877154, ...,  0.09535749,
        -0.00203753,  0.24120323],
       [ 0.05558576,  0.23715794, -0.13275072, ...,  0.0588485 ,
         0.02825334,  0.22477861],
       ...,
       [ 0.00206455,  0.1839243 , -0.13404533, ...,  0.0802913 ,
        -0.01496911,  0.22712563],
       [ 0.04114329,  0.26624057, -0.11729035, ...,  0.06257403,
         0.02525073,  0.22398902],
       [ 0.0051788 ,  0.3459836 , -0.12293278, ...,  0.0697543 ,
         0.01562149,  0.27420023]], dtype=float32)

In [None]:
len(X_train1)

1557

In [None]:
len(X_train1[0])

500

In [None]:
# Same transformation for the test set: mean embedding of the in-vocabulary tokens of each
# article, with an all-zero vector of the embedding dimension when no token is known
# (the original fallback used the vocabulary size and dtype=object).

embedding_dim = wv_model.wv.vector_size
X_test1 = [
    np.mean([vectorized[w] for w in tokens if w in vectorized] or [np.zeros(embedding_dim, dtype=np.float32)], axis=0)
    for tokens in X_test
]

In [None]:
X_test1 = np.array(X_test1)

In [None]:
len(X_test1)

668

In [None]:
classifier = LogisticRegression()
classifier.fit(X_train1, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = classifier.predict(X_test1)

In [None]:
y_test

845     3
887     2
1621    3
1912    1
1672    0
       ..
997     1
572     0
1487    0
948     1
370     0
Name: Target, Length: 668, dtype: int64

In [None]:
y_pred

array([3, 0, 3, 1, 0, 4, 0, 0, 1, 0, 3, 1, 2, 0, 1, 3, 2, 0, 1, 4, 1, 4,
       1, 0, 0, 0, 0, 2, 3, 2, 1, 2, 4, 0, 3, 2, 4, 0, 4, 4, 3, 4, 3, 3,
       4, 1, 4, 0, 4, 0, 3, 0, 3, 3, 1, 3, 0, 4, 0, 2, 0, 1, 4, 2, 0, 0,
       3, 0, 2, 3, 0, 0, 0, 1, 0, 4, 3, 3, 1, 0, 2, 1, 0, 0, 4, 2, 3, 2,
       3, 4, 3, 3, 0, 1, 3, 4, 2, 3, 0, 1, 3, 4, 4, 0, 2, 3, 4, 1, 2, 3,
       1, 1, 2, 2, 4, 2, 2, 2, 2, 3, 1, 2, 0, 4, 3, 0, 1, 2, 2, 2, 4, 3,
       3, 4, 2, 2, 2, 0, 4, 4, 0, 2, 0, 1, 4, 4, 3, 2, 3, 1, 0, 0, 0, 3,
       4, 4, 0, 3, 3, 0, 0, 1, 2, 3, 4, 0, 4, 1, 0, 0, 3, 1, 1, 3, 3, 2,
       2, 3, 4, 0, 3, 3, 2, 2, 1, 3, 2, 4, 4, 3, 1, 0, 3, 2, 4, 0, 1, 0,
       3, 2, 0, 4, 0, 4, 0, 2, 4, 0, 1, 4, 2, 3, 2, 3, 1, 1, 2, 3, 4, 4,
       3, 1, 2, 4, 4, 2, 3, 0, 4, 3, 3, 1, 1, 3, 2, 4, 2, 0, 3, 3, 0, 3,
       0, 1, 4, 3, 3, 0, 0, 4, 0, 3, 2, 1, 0, 2, 2, 4, 2, 0, 3, 1, 4, 1,
       0, 1, 4, 4, 4, 3, 4, 4, 0, 1, 4, 3, 1, 1, 0, 0, 0, 1, 0, 4, 2, 0,
       4, 0, 4, 1, 0, 4, 1, 4, 4, 0, 2, 1, 4, 1, 2,

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[141   1   6   0   5]
 [  2 103   2   3   6]
 [  9   3 110   2   1]
 [  0   2   0 152   0]
 [  2   5   2   2 109]]


In [None]:
print('Accuracy: ', round(accuracy_score(y_test, y_pred)*100,2),'%')

Accuracy:  92.07 %


With LSA features we obtained an accuracy of 83.98%; with Word2Vec features we obtained 92.07%. Both models used Logistic Regression as the classifier.