<h1 style="color:gray;font-family:segoe ui">P6 : NEWS GROUP TEXT CLASSIFIER</h1>
<hr>

In [489]:
import numpy as np
import pandas as pd
import warnings
import re
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 800)
import random, os, nltk, unidecode

<h1 style="color:gray;font-family:segoe ui">List all the labels we are working with.</h1>

In [126]:
# Class labels are the sub-directory names under the dataset root.
# os.listdir returns entries in arbitrary order, so sort them: the position of a
# label in this list is used as its class index further down the notebook.
labels = sorted(os.listdir('NEWS/20_newsgroups/'))
labels

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

<h1 style="color:gray;font-family:segoe ui">Let's shuffle the files and split them into Train and Test sets.</h1>

In [174]:
# Split every newsgroup into a fixed number of training files; the rest are test files.
# The four lists are parallel: entry k of a *_labels list is the newsgroup
# (directory name) of entry k in the matching *_filenames list, and the files of
# one newsgroup are stored contiguously, in `labels` order.
TRAIN_FILES_PER_CLASS = 600  # later cells slice the training matrix in blocks of 600 rows
SPLIT_SEED = 42              # fixed seed so the split is reproducible across runs

split_rng = random.Random(SPLIT_SEED)
X_train_filenames = []
X_test_filenames = []
Y_train_labels = []
Y_test_labels = []
for label in labels:
    # os.listdir order is arbitrary; sort first so the seeded shuffle is deterministic.
    class_files = sorted(os.listdir('NEWS/20_newsgroups/'+label))
    split_rng.shuffle(class_files)
    train_files = class_files[:TRAIN_FILES_PER_CLASS]
    test_files = class_files[TRAIN_FILES_PER_CLASS:]
    X_train_filenames.extend(train_files)
    X_test_filenames.extend(test_files)
    Y_train_labels.extend([label] * len(train_files))
    Y_test_labels.extend([label] * len(test_files))

<h1 style="color:gray;font-family:segoe ui">Pre-processing</h1>
<ul style="color:gray;font-family:segoe ui">
<li>Given the text corpus, we remove any accent, e.g. José to Jose, etc.</li>
<li>Remove any punctuation.</li>
<li>Convert to lowercase</li>
<li>Tokenize using NLTK tokenizer</li>
<li>Remove punctuation from Stopwords, since input is also punctuation free. Remove stopwords and non-alpha tokens from the input.</li>
<li>Use lemmatizer for reducing tokens to base form.</li>
</ul>

In [313]:
def remove_accent(doc):
    """Return ``doc`` transliterated by ``unidecode`` (e.g. "José" -> "Jose")."""
    ascii_doc = unidecode.unidecode(doc)
    return ascii_doc

def remove_punct(doc):
    """Replace every run of characters other than ASCII letters, digits and
    apostrophes with a single space."""
    non_word_run = re.compile(r"[^a-zA-Z0-9\']+")
    return non_word_run.sub(' ', doc)

def to_lowercase(doc):
    """Lower-case ``doc`` piece by piece.

    The text is split on single spaces; each piece is stripped of surrounding
    whitespace and lower-cased, and the pieces are re-joined with single spaces.
    """
    return " ".join(piece.strip().lower() for piece in doc.split(" "))

def tokenize(doc):
    """Split ``doc`` into tokens using NLTK's ``TreebankWordTokenizer``."""
    treebank = nltk.tokenize.TreebankWordTokenizer()
    return treebank.tokenize(doc)

def remove_stopwords_and_numbers(token_arr):
    """Drop English stopwords and non-alphabetic tokens from ``token_arr``.

    The NLTK English stopword list is passed through ``remove_punct`` first so
    that it is normalized the same way as the document text.

    Args:
        token_arr: iterable of string tokens.

    Returns:
        list of the tokens, in their original order, that are not stopwords
        and for which ``str.isalpha`` is true.
    """
    # A set gives constant-time membership tests; with a list every token
    # lookup is a linear scan over the whole stopword list.
    stop_set = {remove_punct(word) for word in stopwords.words('english')}
    return [token for token in token_arr
            if token not in stop_set and str.isalpha(token)]

def lemmatize_words(arr):
    """Lazily lemmatize tokens, keeping only lemmas longer than two characters.

    Args:
        arr: iterable of string tokens.

    Returns:
        A generator yielding the WordNet lemma of each token whose lemma has
        more than two characters. Nothing is computed until it is iterated.
    """
    lemma = nltk.wordnet.WordNetLemmatizer()
    # Lemmatize each token exactly once; calling lemmatize() separately for the
    # length filter and for the yielded value doubles the work.
    return (base for base in map(lemma.lemmatize, arr) if len(base) > 2)

In [314]:
def routine(doc):
    """Run the full pre-processing pipeline on one raw document.

    Steps, in order: accent removal, punctuation removal, lower-casing,
    tokenization, stopword / non-alphabetic filtering, lemmatization.
    Returns whatever ``lemmatize_words`` returns for the filtered tokens.
    """
    cleaned = to_lowercase(remove_punct(remove_accent(doc)))
    kept_tokens = remove_stopwords_and_numbers(tokenize(cleaned))
    return lemmatize_words(kept_tokens)

In [315]:
from sklearn.feature_extraction.text import CountVectorizer

In [316]:
# Sanity check: total number of training files collected across all classes.
len(X_train_filenames)

12000

<h1 style="color:gray;font-family:segoe ui"><i>corpus</i> holds the pre-processed text of every training document, one string per document.</h1>

In [317]:
# Build the training corpus: one pre-processed, space-joined string per training file.
# NOTE(review): no explicit encoding is passed to open(), so decoding relies on the
# platform default — confirm this is intended for the raw newsgroup files.
corpus = []
for i, (newsgroup, filename) in enumerate(zip(Y_train_labels, X_train_filenames)):
    # The `with` block closes the file on exit, so no explicit close() is needed,
    # and the file is released before the (slower) pre-processing starts.
    with open(f'./NEWS/20_newsgroups/{newsgroup}/{filename}','r') as f:
        text = " ".join(f.readlines())
    corpus.append(" ".join(routine(text)))
    # Progress marker every 1000 documents.
    if i%1000 == 0:
        print(i,end=" ")

0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 

In [320]:
# Learn the vocabulary from the pre-processed training corpus and build the
# document-term count matrix (one row per document, one column per token).
vectorizer = CountVectorizer()
count_vectors = vectorizer.fit_transform(corpus)

In [328]:
# Vocabulary size. vocabulary_ maps every learned token to its column index, so its
# length equals the number of features; get_feature_names() is avoided because it
# was deprecated in scikit-learn 1.0 and removed in 1.2.
len(vectorizer.vocabulary_)

81183

<h1 style="color:gray;font-family:segoe ui"><i>tf</i> below is the document-term count matrix for the training data: one row per document and one column per vocabulary token.</h1>

In [357]:
# Densify the document-term matrix so it can be sliced and summed with plain NumPy below.
# NOTE(review): this materializes the full documents x vocabulary array in memory,
# which is presumably several GB at this vocabulary size — confirm it fits.
tf = count_vectors.toarray()

In [361]:
# Document frequency: for every token (column), the number of documents (rows)
# in which its count is non-zero.
document_counts = np.count_nonzero(tf,axis=0)

In [394]:
# Pair each token's document frequency with the token itself, in matrix-column order.
# The names are rebuilt from vectorizer.vocabulary_ (token -> column index) because
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
doc_counts_with_labels = list(zip(document_counts,feature_names))

In [414]:
# (number of training documents, vocabulary size)
tf.shape

(12000, 81183)

In [395]:
# Sanity check: one (document frequency, token) pair per vocabulary token.
len(doc_counts_with_labels)

81183

<h1 style="color:gray;font-family:segoe ui">Let us create a DataFrame of the complete vocabulary with each token's document frequency (the number of training documents that contain it).</h1>

In [407]:
# One row per vocabulary token: 'counts' is its document frequency, 'word' the token.
# Rows are in matrix-column order, so the default row label equals the token's column in tf.
vocab_full = pd.DataFrame(data=doc_counts_with_labels,columns=['counts','word'])

In [408]:
# Order tokens by document frequency, highest first.
vocab_full.sort_values(by=['counts'],ascending=False,inplace=True)
# reset_index() keeps the old row labels in a new 'index' column; later cells use
# that column to look up each token's column in tf.
# NOTE: both calls mutate vocab_full in place, so this cell is not idempotent —
# re-running it without rebuilding vocab_full adds a further index column.
vocab_full.reset_index(inplace=True)

<h1 style="color:gray;font-family:segoe ui">Let's slice that Dataframe and pick top 5000 tokens based on Document Frequency.</h1>

In [410]:
# Keep the 5000 tokens with the highest document frequency. Take an explicit copy:
# later cells add columns to this frame, and assigning into a slice of vocab_full
# is chained assignment (SettingWithCopyWarning, hidden here by the global warnings filter).
vocab5000 = vocab_full.iloc[:5000,:].copy()
vocab5000.head()

Unnamed: 0,index,counts,word
0,20150,12000,edu
1,66308,12000,srv
2,12050,12000,cmu
3,52015,12000,path
4,9551,12000,cantaloupe


<h1 style="color:gray;font-family:segoe ui">Time to calculate class-wise counts for each token in the 5000 word long vocab.</h1>

In [417]:
# Per-class term totals for the selected vocabulary. The training rows of `tf` are
# laid out in consecutive blocks of 600 documents per class (12000 rows in total),
# so class i occupies rows [600*i, 600*(i+1)).
vocab_columns = vocab5000['index'].values  # column of each selected token in tf
for i,j in enumerate(range(0,12000,600)):
    # Sum every token's count over this class's documents, then pick the selected
    # tokens with one vectorized index instead of a per-row apply().
    class_totals = tf[j:j+600].sum(axis=0)
    vocab5000[f'count_in_class_{i}'] = class_totals[vocab_columns]
    print(f'Done for class {i}')

Done for class 0
Done for class 1
Done for class 2
Done for class 3
Done for class 4
Done for class 5
Done for class 6
Done for class 7
Done for class 8
Done for class 9
Done for class 10
Done for class 11
Done for class 12
Done for class 13
Done for class 14
Done for class 15
Done for class 16
Done for class 17
Done for class 18
Done for class 19


In [421]:
# Inspect the per-class count columns just added.
vocab5000.head()

Unnamed: 0,index,counts,word,count_in_class_0,count_in_class_1,count_in_class_2,count_in_class_3,count_in_class_4,count_in_class_5,count_in_class_6,count_in_class_7,count_in_class_8,count_in_class_9,count_in_class_10,count_in_class_11,count_in_class_12,count_in_class_13,count_in_class_14,count_in_class_15,count_in_class_16,count_in_class_17,count_in_class_18,count_in_class_19
0,20150,12000,edu,5585,4624,4663,4266,4883,3920,4727,4907,3815,5357,4613,4393,4481,5120,4986,6009,5979,5303,5568,5596
1,66308,12000,srv,967,972,971,954,942,931,998,857,798,715,769,1042,950,960,992,889,1088,1125,1266,1255
2,12050,12000,cmu,1660,1545,1495,1492,1612,1477,1539,1437,1300,1100,1536,1692,1547,1550,1736,1187,1955,1677,1890,2095
3,52015,12000,path,602,620,638,607,611,677,602,611,604,607,601,606,618,606,615,611,605,610,606,616
4,9551,12000,cantaloupe,748,787,796,704,678,764,831,704,622,609,606,865,726,763,817,771,828,940,1067,1011


In [425]:
# Total count of the selected vocabulary's tokens in each class (one value per class).
# The count columns are selected by name rather than by position, so the result stays
# correct if this cell is re-run after further columns have been added to the frame.
count_columns = [f'count_in_class_{i}' for i in range(20)]
sums = vocab5000[count_columns].sum(axis=0).values

<h2 style="color:gray;font-family:segoe ui">This is where we calculate the likelihood of finding a token given a particular class. We smooth the estimate by adding 1 to every token count and 30 to the class total, as suggested in the question.</h2>
<br>
<br>
\begin{equation*}
\hat{P}\left(w_i|\ c\right)\ =\ \frac{count(w_i,c)+1}{\sum_{w \in V}count\left(w,c\right)+30}
\end{equation*}

In [426]:
# Smoothed likelihood of every vocabulary token given each class:
# (count in class + 1) / (class total + 30).
for i in range(20):
    smoothed_counts = vocab5000[f'count_in_class_{i}'] + 1
    vocab5000[f'prob_w_class_{i}'] = smoothed_counts / (sums[i] + 30)

In [427]:
# Inspect the per-class likelihood columns just added.
vocab5000.head()

Unnamed: 0,index,counts,word,count_in_class_0,count_in_class_1,count_in_class_2,count_in_class_3,count_in_class_4,count_in_class_5,count_in_class_6,count_in_class_7,count_in_class_8,count_in_class_9,count_in_class_10,count_in_class_11,count_in_class_12,count_in_class_13,count_in_class_14,count_in_class_15,count_in_class_16,count_in_class_17,count_in_class_18,count_in_class_19,prob_w_class_0,prob_w_class_1,prob_w_class_2,prob_w_class_3,prob_w_class_4,prob_w_class_5,prob_w_class_6,prob_w_class_7,prob_w_class_8,prob_w_class_9,prob_w_class_10,prob_w_class_11,prob_w_class_12,prob_w_class_13,prob_w_class_14,prob_w_class_15,prob_w_class_16,prob_w_class_17,prob_w_class_18,prob_w_class_19
0,20150,12000,edu,5585,4624,4663,4266,4883,3920,4727,4907,3815,5357,4613,4393,4481,5120,4986,6009,5979,5303,5568,5596,0.047244,0.045154,0.055767,0.05241,0.066009,0.042967,0.069742,0.056355,0.048223,0.060657,0.047462,0.036511,0.055162,0.048692,0.043223,0.049834,0.04721,0.033248,0.03917,0.046516
1,66308,12000,srv,967,972,971,954,942,931,998,857,798,715,769,1042,950,960,992,889,1088,1125,1266,1255,0.008187,0.009499,0.011622,0.01173,0.012745,0.010213,0.014736,0.009852,0.010097,0.008106,0.007921,0.008667,0.011704,0.009137,0.008606,0.00738,0.008597,0.007058,0.008912,0.010438
2,12050,12000,cmu,1660,1545,1495,1492,1612,1477,1539,1437,1300,1100,1536,1692,1547,1550,1736,1187,1955,1677,1890,2095,0.014048,0.015094,0.017887,0.018338,0.0218,0.016196,0.022716,0.016511,0.016441,0.012464,0.01581,0.014068,0.019052,0.014747,0.015055,0.009851,0.015442,0.010519,0.013301,0.017419
3,52015,12000,path,602,620,638,607,611,677,602,611,604,607,601,606,618,606,615,611,605,610,606,616,0.0051,0.006063,0.00764,0.007468,0.008271,0.00743,0.008895,0.007027,0.007645,0.006883,0.006193,0.005044,0.007618,0.005771,0.005339,0.005075,0.004784,0.00383,0.004269,0.005128
4,9551,12000,cantaloupe,748,787,796,704,678,764,831,704,622,609,606,865,726,763,817,771,828,940,1067,1011,0.006335,0.007693,0.00953,0.008659,0.009177,0.008383,0.012273,0.008095,0.007873,0.006906,0.006244,0.007196,0.008948,0.007264,0.00709,0.006401,0.006545,0.005899,0.007512,0.008411


In [435]:
# Index the 5k vocabulary table by token so the prediction cells can look up a
# token's per-class likelihoods by label. The prediction cells read this frame as `temp`.
temp = vocab5000.set_index('word',drop=True)
temp.head()

Unnamed: 0_level_0,index,counts,count_in_class_0,count_in_class_1,count_in_class_2,count_in_class_3,count_in_class_4,count_in_class_5,count_in_class_6,count_in_class_7,count_in_class_8,count_in_class_9,count_in_class_10,count_in_class_11,count_in_class_12,count_in_class_13,count_in_class_14,count_in_class_15,count_in_class_16,count_in_class_17,count_in_class_18,count_in_class_19,prob_w_class_0,prob_w_class_1,prob_w_class_2,prob_w_class_3,prob_w_class_4,prob_w_class_5,prob_w_class_6,prob_w_class_7,prob_w_class_8,prob_w_class_9,prob_w_class_10,prob_w_class_11,prob_w_class_12,prob_w_class_13,prob_w_class_14,prob_w_class_15,prob_w_class_16,prob_w_class_17,prob_w_class_18,prob_w_class_19
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
edu,20150,12000,5585,4624,4663,4266,4883,3920,4727,4907,3815,5357,4613,4393,4481,5120,4986,6009,5979,5303,5568,5596,0.047244,0.045154,0.055767,0.05241,0.066009,0.042967,0.069742,0.056355,0.048223,0.060657,0.047462,0.036511,0.055162,0.048692,0.043223,0.049834,0.04721,0.033248,0.03917,0.046516
srv,66308,12000,967,972,971,954,942,931,998,857,798,715,769,1042,950,960,992,889,1088,1125,1266,1255,0.008187,0.009499,0.011622,0.01173,0.012745,0.010213,0.014736,0.009852,0.010097,0.008106,0.007921,0.008667,0.011704,0.009137,0.008606,0.00738,0.008597,0.007058,0.008912,0.010438
cmu,12050,12000,1660,1545,1495,1492,1612,1477,1539,1437,1300,1100,1536,1692,1547,1550,1736,1187,1955,1677,1890,2095,0.014048,0.015094,0.017887,0.018338,0.0218,0.016196,0.022716,0.016511,0.016441,0.012464,0.01581,0.014068,0.019052,0.014747,0.015055,0.009851,0.015442,0.010519,0.013301,0.017419
path,52015,12000,602,620,638,607,611,677,602,611,604,607,601,606,618,606,615,611,605,610,606,616,0.0051,0.006063,0.00764,0.007468,0.008271,0.00743,0.008895,0.007027,0.007645,0.006883,0.006193,0.005044,0.007618,0.005771,0.005339,0.005075,0.004784,0.00383,0.004269,0.005128
cantaloupe,9551,12000,748,787,796,704,678,764,831,704,622,609,606,865,726,763,817,771,828,940,1067,1011,0.006335,0.007693,0.00953,0.008659,0.009177,0.008383,0.012273,0.008095,0.007873,0.006906,0.006244,0.007196,0.008948,0.007264,0.00709,0.006401,0.006545,0.005899,0.007512,0.008411


In [472]:
# Map class index (position in `labels`) -> newsgroup name.
labels_dict = dict(enumerate(labels))

In [486]:
# Per-class log-likelihood assigned to a token that is not in the vocabulary:
# log(1 / (class total + 30)), i.e. the smoothed estimate for a zero count.
log_sums_laplacian = [np.log(1/(30+class_total)) for class_total in sums]

<h1 style="color:gray;font-family:segoe ui">Let's predict for the training data. Note that we do not use prior probabilities when computing the maximum a posteriori (MAP) class, because every class has the same number of documents in the training set.</h1>

In [488]:
# Classify every training document with the current vocabulary table (`temp`).
# A document's score for a class is the sum, over its tokens, of
# count * log P(token | class); tokens outside the vocabulary contribute the
# per-class value from log_sums_laplacian. No class prior is added.

# Log-likelihood table (rows: vocabulary tokens, columns: the 20 classes), computed
# once here instead of taking np.log of one scalar .loc lookup per token per class.
prob_columns = [f'prob_w_class_{j}' for j in range(20)]
log_likelihoods = np.log(temp[prob_columns])
oov_log_probs = np.asarray(log_sums_laplacian)

train_predictions = []

for i,text in enumerate(corpus):
    token_counts = Counter(text.split(" "))
    # Tokens of this document that are in the vocabulary, with their counts.
    good_keys = temp.index.intersection(token_counts.keys())
    good_counts = np.array([token_counts[key] for key in good_keys])
    # Total occurrences of out-of-vocabulary tokens in this document.
    oov_total = sum(token_counts.values()) - good_counts.sum()
    # One dot product yields the score of all 20 classes at once.
    class_scores = good_counts @ log_likelihoods.loc[good_keys].values + oov_total * oov_log_probs
    # argmax returns the first maximum, matching a strict '>' scan from class 0.
    train_predictions.append(labels_dict[int(np.argmax(class_scores))])
    if i%200 == 0:
        print(i,end=" ")

0 200 400 600 800 1000 1200 1400 1600 1800 2000 2200 2400 2600 2800 3000 3200 3400 3600 3800 4000 4200 4400 4600 4800 5000 5200 5400 5600 5800 6000 6200 6400 6600 6800 7000 7200 7400 7600 7800 8000 8200 8400 8600 8800 9000 9200 9400 9600 9800 10000 10200 10400 10600 10800 11000 11200 11400 11600 11800 

<h1 style="color:gray;font-family:segoe ui">Training Accuracy of 90.6%. Not bad!</h1>

In [491]:
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric in its two
# arguments, so the value is unchanged; the order is fixed for clarity.
train_accuracy_5k = accuracy_score(Y_train_labels,train_predictions)
train_accuracy_5k

0.906

<h1 style="color:gray;font-family:segoe ui">Load Test data.</h1>

In [492]:
# Build the test corpus: one pre-processed, space-joined string per test file.
# NOTE(review): no explicit encoding is passed to open(), so decoding relies on the
# platform default — confirm this is intended for the raw newsgroup files.
test_corpus = []
for i, (newsgroup, filename) in enumerate(zip(Y_test_labels, X_test_filenames)):
    # The `with` block closes the file on exit, so no explicit close() is needed,
    # and the file is released before the (slower) pre-processing starts.
    with open(f'./NEWS/20_newsgroups/{newsgroup}/{filename}','r') as f:
        text = " ".join(f.readlines())
    test_corpus.append(" ".join(routine(text)))
    # Progress marker every 1000 documents.
    if i%1000 == 0:
        print(i,end=" ")

0 1000 2000 3000 4000 5000 6000 7000 

<h1 style="color:gray;font-family:segoe ui">Let's predict for Test data.</h1>

In [493]:
# Classify every test document with the current vocabulary table (`temp`).
# A document's score for a class is the sum, over its tokens, of
# count * log P(token | class); tokens outside the vocabulary contribute the
# per-class value from log_sums_laplacian. No class prior is added.

# Log-likelihood table (rows: vocabulary tokens, columns: the 20 classes), computed
# once here instead of taking np.log of one scalar .loc lookup per token per class.
prob_columns = [f'prob_w_class_{j}' for j in range(20)]
log_likelihoods = np.log(temp[prob_columns])
oov_log_probs = np.asarray(log_sums_laplacian)

test_predictions = []

for i,text in enumerate(test_corpus):
    token_counts = Counter(text.split(" "))
    # Tokens of this document that are in the vocabulary, with their counts.
    good_keys = temp.index.intersection(token_counts.keys())
    good_counts = np.array([token_counts[key] for key in good_keys])
    # Total occurrences of out-of-vocabulary tokens in this document.
    oov_total = sum(token_counts.values()) - good_counts.sum()
    # One dot product yields the score of all 20 classes at once.
    class_scores = good_counts @ log_likelihoods.loc[good_keys].values + oov_total * oov_log_probs
    # argmax returns the first maximum, matching a strict '>' scan from class 0.
    test_predictions.append(labels_dict[int(np.argmax(class_scores))])
    if i%200 == 0:
        print(i,end=" ")

0 200 400 600 800 1000 1200 1400 1600 1800 2000 2200 2400 2600 2800 3000 3200 3400 3600 3800 4000 4200 4400 4600 4800 5000 5200 5400 5600 5800 6000 6200 6400 6600 6800 7000 7200 7400 7600 7800 

<h1 style="color:gray;font-family:segoe ui">Test accuracy of 84.88%. We can live with that.</h1>

In [494]:
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric in its two
# arguments, so the value is unchanged; the order is fixed for clarity.
test_accuracy_5k = accuracy_score(Y_test_labels,test_predictions)
test_accuracy_5k

0.8488183068650744

<h1 style="color:gray;font-family:segoe ui">Time to expand vocabulary to 10000 tokens.</h1>

In [None]:
# Keep the 10000 tokens with the highest document frequency. Take an explicit copy:
# later cells add columns to this frame, and assigning into a slice of vocab_full
# is chained assignment (SettingWithCopyWarning, hidden here by the global warnings filter).
vocab10000 = vocab_full.iloc[:10000,:].copy()
vocab10000.head()

In [496]:
# Per-class term totals for the 10k vocabulary. The training rows of `tf` are
# laid out in consecutive blocks of 600 documents per class (12000 rows in total),
# so class i occupies rows [600*i, 600*(i+1)).
vocab_columns = vocab10000['index'].values  # column of each selected token in tf
for i,j in enumerate(range(0,12000,600)):
    # Sum every token's count over this class's documents, then pick the selected
    # tokens with one vectorized index instead of a per-row apply().
    class_totals = tf[j:j+600].sum(axis=0)
    vocab10000[f'count_in_class_{i}'] = class_totals[vocab_columns]
    print(f'Done for class {i}')

Done for class 0
Done for class 1
Done for class 2
Done for class 3
Done for class 4
Done for class 5
Done for class 6
Done for class 7
Done for class 8
Done for class 9
Done for class 10
Done for class 11
Done for class 12
Done for class 13
Done for class 14
Done for class 15
Done for class 16
Done for class 17
Done for class 18
Done for class 19


In [497]:
# Total count of the 10k vocabulary's tokens in each class (one value per class).
# The count columns are selected by name rather than by position, so the result stays
# correct if this cell is re-run after further columns have been added to the frame.
count_columns = [f'count_in_class_{i}' for i in range(20)]
sums = vocab10000[count_columns].sum(axis=0).values

In [498]:
# Smoothed likelihood of every vocabulary token given each class:
# (count in class + 1) / (class total + 30).
for i in range(20):
    smoothed_counts = vocab10000[f'count_in_class_{i}'] + 1
    vocab10000[f'prob_w_class_{i}'] = smoothed_counts / (sums[i] + 30)

In [499]:
# Index the 10k vocabulary table by token. This rebinds `temp`, so the prediction
# cells below now score documents against the 10k vocabulary.
temp = vocab10000.set_index('word',drop=True)
temp.head()

Unnamed: 0_level_0,index,counts,count_in_class_0,count_in_class_1,count_in_class_2,count_in_class_3,count_in_class_4,count_in_class_5,count_in_class_6,count_in_class_7,count_in_class_8,count_in_class_9,count_in_class_10,count_in_class_11,count_in_class_12,count_in_class_13,count_in_class_14,count_in_class_15,count_in_class_16,count_in_class_17,count_in_class_18,count_in_class_19,prob_w_class_0,prob_w_class_1,prob_w_class_2,prob_w_class_3,prob_w_class_4,prob_w_class_5,prob_w_class_6,prob_w_class_7,prob_w_class_8,prob_w_class_9,prob_w_class_10,prob_w_class_11,prob_w_class_12,prob_w_class_13,prob_w_class_14,prob_w_class_15,prob_w_class_16,prob_w_class_17,prob_w_class_18,prob_w_class_19
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
edu,20150,12000,5585,4624,4663,4266,4883,3920,4727,4907,3815,5357,4613,4393,4481,5120,4986,6009,5979,5303,5568,5596,0.043796,0.041129,0.052048,0.049121,0.062055,0.040017,0.065001,0.051711,0.043812,0.055546,0.042987,0.033698,0.0505,0.044131,0.039463,0.046466,0.04343,0.030419,0.036094,0.04326
srv,66308,12000,967,972,971,954,942,931,998,857,798,715,769,1042,950,960,992,889,1088,1125,1266,1255,0.007589,0.008653,0.010847,0.010994,0.011981,0.009512,0.013734,0.00904,0.009173,0.007423,0.007174,0.007999,0.010715,0.008282,0.007858,0.006881,0.007909,0.006458,0.008212,0.009708
cmu,12050,12000,1660,1545,1495,1492,1612,1477,1539,1437,1300,1100,1536,1692,1547,1550,1736,1187,1955,1677,1890,2095,0.013023,0.013748,0.016695,0.017187,0.020494,0.015084,0.021172,0.015151,0.014937,0.011414,0.01432,0.012984,0.017442,0.013366,0.013745,0.009185,0.014205,0.009624,0.012256,0.0162
path,52015,12000,602,620,638,607,611,677,602,611,604,607,601,606,618,606,615,611,605,610,606,616,0.004728,0.005522,0.007131,0.006999,0.007776,0.00692,0.00829,0.006448,0.006946,0.006303,0.005609,0.004655,0.006974,0.005231,0.004874,0.004732,0.004401,0.003504,0.003934,0.004769
cantaloupe,9551,12000,748,787,796,704,678,764,831,704,622,609,606,865,726,763,817,771,828,940,1067,1011,0.005872,0.007007,0.008894,0.008116,0.008627,0.007807,0.011438,0.007428,0.007153,0.006324,0.005655,0.006641,0.008191,0.006584,0.006473,0.005969,0.006021,0.005397,0.006922,0.007822


In [500]:
# Per-class log-likelihood assigned to a token that is not in the vocabulary:
# log(1 / (class total + 30)), i.e. the smoothed estimate for a zero count.
log_sums_laplacian = [np.log(1/(30+class_total)) for class_total in sums]

<h1 style="color:gray;font-family:segoe ui">Let's predict for Training data with the 10k vocab. Our accuracy should improve.</h1>

In [501]:
train_predictions_10k = []

# Classify every training document with the current vocabulary table (`temp`).
# A document's score for a class is the sum, over its tokens, of
# count * log P(token | class); tokens outside the vocabulary contribute the
# per-class value from log_sums_laplacian. No class prior is added.

# Log-likelihood table (rows: vocabulary tokens, columns: the 20 classes), computed
# once here instead of taking np.log of one scalar .loc lookup per token per class.
prob_columns = [f'prob_w_class_{j}' for j in range(20)]
log_likelihoods = np.log(temp[prob_columns])
oov_log_probs = np.asarray(log_sums_laplacian)

for i,text in enumerate(corpus):
    token_counts = Counter(text.split(" "))
    # Tokens of this document that are in the vocabulary, with their counts.
    good_keys = temp.index.intersection(token_counts.keys())
    good_counts = np.array([token_counts[key] for key in good_keys])
    # Total occurrences of out-of-vocabulary tokens in this document.
    oov_total = sum(token_counts.values()) - good_counts.sum()
    # One dot product yields the score of all 20 classes at once.
    class_scores = good_counts @ log_likelihoods.loc[good_keys].values + oov_total * oov_log_probs
    # argmax returns the first maximum, matching a strict '>' scan from class 0.
    train_predictions_10k.append(labels_dict[int(np.argmax(class_scores))])
    if i%200 == 0:
        print(i,end=" ")

0 200 400 600 800 1000 1200 1400 1600 1800 2000 2200 2400 2600 2800 3000 3200 3400 3600 3800 4000 4200 4400 4600 4800 5000 5200 5400 5600 5800 6000 6200 6400 6600 6800 7000 7200 7400 7600 7800 8000 8200 8400 8600 8800 9000 9200 9400 9600 9800 10000 10200 10400 10600 10800 11000 11200 11400 11600 11800 

<h1 style="color:gray;font-family:segoe ui">It does for Training data. Let's check Test data now.</h1>

In [502]:
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric in its two
# arguments, so the value is unchanged; the order is fixed for clarity.
train_accuracy_10k = accuracy_score(Y_train_labels,train_predictions_10k)
train_accuracy_10k

0.9288333333333333

In [503]:
test_predictions_10k = []

# Classify every test document with the current vocabulary table (`temp`).
# A document's score for a class is the sum, over its tokens, of
# count * log P(token | class); tokens outside the vocabulary contribute the
# per-class value from log_sums_laplacian. No class prior is added.

# Log-likelihood table (rows: vocabulary tokens, columns: the 20 classes), computed
# once here instead of taking np.log of one scalar .loc lookup per token per class.
prob_columns = [f'prob_w_class_{j}' for j in range(20)]
log_likelihoods = np.log(temp[prob_columns])
oov_log_probs = np.asarray(log_sums_laplacian)

for i,text in enumerate(test_corpus):
    token_counts = Counter(text.split(" "))
    # Tokens of this document that are in the vocabulary, with their counts.
    good_keys = temp.index.intersection(token_counts.keys())
    good_counts = np.array([token_counts[key] for key in good_keys])
    # Total occurrences of out-of-vocabulary tokens in this document.
    oov_total = sum(token_counts.values()) - good_counts.sum()
    # One dot product yields the score of all 20 classes at once.
    class_scores = good_counts @ log_likelihoods.loc[good_keys].values + oov_total * oov_log_probs
    # argmax returns the first maximum, matching a strict '>' scan from class 0.
    test_predictions_10k.append(labels_dict[int(np.argmax(class_scores))])
    if i%200 == 0:
        print(i,end=" ")

0 200 400 600 800 1000 1200 1400 1600 1800 2000 2200 2400 2600 2800 3000 3200 3400 3600 3800 4000 4200 4400 4600 4800 5000 5200 5400 5600 5800 6000 6200 6400 6600 6800 7000 7200 7400 7600 7800 

<h1 style="color:gray;font-family:segoe ui">We note that this goes up too.</h1>

In [504]:
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric in its two
# arguments, so the value is unchanged; the order is fixed for clarity.
test_accuracy_10k = accuracy_score(Y_test_labels,test_predictions_10k)
test_accuracy_10k

0.863698887082656

<h1 style="color:gray;font-family:segoe ui">Summary Scores:</h1>

<table style="color:gray;font-family:segoe ui;font-size:20px">

<tr><td>Training accuracy with 5k vocab:</td> <td>90.6%</td></tr>
<tr><td>Test accuracy with 5k vocab:</td> <td>84.88%</td></tr>
<tr><td>Training accuracy with 10k vocab:</td> <td>92.88%</td></tr>
<tr><td>Test accuracy with 10k vocab:</td> <td>86.37%</td></tr>
</table>