# Imports

In [1]:
# Import tools from external packages:
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from systemtools.number import *
from datatools.jsonutils import *
from datatools.jsonutils import *
from nlptools.basics import *
from nlptools.tokenizer import *

In [2]:
# Import tools of AuthFilt:
from authfilt.buckets import *

# Get your data

Here the data is a list of tuples.
Each tuple represents a document and contains the label and a list of sentences.
A sentence is composed of tokens (so, for this notebook, you need docs tokenized by sentences and words).
This file contains unbalanced classes (some have a lot of docs compared to others).

In [3]:
# Load the corpus: a list of (label, sentences) tuples.
# NOTE(review): the file is a pickle; presumably `deserialize` unpickles it, so only load files you trust.
dataset = deserialize("./data/unbalanced.pickle")
# Preview the structure, then report the corpus size:
bp(dataset)
print("Number of documents: %d" % len(dataset))

[
  (
    cinie•wordpress.com,
    [ [ Yahoo, News, ..., Obama, . ], [ The, Democrat, ..., Bush, . ], ..., [ Which, corrupt, ..., ask, ? ], [ The, ones, ..., stupid, . ] ]
  ),
  (
    cinie•wordpress.com,
    [ [ Face, it, ..., toast, . ], [ Running, as, ..., fail, . ], ..., [ But, you, ..., are, . ], [ That, clearly, ..., math, . ] ]
  ),
  ...,
  (
    trendaz.com,
    [ [ t, least, ..., reported, . ], [ ", A, ..., dpa, . ], ..., [ Pakistan, tribal, ..., Afghanistan, . ], [ The, country, ..., success, . ] ]
  ),
  (
    trendaz.com,
    [ [ (, dpa, ..., Tuesday, . ], [ ", He, ..., Post, . ], ..., [ Mike, Webster, ..., issues, . ], [ Police, filmed, ..., authorities, . ] ]
  )
]
Number of documents: 15098


# Create required data structures

In [4]:
# Create a dict mapping class -> ids -> documents.
# Each document gets a unique integer id: its position in `dataset`.
# If you have a very large dataset, you need to compute this using generators or with distributed computing.
documents = dict()
# `enumerate` supplies the running id, so there is no hand-maintained counter
# and the builtin `id` is no longer shadowed at notebook scope.
for docId, (label, sentences) in enumerate(dataset):
    documents.setdefault(label, dict())[docId] = sentences
bp(documents)

{
  03530.com: 
  {
    7515: [ [ A, new, ..., patterns, . ], [ Using, the, ..., words, . ], ..., [ ", The, ..., start, . ], [ Source, :, ..., British, Columbia ] ],
    7516: [ [ Tooth, bonding, ..., discoloured, . ], [ It, usually, ..., two, . ], ..., [ Many, people, ..., smile, . ], [ Get, information, ..., dentists, in ] ],
    7517: [ [ August, __int_2__, ..., risk, . ], [ The, study, ..., patients, . ], ..., [ ", Low, ..., noted, . ], [ SOURCE, :, ..., Aug., __int_2__ ] ],
    7518: [ [ NC, State, ..., setting, . ], [ Dr., Steven, ..., patients, . ], ..., [ A, lot, ..., medicine, . ], [ For, some, ..., humans, . ] ],
    7519: [ [ Vitamin, D, ..., results, . ], [ In, this, ..., and, __int_2__ ], ..., [ Annals, of, ..., articles, . ], [ Source, :, ..., of, Physicians ] ],
    ...,
    7625: [ [ Oxygen, Biotherapeutics, ..., TBI, ) ], [ Oxycyte, is, ..., carrier, . ], ..., [ Additional, information, ..., release, . ], [ This, caution, ..., of, __int_4__ ] ],
    7626: [ [ The, Fat,

In [5]:
# Create a TokenCount-structure mapping class -> id -> tokens count:
idTokensCount = dict()
for label, docs in documents.items():
    # A document is a list of sentences; the token count stored for it is the
    # length of `flattenLists` applied to that list of sentences.
    idTokensCount[label] = {
        docId: len(flattenLists(sentences))
        for docId, sentences in docs.items()
    }
bp(idTokensCount)

{
  03530.com: { 7515: 371, 7516: 242, 7517: 305, 7518: 451, 7519: 380, ..., 7625: 678, 7626: 496, 7627: 997, 7628: 604, 7629: 604 },
  1st-guide.org: { 7233: 684, 7234: 770, 7235: 675, 7236: 495, 7237: 675, ..., 7344: 800, 7345: 733, 7346: 545, 7347: 1311, 7348: 636 },
  adrienne•livejournal.com: { 2013: 208, 2014: 806 },
  amanda•livejournal.com: { 10871: 414, 10872: 121, 10873: 144, 10874: 277, 10875: 582, ..., 10958: 426, 10959: 285, 10960: 221, 10961: 2170, 10962: 223 },
  amanda•wordpress.com: { 3499: 211, 3500: 143, 3501: 521, 3502: 401, 3503: 1440, ..., 3514: 620, 3515: 307, 3516: 377, 3517: 154, 3518: 96 },
  ...,
  weblogsinc.com: { 5134: 475, 5135: 368, 5136: 252, 5137: 270, 5138: 817, ..., 5182: 579, 5183: 134, 5184: 675, 5185: 238, 5186: 551 },
  weightless dolls•livejournal.com: { 7349: 287, 7350: 265, 7351: 125, 7352: 227, 7353: 103, ..., 7363: 656, 7364: 197, 7365: 607, 7366: 251, 7367: 179 },
  wired.com: { 13224: 1619, 13225: 289, 13226: 346, 13227: 337, 13228: 958, .

In [6]:
# Display statistics (docs count, sentences count, tokens count) for each class:
rows = []
for label, docs in documents.items():
    sentenceCount = sum(len(doc) for doc in docs.values())
    tokensCount = sum(len(sentence) for doc in docs.values() for sentence in doc)
    # Labels are cut to 30 characters to keep each line short.
    line = "{} d={}, s={}, t={}".format(label[:30], len(docs), sentenceCount, tokensCount)
    # Keep the document count next to the text, so the sort below does not have
    # to parse numbers back out of the formatted line.
    rows.append((len(docs), line))
# Formatted lines in class iteration order:
stats = [line for _, line in rows]
# Classes with the most documents first; the sort is stable, so ties keep their original order.
ranked = sorted(rows, key=lambda row: row[0], reverse=True)
bp([line for _, line in ranked], 4)

[
  livejournal.com d=1500, s=53323, t=894781,
  ning.com d=496, s=11458, t=249702,
  blog.com d=491, s=14389, t=296090,
  salon.com d=480, s=19933, t=443980,
  baltimoresun.com d=475, s=14447, t=321028,
  blogs.com d=454, s=11700, t=240211,
  yachtchartersmagazine.com d=428, s=15283, t=474785,
  blogdig.net d=424, s=6146, t=140883,
  intl.in d=406, s=12449, t=265879,
  mercurynews.com d=402, s=11328, t=268828,
  ...,
  egl community sales•livejourna d=12, s=196, t=4138,
  cynthia•livejournal.com d=12, s=266, t=5191,
  seductivegirl•livejournal.com d=11, s=307, t=6240,
  joy•livejournal.com d=11, s=278, t=4597,
  dreamsandpeople•livejournal.co d=11, s=243, t=6250,
  runaway tales•livejournal.com d=10, s=700, t=9924,
  listannabel•livejournal.com d=10, s=383, t=6425,
  cinie•wordpress.com d=9, s=254, t=5591,
  adrienne•livejournal.com d=2, s=69, t=1014,
  niki•livejournal.com d=2, s=55, t=736
]


# Generate buckets of documents:

You can run the generation of multiple sets of buckets in parallel with different parameters (`maxLabelsPerBucket`, `tokensPerBucket`, `maxVarianceRatio`, `maxConsecutiveBadBucketCount` and `maxConsecutiveNoChangeCount`) and keep the set with few deleted ids and a small number of buckets.

In [7]:
# Generate buckets from the per-document token counts:
buckets = makeBuckets(
    idTokensCount,
    tokensPerBucket=1e6,
    # NOTE(review): true division yields a float here; confirm makeBuckets accepts a non-integer value.
    maxLabelsPerBucket=len(documents) / 4,
    maxVarianceRatio=0.2,
    verbose=True,
)

--------------------
Remaining labels: 104
Remaining tokens: 9525801 (100.0%)
Allocated labels: 0
This bucket is not a valid bucket...


This bucket is not ADDED


--------------------
Remaining labels: 104
Remaining tokens: 9525801 (100.0%)
Allocated labels: 0
This bucket is not a valid bucket...


This bucket is not ADDED


--------------------
Remaining labels: 104
Remaining tokens: 9525801 (100.0%)
Allocated labels: 0
This bucket is not a valid bucket...


This bucket is not ADDED


We removed 2013 (208 tokens) from adrienne•livejournal.com in remaining
We removed 2014 (806 tokens) from adrienne•livejournal.com in remaining
--------------------
Remaining labels: 103
Remaining tokens: 9524787 (99.98%)
Allocated labels: 0
This bucket is not a valid bucket...


This bucket is not ADDED


We removed 12019 (326 tokens) from niki•livejournal.com in remaining
We removed 12020 (410 tokens) from niki•livejournal.com in remaining
--------------------
Remaining labels: 102
Remaining tokens: 9

In [8]:
# Print some stats on generated buckets (only the last two are shown):
lastBuckets = buckets[-2:]
bp(lastBuckets)
print("Number of buckets: %d" % len(buckets))
for bucket in lastBuckets:
    bucketStats(bucket)

[
  {
    baltimoresun.com: { 12340: 437, 12356: 865, 12361: 1142, 12363: 1370, 12370: 2330, ..., 12731: 1030, 12756: 934, 12760: 404, 12776: 368, 12792: 1038 },
    blog.com: { 1525: 216, 1528: 303, 1532: 437, 1533: 326, 1535: 1061, ..., 1979: 489, 1981: 135, 1991: 255, 2008: 269, 2012: 757 },
    blogdig.net: { 2392: 104, 2393: 384, 2395: 118, 2418: 124, 2419: 102, ..., 2810: 656, 2811: 111, 2812: 333, 2813: 116, 2814: 475 },
    blogs.com: { 3526: 353, 3532: 337, 3534: 107, 3552: 110, 3565: 1707, ..., 3942: 101, 3949: 124, 3950: 2271, 3966: 161, 3967: 319 },
    google.com: { 4625: 149, 4627: 149, 4630: 895, 4642: 352, 4646: 174, ..., 4855: 178, 4856: 154, 4860: 885, 4864: 297, 4870: 703 },
    ...,
    techcrunch.com: { 14411: 524, 14413: 440, 14415: 611, 14416: 377, 14420: 407, ..., 14568: 414, 14574: 641, 14576: 102, 14582: 2246, 14584: 488 },
    technophobiac.net: { 11638: 419, 11656: 654, 11659: 801, 11665: 379, 11674: 503, ..., 12001: 749, 12006: 906, 12007: 531, 12016: 389, 

# Extraction of black n-grams from each bucket and merge

This can be done in parallel if your corpus is large.

In [10]:
# Merge the black n-grams of every bucket into one structure mapping class -> set of black n-grams:
bngrams = dict()
for bucket in pb(shuffle(buckets)):
    # First we make class-documents: for each class of the bucket, the sentences
    # of all its selected documents concatenated into a single list.
    cds, labels = [], []
    for label, docIds in bucket.items():
        labels.append(label)
        cd = []
        # Iterating the mapping yields its keys, i.e. the document ids.
        for docId in docIds:
            cd.extend(documents[label][docId])
        cds.append(cd)
    # Then we compute TFIDF with a ngrams range 1 to 3:
    tfidf = TFIDF(cds, doLower=True, ngramRange=(1, 3), minDF=1, verbose=False)
    # Then we generate bngrams (bns[i] is used as the black n-grams of labels[i]):
    # NOTE(review): 0.3 is presumably the deletion ratio; confirm against getBlackNgrams.
    bns = tfidf.getBlackNgrams(0.3)
    # And we merge it in the main black n-grams structure (bngrams):
    for i, currentLabel in enumerate(labels):
        bngrams.setdefault(currentLabel, set()).update(bns[i])
bp(bngrams)

  2% [                    ]
  8% [=                   ] (10m 49.863s left)
 17% [===                 ] (10m 45.026s left)
 26% [=====               ] (11m 8.722s left)
{
  03530.com: { ' nasal spray, ' zomig, ..., zomig ', zomig ' nasal },
  1st-guide.org: { , d -, , g, ..., zinc, zinc countertops },
  amanda•livejournal.com: { " aoi, " uruha, ..., you wish you, your animation },
  amanda•wordpress.com: { , i no, , o my, ..., wounds and, wounds and issues },
  angellilu•livejournal.com: { - yos ;, a jerk, ..., yo -, yo - yos },
  ...,
  weblogsinc.com: { ( nasdaq, ( nasdaq :, ..., wal - mart, wall street },
  weightless dolls•livejournal.com: { " what is, , from my, ..., your problem, your problem ? },
  wired.com: { ! eckhardt this, ! put, ..., zucker, zuckerberg },
  xanga.com: { " fleece, " fleece ", ..., your therapist will, zolpidem },
  yachtchartersmagazine.com: { ! ( tm, ! nosql, ..., zacks equity research, zealand pharma }
}


# Deletion of sentences

Can be done in parallel.

In [11]:
# Deletion: drop every sentence that contains at least one black n-gram of its class.
filteredDataset = []
for label, sentences in pb(dataset):
    if label in bngrams:
        bns = bngrams[label]
        newSentences = []
        for sentence in sentences:
            # Collect the lowercased 1-, 2- and 3-grams of the sentence
            # (the same n-gram range as the one given to TFIDF).
            ngrams = extractNgrams([sentence], ngrams=1, doLower=True)
            for n in (2, 3):
                ngrams = ngrams.union(extractNgrams([sentence], ngrams=n, doLower=True))
            # One set-disjointness test instead of scanning every black n-gram
            # of the class for each sentence; the result is the same.
            if bns.isdisjoint(ngrams):
                newSentences.append(sentence)
        filteredDataset.append((label, newSentences))
    else:
        # No black n-grams exist for this class: keep the document untouched.
        filteredDataset.append((label, sentences))

  0% [                    ]
  9% [=                   ] (1m 54.817s left)
 19% [===                 ] (1m 31.5s left)
 29% [=====               ] (1m 24.063s left)


In [12]:
# We display 2 docs (not filtered and filtered):
# Documents are paired by position, so both lists must still be aligned. Once
# empty docs have been removed from filteredDataset the positions no longer
# match, and this cell would silently compare unrelated documents.
assert len(dataset) == len(filteredDataset), \
    "dataset and filteredDataset are not aligned: re-run the deletion cell first"
for (_, doc1), (_, doc2) in zip(dataset, filteredDataset):
    # Pick a short example: exactly 4 sentences kept and at least 3 sentences removed.
    if 3 < len(doc2) < 5 and len(doc2) < len(doc1) - 2:
        print("Original doc:")
        print(detokenize(doc1))
        print("\nFiltered doc:")
        print(detokenize(doc2))
        break

Original doc:
Uttar Pradesh Health Minister Anant Kumar Misra Tuesday blamed the central government for the various diseases threatening to take on an epidemic proportion in the state.
He sought to blame the central government not only for the "inadequate supply" but also "substandard" and "delayed" supply of the required vaccines for preventing diseases like Japanese Encephalitis, viral encephalitis, malaria as well other vector borne diseases.
The minister, a cousin of the all powerful Bahujan Samaj Party (BSP) national general secretary Satish Chandra Misra, also accused the government of "failure to maintain the cold chain for storage of various vaccines in the state"
He admitted that there were as many as __int_3__ deaths on account of the spread of "acute encephalitis in Gorakhpur, Basti, Maharajganj, Siddharthnagar, Sant Kabir Nagar, Kushinagar, Bahraich spread across eastern Uttar Pradesh"
According to him, we have detected about cases of acute encephalitis in these districts s

In [13]:
# Remove empty docs (documents left with no sentence after the deletion step):
newFilteredDataset = []
labelsOfEmptyDocs = dict()  # label -> number of empty docs dropped for that label
count = 0  # total number of dropped docs
for label, doc in filteredDataset:
    if doc is not None and len(doc) > 0:
        newFilteredDataset.append((label, doc))
        continue
    labelsOfEmptyDocs[label] = labelsOfEmptyDocs.get(label, 0) + 1
    count += 1
bp(labelsOfEmptyDocs, 3)
print("%d docs deleted." % count)
# From here on, filteredDataset no longer has the same length as dataset.
filteredDataset = newFilteredDataset

{
  03530.com: 1,
  baltimoresun.com: 2,
  blog.com: 2,
  blogdig.net: 2,
  climateark.org: 8,
  downloadsquad.com: 1,
  dragonspam_ post your dragcave eggs•livejournal.com: 4,
  ...,
  techcrunch.com: 2,
  technophobiac.net: 4,
  theoffside.com: 1,
  trendaz.com: 11,
  web-directory.in: 1,
  xanga.com: 2,
  yachtchartersmagazine.com: 19
}
209 docs deleted.


In [14]:
# Print the final result: filteredDataset, a list of (label, sentences) tuples:
bp(filteredDataset)

[
  (
    cinie•wordpress.com,
    [ [ The, renegade, ..., statement, . ], [ The, nerve, ..., people, ! ], ..., [ Founded, in, ..., nation, . ], [ Which, corrupt, ..., ask, ? ] ]
  ),
  (
    cinie•wordpress.com,
    [ [ Face, it, ..., toast, . ], [ Running, as, ..., fail, . ], ..., [ No, gimmick, ..., ticket, . ], [ That, clearly, ..., math, . ] ]
  ),
  ...,
  (
    trendaz.com,
    [ [ ", Three, ..., added, . ], [ However, ,, ..., as, __int_2__ ], ..., [ Security, forces, ..., anonymity, . ], [ The, country, ..., success, . ] ]
  ),
  (
    trendaz.com,
    [ [ ", He, ..., Post, . ], [ ", It, ..., disgusting, . ], [ Mike, Webster, ..., issues, . ], [ Police, filmed, ..., authorities, . ] ]
  )
]


In [15]:
# Display of % deleted sentence (a little bit more than the deletion ratio of 0.3):
sentencesCount = 0
fSentencesCount = 0
for label, doc in dataset:
    sentencesCount += len(doc)
for label, doc in filteredDataset:
    fSentencesCount += len(doc)
print("sentencesCount: " + str(sentencesCount))
print("fSentencesCount: " + str(fSentencesCount))
print(str((sentencesCount - fSentencesCount) / sentencesCount * 100)[:4] + "% deleted")

sentencesCount: 447792
fSentencesCount: 296522
33.7% deleted
