In [3]:
import csv, re, pickle, itertools, progressbar, importlib
import numpy as np
import nltk
from nltk import word_tokenize

from categories import categories
import filters, vocabulary_tester, vocabulary_builders

In [4]:
# Build two lookups from the question -> category training file:
#   qcat_dict : question_id -> parent category_id (via cats.parent_id)
#   cat_freq  : category name of the row's own category_id -> number of rows
cats = categories()

qcat_dict = {}  # mapping from question_id to the parent category_id
cat_freq = nltk.FreqDist()  # maps cats.name(category_id) to the number of rows carrying that category

# The context manager closes the file even if a row fails to parse;
# newline='' is the mode the csv module expects for files it reads.
with open('question_category_train.csv', 'r', newline='') as qcatfile:
    qcatreader = csv.reader(qcatfile)

    next(qcatreader, None)  # skip the column description (header) row; tolerate an empty file

    for qcat in qcatreader:
        cat_id = int(qcat[1])   # second column: category id of this row
        pcat_id = cats.parent_id(cat_id)
        q_id = int(qcat[2])     # third column: question id

        qcat_dict[q_id] = pcat_id
        # NOTE(review): the frequency is keyed by the name of cat_id, not of pcat_id,
        # while qcat_dict stores pcat_id — confirm this asymmetry is intended.
        cat_freq[cats.name(cat_id)] += 1

#cat_freq.plot()
#cat_freq.most_common(14)

In [5]:
# Build (or load from cache) the tokenised questions and the per-category vocabularies.
#   questions  : list of {"words": <filtered tokens>, "category": <category name>, "cat": <same name>}
#   vocabulary : category name -> nltk.FreqDist of tokens, plus the key 'all' for every category combined
importlib.reload(filters)  # re-import filters so edits to the module are picked up

questions = []
vocabulary = {}
vocabulary['all'] = nltk.FreqDist()
for cat_name in cats.all_names():
    vocabulary[cat_name] = nltk.FreqDist()

# Set this parameter to True if you want to read through
# all questions again; set it to False to load the cached pickle files instead.
NewRead = False 

total_enteties_count = 0

if NewRead:
    bar = progressbar.ProgressBar()

    # The CSV is only opened when it is actually needed, and is closed on exit.
    with open('question_train.csv', 'r', newline='') as qfile:
        qreader = csv.reader(qfile)
        next(qreader, None)  # skip the header row

        for row in bar(qreader):
            # Only rows with exactly 21 fields and "0" in column 15 are used.
            if len(row) != 21: continue
            if row[15] != "0": continue

            question_id = int(row[0])
            if question_id not in qcat_dict: continue  # question has no known category

            cat_id = qcat_dict[question_id]
            question_cat_name = cats.name(cat_id)

            sentence = row[4].lower()
            # running a sequence of filters on the raw question string 
            for filt in [filters.punctuation_filter]:
                sentence = filt(sentence)

            words = word_tokenize(sentence)
            # running a sequence of filters on the already tokenized sentence
            for filt in [filters.year_tracker, filters.small_word_filter, filters.stemming_filter]:
                words = filt(words)

            # Later cells read q['category']; "cat" is kept as well so that any
            # consumer of the previous key name keeps working.
            questions.append( {"words":words,
                               "category":question_cat_name,
                               "cat":question_cat_name} )

            # Count the tokens once and add them to both distributions.
            word_counts = nltk.FreqDist(words)
            vocabulary[question_cat_name] += word_counts
            vocabulary['all'] += word_counts

    ## Saving into pickle files (closing the handles guarantees the data is flushed to disk)
    with open('questions.pkl', 'wb') as q_file, open('vocabulary.pkl', 'wb') as v_file:
        pickle.dump(questions, q_file)
        pickle.dump(vocabulary, v_file)

else:
    ## Loading from pickle files
    # NOTE: pickle.load executes arbitrary code from the file; only load caches you created yourself.
    with open('questions.pkl', 'rb') as q_file, open('vocabulary.pkl', 'rb') as v_file:
        questions, vocabulary = pickle.load(q_file), pickle.load(v_file)

    # Caches written before the key was renamed only carry "cat";
    # add the "category" key that the later cells read.
    for question in questions:
        if 'category' not in question and 'cat' in question:
            question['category'] = question['cat']

In [None]:
# Refresh the helper modules so that edits made to them on disk take effect,
# then re-bind the builder functions from the freshly reloaded module.
for helper_module in (vocabulary_tester, vocabulary_builders):
    importlib.reload(helper_module)
from vocabulary_builders import most_common, most_common_reduced, ig_based

# One (tokens, category name) pair per question.
corpus = [(entry['words'], entry['category']) for entry in questions]

# Testing IG-based Vocabulary

In [None]:
# Register one ig_based builder per vocabulary size M = 50, 100, ..., 500.
# Each entry is (builder function, keyword arguments for that builder).
vocab_builders = {}
for M in np.arange(50,501,50):
    name = "ig_based (M = {0})".format(M)
    vocab_builders[ name ] = (ig_based, dict(frequencies=vocabulary,
                                             cat_frequencies=cat_freq,
                                             categories=cats.all_names(),
                                             M=M,
                                             read_from_file=True))

# Call every registered builder with its keyword arguments.
vocabularies = {}
for vb_name, (vb, args) in vocab_builders.items():
    vocabularies[vb_name] = vb(**args)

# Evaluate all vocabularies on the corpus with the given train/test set sizes.
vocabulary_tester.test(vocabularies, corpus, tr_set_size=10000, te_set_size=4000)

In [None]:

# Register every builder variant (ig_based, most_common, most_common_reduced)
# for each vocabulary size M = 50, 100, ..., 500, then build all vocabularies.
# The registry is created here so the cell does not depend on another cell
# having defined vocab_builders first.
vocab_builders = {}

for M in np.arange(50,501,50):
    name = "ig_based (M = {0})".format(M)
    vocab_builders[ name ] = (ig_based, {"frequencies": vocabulary,
                                         "cat_frequencies": cat_freq,
                                         "categories": cats.all_names(),
                                         "M": M,
                                         "read_from_file": True})
    
    name = "most_common (M = {0})".format(M)
    vocab_builders[ name ] = (most_common, {"frequencies":vocabulary,
                                            "categories": cats.all_names(),
                                            "M": M})
    
    for S in np.arange(8,13):
        for MS in [50,100,150]:
            name = "most_common_reduced (M = {0}, S = {1}, MS = {2})".format(M,S,MS)
            # NOTE(review): S and MS appear only in the entry name; they are not
            # passed to most_common_reduced, so all 15 entries for a given M are
            # built with identical arguments. TODO: forward them once the
            # builder's keyword names are confirmed.
            vocab_builders[ name ] = (most_common_reduced, {"frequencies":vocabulary,
                                                            "categories": cats.all_names(),
                                                            "M": M})

# Call every registered builder with its keyword arguments and log progress.
vocabularies = {}
for vb_name, (vb, args) in vocab_builders.items():
    vocabularies[vb_name] = vb(**args)
    print(vb_name,"done with M =",args["M"])

# One (tokens, category name) pair per question.
corpus = [(q['words'], q['category']) for q in questions]

most_common_reduced (M = 50, S = 12, MS = 50) done with M = 50


100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 500) done with M = 500
most_common_reduced (M = 400, S = 10, MS = 100) done with M = 400
most_common_reduced (M = 100, S = 8, MS = 150) done with M = 100
most_common_reduced (M = 350, S = 9, MS = 50) done with M = 350
most_common_reduced (M = 450, S = 9, MS = 50) done with M = 450
most_common_reduced (M = 50, S = 11, MS = 100) done with M = 50
most_common_reduced (M = 500, S = 12, MS = 100) done with M = 500
most_common_reduced (M = 450, S = 11, MS = 50) done with M = 450
most_common_reduced (M = 50, S = 9, MS = 150) done with M = 50
most_common_reduced (M = 500, S = 8, MS = 50) done with M = 500
most_common_reduced (M = 100, S = 9, MS = 100) done with M = 100
most_common_reduced (M = 350, S = 8, MS = 100) done with M = 350
most_common_reduced (M = 450, S = 12, MS = 150) done with M = 450
most_common_reduced (M = 150, S = 12, MS = 50) done with M = 150
most_common_reduced (M = 450, S = 11, MS = 150) done with M = 450
most_common_reduced (M = 500, S = 11, MS = 50) done wit

100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 200) done with M = 200
most_common_reduced (M = 350, S = 8, MS = 150) done with M = 350
most_common_reduced (M = 350, S = 9, MS = 100) done with M = 350
most_common_reduced (M = 400, S = 12, MS = 100) done with M = 400
most_common_reduced (M = 100, S = 11, MS = 50) done with M = 100
most_common_reduced (M = 500, S = 9, MS = 100) done with M = 500
most_common_reduced (M = 50, S = 8, MS = 50) done with M = 50
most_common_reduced (M = 50, S = 11, MS = 150) done with M = 50
most_common_reduced (M = 250, S = 12, MS = 100) done with M = 250
most_common_reduced (M = 350, S = 11, MS = 150) done with M = 350
most_common_reduced (M = 500, S = 11, MS = 100) done with M = 500
most_common_reduced (M = 150, S = 10, MS = 100) done with M = 150
most_common (M = 350) done with M = 350
most_common_reduced (M = 500, S = 10, MS = 100) done with M = 500
most_common_reduced (M = 500, S = 8, MS = 100) done with M = 500
most_common_reduced (M = 200, S = 10, MS = 150) done with M = 200


100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 300) done with M = 300
most_common (M = 200) done with M = 200
most_common_reduced (M = 300, S = 9, MS = 100) done with M = 300
most_common_reduced (M = 400, S = 9, MS = 150) done with M = 400
most_common_reduced (M = 200, S = 12, MS = 100) done with M = 200
most_common_reduced (M = 50, S = 12, MS = 150) done with M = 50
most_common_reduced (M = 350, S = 12, MS = 50) done with M = 350
most_common_reduced (M = 100, S = 11, MS = 100) done with M = 100
most_common_reduced (M = 250, S = 12, MS = 150) done with M = 250
most_common_reduced (M = 50, S = 9, MS = 50) done with M = 50
most_common_reduced (M = 300, S = 11, MS = 100) done with M = 300
most_common_reduced (M = 200, S = 11, MS = 100) done with M = 200
most_common_reduced (M = 200, S = 11, MS = 150) done with M = 200
most_common_reduced (M = 450, S = 8, MS = 150) done with M = 450
most_common_reduced (M = 150, S = 8, MS = 50) done with M = 150
most_common_reduced (M = 250, S = 11, MS = 100) done with M = 250
most_common

100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 100) done with M = 100


100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 400) done with M = 400
most_common_reduced (M = 350, S = 11, MS = 50) done with M = 350
most_common_reduced (M = 100, S = 8, MS = 100) done with M = 100
most_common_reduced (M = 150, S = 9, MS = 150) done with M = 150
most_common_reduced (M = 300, S = 12, MS = 100) done with M = 300
most_common_reduced (M = 50, S = 12, MS = 100) done with M = 50
most_common_reduced (M = 100, S = 11, MS = 150) done with M = 100
most_common_reduced (M = 500, S = 9, MS = 150) done with M = 500
most_common_reduced (M = 150, S = 11, MS = 50) done with M = 150
most_common_reduced (M = 250, S = 8, MS = 150) done with M = 250
most_common_reduced (M = 350, S = 12, MS = 100) done with M = 350
most_common_reduced (M = 350, S = 10, MS = 100) done with M = 350
most_common_reduced (M = 150, S = 9, MS = 100) done with M = 150


  0% ( 0 of 14) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--

most_common_reduced (M = 400, S = 8, MS = 100) done with M = 400


100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 150) done with M = 150
most_common_reduced (M = 150, S = 11, MS = 100) done with M = 150
most_common_reduced (M = 50, S = 8, MS = 150) done with M = 50
most_common_reduced (M = 50, S = 9, MS = 100) done with M = 50
most_common_reduced (M = 200, S = 9, MS = 100) done with M = 200
most_common_reduced (M = 300, S = 8, MS = 50) done with M = 300
most_common_reduced (M = 50, S = 8, MS = 100) done with M = 50
most_common_reduced (M = 300, S = 9, MS = 150) done with M = 300
most_common_reduced (M = 300, S = 8, MS = 150) done with M = 300


100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 350) done with M = 350
most_common_reduced (M = 100, S = 10, MS = 100) done with M = 100
most_common_reduced (M = 400, S = 10, MS = 50) done with M = 400
most_common_reduced (M = 250, S = 10, MS = 150) done with M = 250
most_common (M = 400) done with M = 400
most_common_reduced (M = 250, S = 11, MS = 50) done with M = 250
most_common_reduced (M = 200, S = 8, MS = 50) done with M = 200
most_common_reduced (M = 250, S = 9, MS = 150) done with M = 250
most_common (M = 100) done with M = 100
most_common_reduced (M = 250, S = 12, MS = 50) done with M = 250
most_common_reduced (M = 150, S = 8, MS = 150) done with M = 150
most_common_reduced (M = 350, S = 9, MS = 150) done with M = 350
most_common_reduced (M = 100, S = 12, MS = 50) done with M = 100
most_common_reduced (M = 400, S = 9, MS = 50) done with M = 400
most_common_reduced (M = 450, S = 9, MS = 150) done with M = 450


100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


ig_based (M = 250) done with M = 250
most_common_reduced (M = 250, S = 8, MS = 100) done with M = 250
most_common_reduced (M = 400, S = 11, MS = 100) done with M = 400
most_common_reduced (M = 500, S = 9, MS = 50) done with M = 500
most_common_reduced (M = 100, S = 12, MS = 100) done with M = 100
most_common_reduced (M = 100, S = 10, MS = 50) done with M = 100
