In [1]:
import csv
import re
import numpy as np
import nltk
import itertools
import pickle
from categories import categories
import filters
import importlib

from nltk import word_tokenize

In [2]:
qfile = open('question_train.csv', 'r')
qreader = csv.reader(qfile)

qfile.seek(0);
#cols = list(zip(np.arange(21),next(reader)))
j = 0
for i, q in zip(np.arange(21), next(qreader)):
    if j == 3: l = "\n"; j = 0;
    else: l = ""; j += 1
        
    print('{0:2}: {1:25}'.format(i,q), end=l)

 0: question_id               1: user_id                   2: sms_guru_id               3: category_main_id         
 4: question                  5: description               6: tags                      7: categories               
 8: url                       9: rating_count_positive    10: rating_count_negative    11: answer_count             
12: reported                 13: answered                 14: active                   15: deleted                  
16: seo_locked               17: editor_locked            18: editor_id                19: created_at               
20: updated_at               

# Reading out questions and tokenizing, checking vocabulary

In [39]:
cats = categories()

qcatfile = open('question_category_train.csv', 'r')
qcatreader = csv.reader(qcatfile)

next(qcatreader) # skipping column discription

qcat_dict = {} # mapping from question_id to the parent category_id
cat_freq = nltk.FreqDist() # maps the category_id to its

for qcat in qcatreader:
    cat_id = int(qcat[1])
    pcat_id = cats.parent_id(cat_id)
    q_id = int(qcat[2])
    
    qcat_dict[q_id] = pcat_id
    cat_freq[cats.name(cat_id)] += 1
    
cat_freq.plot()

In [13]:
importlib.reload(filters)

qfile = open('question_train.csv', 'r')
qreader = csv.reader(qfile)

qfile.seek(0);
next(qreader)

questions = []
vocabulary = {}
vocabulary['all'] = nltk.FreqDist()
for cat_name in cats.all_names(): vocabulary[cat_name] = nltk.FreqDist()

# Set this parameter to TRUE if you want to read through
# all questions again, elsewise from file set to FALSE.
NewRead = False 

if NewRead:
    for row in qreader:
        if len(row) == 21:
            if int(row[0]) in qcat_dict.keys():
                cat_id = qcat_dict[int(row[0])]
                
                sentence = row[4].lower()
                # running a sequence of filters on the raw question string 
                for filt in [filters.punctuation_filter]:
                    sentence = filt(sentence)
                
                words = word_tokenize(sentence)
                # running a sequence of filtes on the already tokenized sentence
                for filt in [filters.year_tracker, filters.small_word_filter, filters.stemming_filter]:
                    words = filt(words)
                
                questions += [{"words": words, "cat_id": cat_id}]
                vocabulary[ cats.name(cat_id) ] += nltk.FreqDist(words)
                vocabulary['all'] += nltk.FreqDist(words)
        
    ## Saving into pickle files
    q_file, v_file = open('questions.pkl', 'wb'), open('vocabulary.pkl', 'wb')
    pickle.dump(questions, q_file)
    pickle.dump(vocabulary, v_file)
    
else:
    ## Loading from pickle files
    q_file, v_file = open('questions.pkl', 'rb'), open('vocabulary.pkl', 'rb')
    questions, vocabulary = pickle.load(q_file), pickle.load(v_file)

In [43]:
print(cats.all_names())
#print(vocabulary['computer_and_pc'].most_common(20))
#print(list(vocabulary['computer_and_pc'])[:20])

['film_and_musik', 'stars_and_promis', 'computer_and_pc', 'alltag', 'namensforschung', 'literatur_and_sprache', 'schule', 'mensch_and_koerper', 'freizeit_and_sport', 'wissen', 'liebe_and_beziehung', 'astrologie', 'games_and_spiele', 'adult']


In [62]:
import progressbar
information_gain = {}

bar = progressbar.ProgressBar()
for w in bar(vocabulary['all']):
    for cat in cats.all_names():
        information_gain[(w, cat)] = 0
        if vocabulary[cat].freq(w) > 0:
            information_gain[(w, cat)] = vocabulary[cat].freq(w) * cat_freq.freq(cat) \
                                        * np.log( vocabulary[cat].freq(w) / vocabulary['all'].freq(w) )
        if vocabulary[cat].freq(w) < 1:
            information_gain[(w, cat)] += (1 - vocabulary[cat].freq(w)) * cat_freq.freq(cat) \
                                        * np.log( (1 - vocabulary[cat].freq(w)) / (1 -vocabulary['all'].freq(w)) )

100% (14640 of 14640) |###################| Elapsed Time: 0:05:18 Time: 0:05:18


In [111]:
import progressbar
# ten best words by information_gain
vocab_ig = set()

bar = progressbar.ProgressBar()
for cat in bar(cats.all_names()):
    lis = [(information_gain[(w,cat)], w) for w in vocabulary['all']]
    lis.sort()
    vocab_ig = vocab_ig.union([w for _, w in lis[-100:]])
    
print(len(vocab_ig))

feature_set_ig = [(simple_features(vocab_ig, q), cats.name(q['cat_id'])) for q in questions]
train_set_ig, test_set_ig = feature_set_ig[:10000], feature_set_ig[10000:]
classifier_ig = nltk.NaiveBayesClassifier.train(train_set_ig)

print('classifier done')

100% (14 of 14) |#########################| Elapsed Time: 0:00:00 Time: 0:00:00


794
classifier done


In [112]:
nltk.classify.accuracy(classifier_ig, test_set_ig)

0.5921231326392032

In [10]:
def simple_features(vocab, question):
    features = {}
    for v in vocab:
        features[v] = v in question['words']
    return features

In [113]:
M = 150 # how much of the most common words one should take
vocab = set()
for cat_name in cats.all_names():
    words = [w for w,_ in vocabulary[cat_name].most_common(M)]
    vocab = vocab.union(words)

# words that appear in the most common words of more the #x=10 categories
# are denoted as stoppwords.
stopwords = set()
for cat_names in itertools.combinations( cats.all_names(), 10):
    cat_names = iter(cat_names)
    sub_stops = set([w for w, f in vocabulary[next(cat_names)].most_common(M)])
    for cat_name in cat_names:
        sub_stops = sub_stops.intersection( set([w for w, f in vocabulary[cat_name].most_common(M)]) )
    stopwords = stopwords.union(sub_stops)

#vocab = vocab.symmetric_difference(stopwords)

training_set_size, test_set_size = 5000, 1000

feature_set = [(simple_features(vocab, q), cats.name(q['cat_id'])) for q in questions]
train_set, test_set = feature_set[:training_set_size], feature_set[test_set_size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [98]:
len(vocab), len(stopwords)

(712, 63)

In [114]:
nltk.classify.accuracy(classifier, test_set)

KeyboardInterrupt: 

In [18]:
res = [classifier.classify(q) for q, _ in test_set]
indeed = [c for _, c in test_set]

cm = nltk.ConfusionMatrix(indeed, res)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=12))

                      |                                                              l                      |
                      |                                                              i                      |
                      |                                                l             t                      |
                      |                                  f      m      i             e                      |
                      |                                  r      e      e             r                      |
                      |                    s             e      n      b             a                      |
                      |             n      t             i      s      e             t                      |
                      |             a      a             z      c      _      f      u                      |
                      |             m      r             e      h      a      i      r                      |
          

# Checking Vocabulary

In [22]:
totNum = vocabulary.N()
vocNum = vocabulary.B()
print("""The total number of words is {0}. 
The volume of the vocabulary is {1}.
That makes an percentiage of {2:.2}""".format(totNum,vocNum,vocNum/totNum))

The total number of words is 126685. 
The volume of the vocabulary is 18236.
That makes an percentiage of 0.14


In [75]:
allwordlist = [w for q in questions for w in q[1]]
fd = nltk.FreqDist(allwordlist)

In [102]:
n = 50
delta = 0
rows = 5
epc = int(n/rows)
a = np.arange(n) + delta

for j in range(epc):
    row = [fd.most_common(n)[i] for i in a[j::epc]]
    for v in row:
        print(("{0:>10} x {1:<6}").format(*v),end="")
    print("")

         ? x 11576   bedeutet x 1465       kommt x 947           '' x 649         oder x 506   
       der x 4693         das x 1386          wo x 879       welche x 642         mein x 481   
       wie x 3999         hat x 1351         wer x 810         gibt x 593          bei x 478   
       was x 3919         von x 1260        eine x 781          den x 587         viel x 455   
       ist x 3470         ein x 1135         mit x 769           er x 574         wenn x 452   
       die x 2366          es x 1112        kann x 749          für x 562         welt x 430   
       und x 2255         man x 1075       woher x 728        heißt x 554          sie x 423   
       ich x 2011           , x 1061         auf x 701         sind x 548        haben x 415   
        in x 1717           . x 1004       warum x 687           im x 542        viele x 415   
      name x 1652        wann x 997           am x 683           zu x 530          aus x 380   


In [104]:
vocabArr = sorted(list(vocab))

n = 50
delta = 2000
rows = 5
epc = int(n/rows)
a = np.arange(n) + delta

for j in range(epc):
    row = [vocabArr[i] for i in a[j::epc]]
    print(("- {:20}"*len(row)).format(*row))

- anschlag            - anstecken           - antikatalytische    - antonin             - antwortetst         
- anschliesend        - ansteckend          - antike              - antonio             - anubis              
- anschließen         - anstellen           - antimaterie         - antony              - anunis              
- anschlägt           - antarktis           - antipinoxe          - antreten            - anus                
- anschrift           - anteil              - antisemetismus      - antrieb             - anwalt              
- anschwillt          - antenne             - antisemitismus      - antrittsvorlesung   - anwendungen         
- ansehen             - antennenverhältnis  - antje               - anträge             - anwendungsgebiete   
- ansprechen          - anthony             - anton               - antwort             - anwesen             
- anspruch            - antibabypille       - antonella           - antworten           - anwesend            
-

# Playing area

In [125]:
import re
for q in questions[:10]:
    print([w for w in q[1] if re.search('^W',w)])

['Was']
['WIEVIEL']
['Was']
[]
[]
['Wie']
['Wie']
['Wie']
[]
['Was']


In [21]:
for x in range(1,11):
    print(repr(x).zfill(5), repr(x**2).ljust(5),repr(x**3).ljust(5))

00001 1     1    
00002 4     8    
00003 9     27   
00004 16    64   
00005 25    125  
00006 36    216  
00007 49    343  
00008 64    512  
00009 81    729  
00010 100   1000 


In [33]:
from nltk.corpus import names

In [42]:
type(names.words('male.txt'))

list

# Helps

In [128]:
a = np.arange(100)
[b[a[::3]-i] for i in range(3)]

[array([ 0,  3,  6,  9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48,
        51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99]),
 array([99,  2,  5,  8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47,
        50, 53, 56, 59, 62, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98]),
 array([98,  1,  4,  7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46,
        49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97])]

In [141]:
a = ['car', 'house', 'dor']
b = ['Cat', 'Dog', 'Mouse']
list(zip(*[a,b]))

[('car', 'Cat'), ('house', 'Dog'), ('dor', 'Mouse')]

# checking for ill-shaped lines

In [6]:
csvfile.seek(0);
i = 0 #count total row number
j = 0 #count false row number

for row in reader:
    i += 1
    if(len(row) != 21):
        #print('line size error in line {} \n'.format(reader.line_num))
        #print('line size is: {} \n'.format(len(row)))
        #print(*row)
        #print('\n')
        j += 1

print("""Total number of rows: {0}
Number of rows with wrong length: {1}:
    fraction: {2:.2f} %""".format(i,j,(j/i)*100))

Total number of rows: 16070
Number of rows with wrong length: 57:
    fraction: 0.35 %


In [33]:
re.findall(" {2,}", "Paul      Hager  er      hat  kein Geld.")

['      ', '  ', '      ', '  ']

In [34]:
lower("dfsdfadSADSFDF")

NameError: name 'lower' is not defined

In [82]:
from nltk import snowball

In [83]:
stemmer = snowball.GermanStemmer()

In [148]:
importlib.reload(filters)
sentence = "Hallo mein Name ist Paul ich bin 1992 gebohren"
filters.year_tracker(word_tokenize(sentence))

['Hallo',
 'mein',
 'Name',
 'ist',
 'Paul',
 'ich',
 'bin',
 'jahreszahl',
 'gebohren']

In [144]:
re.sub(r"^[1][89][0-9]{2}$","jahreszahl", "1993")

'jahreszahl'