# Article Classifier

**By: Jennifer Lin**

Classifies imported articles as either politics or sports based on word associations and co-occurrences in the documents.

In [2]:
#reads the texts in the ps2data file
ptexts=[open('ps2data/p{}.txt'.format(i)).read() for i in range(30)]
stexts=[open('ps2data/s{}.txt'.format(i)).read() for i in range(30)]
testtexts=[open('ps2data/test{}.txt'.format(i)).read() for i in range(10)]

In [3]:
#separates all the text in an article into 1 word, lowercase, and put in a list
import re
re.findall("[a-z0-9']+",ptexts[0].lower())

In [4]:
#only counts each new word rather so don't recount words that appear multiple times in one document
import numpy as np
from collections import Counter, defaultdict
Counter(set(re.findall("[a-z0-9']+",ptexts[0].lower())))

In [5]:
#number of counts of words in the training set
n_p = np.sum([Counter(set(re.findall("[a-z0-9']+",txt.lower()))) for txt in ptexts])
n_s = np.sum([Counter(set(re.findall("[a-z0-9']+",txt.lower()))) for txt in stexts])

In [6]:
#top 10 words with most frequent appearances in both the political + sports documents, 30 means they appeared in all 30 documents
n_p.most_common(30), n_s.most_common(30)

In [7]:
#shows the 85th to 94th most frequently seen words in the political documents, appearing 17 times
n_s.most_common()[85:94]

In [18]:
def bayes_classifier(txt):
    #finds the word count in txt
    word_counts=Counter(re.findall("[a-z0-9']+",txt.lower()))
    
    keywords=[w for w,c in word_counts.most_common() if n_p[w]< 26 and n_s[w]<26][:30]
    top5 = '(based on "{} ...")'.format(', '.join(keywords[:5]))
    for word in keywords:
        #smooths data so words that haven't been seen before don't get disregarded
        if word not in n_p: n_p[word]=.5
        if word not in n_s: n_s[word]=.5
        
        #p(word|politics) and p(word|sports) with 30 political and sports documents each
        pwordprob = n_p[word]/30
        swordprob = n_s[word]/30

        #p(politics) and p(sports)
        politicprob = 30/60
        sportsprob = 30/60
        
        #finds p(politics|word)
        prob = ((pwordprob)*(politicprob))/((pwordprob)*(politicprob)+(swordprob)*(sportsprob))
        
        #if greater than 50%, most likely can be classified as political document
        if prob >= 0.5: return "politics", prob
        else: return "sports", prob
        
for i in range(10):
    print ('test{}.txt:'.format(i),bayes_classifier(testtexts[i]))

test0.txt: ('politics', 0.9333333333333333)
test1.txt: ('sports', 0.3333333333333333)
test2.txt: ('sports', 0.43181818181818177)
test3.txt: ('politics', 0.9677419354838709)
test4.txt: ('politics', 0.9230769230769231)
test5.txt: ('politics', 0.5)
test6.txt: ('sports', 0.25)
test7.txt: ('sports', 0.2)
test8.txt: ('sports', 0.3333333333333333)
test9.txt: ('politics', 0.6666666666666666)


In [9]:
# list of all co-occurrences
co=defaultdict(int)

#combines documents 
allDocs = ptexts+stexts
for txt in allDocs[0]:
    #searches unique co-occurrences per document
    words = set(re.findall("[a-z']+", txt.lower()))
    for w0 in words:
        for w1 in words:
           if w0 < w1: co[(w0,w1)] +=1
#all pairs of words and number of co-occurrences throughout all 60 documents
#list(co.items())

In [10]:
#check words that appear only 30 times across all documents
total = n_p + n_s
for key in total:
    if (total[key] == 30): 
        print(key, total[key])

way 30
make 30
say 30
why 30
off 30
where 30
need 30


In [11]:
#check words that appear only 15 times across all documents
for key in total:
    if (total[key] == 15): 
        print(key, total[key])

among 15
until 15
position 15
month 15
rather 15
administration 15
1 15
less 15
aren't 15
defense 15
either 15
show 15
5 15
football 15
man 15
night 15
advertising 15
sharp 15
true 15
forward 15
recent 15
fully 15
they're 15
journalists 15
online 15
field 15
action 15
high 15


In [12]:
#looks for pair of words that only co-occur 15 times but occur 30 times alone
for pair in co:
    if (total[pair[0]] == 30 and total[pair[1]] == 30 and co[pair] == 15): 
        print(pair, co[pair])

#### This identifies pairs of words that occur in exactly half the documents and their probability of co-occurrence is independent (i.e. occurring in 30 documents as individual words but co-occurring in exactly 15 documents).

In [13]:
#contains words that only appear 20 times (1/3 of the 60 documents)
co1 = []
#contains words that only appear 15 (1/4 of the 60 documents)
co2 = []
for key in total:
    if (total[key] == 20): 
      co1.append(key)
    if (total[key] == 15):
      co2.append(key)

#contains words that either appear 20 or 15 times but don't co-occur
disjoint = []
for key in co1:
    for word in co2:
        #if don't co-occur, number of co-occurrences for that pair is 0
        if (co[(key, word)] == 0 or co[(word,key)]==0): 
            disjoint.append((key, word))
print(disjoint)
    

[('believe', 'among'), ('believe', 'until'), ('believe', 'position'), ('believe', 'month'), ('believe', 'rather'), ('believe', 'administration'), ('believe', '1'), ('believe', 'less'), ('believe', "aren't"), ('believe', 'defense'), ('believe', 'either'), ('believe', 'show'), ('believe', '5'), ('believe', 'football'), ('believe', 'man'), ('believe', 'night'), ('believe', 'advertising'), ('believe', 'sharp'), ('believe', 'true'), ('believe', 'forward'), ('believe', 'recent'), ('believe', 'fully'), ('believe', "they're"), ('believe', 'journalists'), ('believe', 'online'), ('believe', 'field'), ('believe', 'action'), ('believe', 'high'), ('united', 'among'), ('united', 'until'), ('united', 'position'), ('united', 'month'), ('united', 'rather'), ('united', 'administration'), ('united', '1'), ('united', 'less'), ('united', "aren't"), ('united', 'defense'), ('united', 'either'), ('united', 'show'), ('united', '5'), ('united', 'football'), ('united', 'man'), ('united', 'night'), ('united', 'ad

#### This identifies pairs of words that occur in exactly 1/3 of the documents, the other in exactly 1/4 of the documents, and which do not co-occur.

In [14]:
#co-occur 12 times (1/5 of 60) but occur 30 (1/2 of 60) or 15 (1/4 of 60) times alone
for pair in co:
    if (total[pair[0]] == 30 and total[pair[1]] == 15 and co[pair] == 12): 
        print(pair)
    elif (total[pair[0]] == 15 and total[pair[1]] == 30 and co[pair] == 12):
        print(pair)

In [15]:
(1/5)>(1/2)*(1/4)

True

#### This identifies pairs of words with the first word occurring in 1/2 the documents, second word in 1/4 of the documents, and co-occurring in 1/5 of the documents. 

In [16]:
#looks for words that occur 40 times (2/3 of 60)
for key in total:
    if (total[key] == 40): 
        print(key, total[key])

then 40
these 40
its 40
most 40


In [17]:
for pair in co:
    #looks for words that occur 40 times (2/3 of 60) and co-occur 26.6667 +/- 1 (60 * 4/9)
    if (total[pair[0]] == 40 and total[pair[1]] == 40):
        if (co[pair] == 26 or co[pair] == 27): 
            print(pair)