# 파이썬으로 텍스트 분류하기

### 텍스트로 단어 벡터 만들기

In [16]:
from numpy import * 

def loadDataSet():
    """Return the hard-coded toy corpus of message-board posts with labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenized posts (each a list of word strings); ``classVec`` is the
        parallel list of labels, 1 for an abusive post and 0 for a
        non-abusive one.
    """
    # Each row pairs a label with the post it belongs to; the sentence is
    # split on whitespace to obtain the token list.
    labelledPosts = (
        (0, 'my dog has flea problems help please'),
        (1, 'maybe not take him to dog park stupid'),
        (0, 'my dalmation is so cute I love him'),
        (1, 'stop posting stupid worthless garbage'),
        (0, 'mr licks ate my steak how to stop him'),
        (1, 'quit buying worthless dog food stupid'),
    )
    postingList = [sentence.split() for _, sentence in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec

def createVocaList(dataSet):
    """Collect every distinct word that occurs in any document.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list holding each distinct word exactly once. The list is built
        from a set, so its order is not specified.
    """
    uniqueWords = set()
    for doc in dataSet:
        # Fold this document's words into the running vocabulary.
        uniqueWords.update(doc)
    return list(uniqueWords)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in ``inputSet``, 0 elsewhere. Repeated words still yield 1.

    Side effects:
        Prints a notice for every input word that is not in the vocabulary.
    """
    # Build the word -> position map once instead of scanning vocabList twice
    # (``in`` and ``.index``) for every input word. setdefault keeps the
    # first position of a repeated vocabulary word, as list.index would.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            # Single-argument print() emits the same text on Python 2 and 3.
            print("the word : %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

In [11]:
listOPosts, listClasses = loadDataSet()
myVocabList = createVocaList(listOPosts)
print myVocabList

['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']


In [14]:
print setOfWords2Vec(myVocabList, listOPosts[0])

[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]


In [15]:
print setOfWords2Vec(myVocabList, listOPosts[3])

[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]


### 단어 벡터로 확률 계산 

In [36]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class log word probabilities for a two-class naive Bayes.

    Relies on ``ones``, ``log`` and ``sum`` being in scope (the file does
    ``from numpy import *``).

    Args:
        trainMatrix: sequence of document vectors, all the same length as
            the first one (one entry per vocabulary word).
        trainCategory: sequence of labels parallel to ``trainMatrix``. A
            label equal to 1 selects class 1; any other value is counted
            as class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect`` are
        arrays of log(word count / total count) for class 0 / class 1, and
        ``pAbusive`` is ``sum(trainCategory)`` divided by the number of
        documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)

    # Counts start at 1 and totals at 2.0, so no ratio is ever zero before
    # the log is taken.
    wordCounts = {0: ones(vocabSize), 1: ones(vocabSize)}
    wordTotals = {0: 2.0, 1: 2.0}

    for docIdx, docVec in enumerate(trainMatrix):
        cls = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[cls] += docVec
        wordTotals[cls] += sum(docVec)

    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive

In [39]:
listOPosts, listClasses = loadDataSet()

In [42]:
myVocabList = createVocaList(listOPosts)

In [43]:
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))

In [44]:
print len(trainMat); print listOPosts

6
[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]


In [45]:
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

In [46]:
pAb # 폭력적인 단어가 있는 문서의 확률

0.5

In [47]:
p0V

array([-2.56494936, -2.56494936, -2.56494936, -3.25809654, -3.25809654,
       -2.56494936, -2.56494936, -2.56494936, -3.25809654, -2.56494936,
       -2.56494936, -2.56494936, -2.56494936, -3.25809654, -3.25809654,
       -2.15948425, -3.25809654, -3.25809654, -2.56494936, -3.25809654,
       -2.56494936, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -2.56494936, -3.25809654, -2.56494936,
       -2.56494936, -1.87180218])

### 실제 조건반영을 위한 분류기 수정

In [52]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log-score for a document vector.

    Relies on ``sum`` and ``log`` being in scope (the file does
    ``from numpy import *``).

    Args:
        vec2Classify: document vector, multiplied element-wise with the
            per-class vectors.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1; class 0 uses ``1 - pClass1``.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Score = sum of the log probabilities of the words present + log prior.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

def testingNB():
    """Train on the toy corpus and print the classification of two samples.

    Builds the vocabulary and 0/1 training vectors from ``loadDataSet()``,
    trains with ``trainNB0`` and then, for each of the two hard-coded test
    documents, prints its word vector followed by the predicted class.

    Returns:
        None; the results are only printed.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocaList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    # One loop replaces the two copy-pasted classify-and-print stanzas.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(thisDoc)
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        # Formatted into one string so the output text is the same on
        # Python 2 and Python 3 (a multi-item print(...) would print a tuple
        # on Python 2).
        print("%s classified as :  %s" % (testEntry, predicted))
    

In [53]:
testingNB()

[0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
['love', 'my', 'dalmation'] classified as :  0
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
['stupid', 'garbage'] classified as :  1


### 중복 단어 문서 모델

In [54]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over the vocabulary.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document (repeats count).

    Returns:
        A list of ``len(vocabList)`` ints giving how many times each
        vocabulary word occurs in ``inputSet``. Words outside the vocabulary
        are ignored silently.
    """
    # Build the word -> position map once instead of scanning vocabList twice
    # (``in`` and ``.index``) for every input word. setdefault keeps the
    # first position of a repeated vocabulary word, as list.index would.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

# 스팸 이메일 분류

In [57]:
import re
mySent = "This book is the best book on Python or M.L. I have ever laid eyes upon."
# Split on runs of one or more non-word characters. '\W*' can also match the
# empty string, and since Python 3.7 re.split splits on empty matches too,
# which would cut the sentence into single characters.
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)

[tok.lower() for tok in listOfTokens if len(tok) > 0] # lower-case the tokens, keeping only non-empty ones

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [58]:
# Read one sample ham e-mail; the with-block closes the file handle as soon
# as the text has been read instead of leaving it to the garbage collector.
with open('email/ham/6.txt') as emailFile:
    emailText = emailFile.read()
listOfTokens = regEx.split(emailText)

In [62]:
def textParse(bigString):
    """Tokenize a string into lower-cased words longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of the lower-cased tokens, in order of appearance, obtained by
        splitting on runs of non-word characters and dropping every token of
        length 2 or less.
    """
    import re
    # '\W+' rather than '\W*': the latter can match the empty string, and
    # since Python 3.7 re.split also splits on empty matches, which would
    # break the text into single characters (all then dropped by the length
    # filter below).
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    """Hold-out test of the naive Bayes classifier on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (class 0), randomly sets
    aside 10 of the 50 documents as a test set, trains on the remaining 40
    with 0/1 word vectors and prints the fraction of test documents that were
    misclassified.

    Uses ``random`` and ``array`` from the module namespace (presumably
    ``numpy.random`` and ``numpy.array`` via the file's
    ``from numpy import *``).

    Returns:
        None; the error rate is only printed.
    """
    docList = []; classList = []
    for i in range(1, 26):
        # Same interleaved order as before: spam i, then ham i.
        for pathTemplate, label in (('email/spam/%d.txt', 1),
                                    ('email/ham/%d.txt', 0)):
            # The with-block closes each of the 50 files right after reading.
            with open(pathTemplate % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocaList(docList)

    # list(...) so that entries can be removed on Python 3 as well, where
    # range() is not a list.
    trainingSet = list(range(50)); testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the randomly chosen document index from training to test.
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [setOfWords2Vec(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    # Single formatted string: identical output text on Python 2 and 3.
    print("the error rate is :  %s" % (float(errorCount) / len(testSet)))

In [69]:
spamTest()

the error rate is :  0.0


# 개인광고에 포함된 지역 특색 도출하기

In [70]:
import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

###### 자주 발생하는 단어 제거

In [73]:
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words with their counts.

    Args:
        vocabList: iterable of (hashable) vocabulary words to rank.
        fullText: iterable of all word occurrences across the corpus.
        topN: how many entries to return at most (default 30).

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count and
        truncated to ``topN`` entries. Vocabulary words that never occur in
        ``fullText`` have count 0.
    """
    from collections import Counter
    import operator
    # Count every token in one pass instead of calling fullText.count() once
    # per vocabulary word.
    occurrences = Counter(fullText)
    freqDict = {}
    for token in vocabList:
        freqDict[token] = occurrences[token]
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

def localWords(feed1,feed0):
    """Train and hold-out test a naive Bayes classifier on two parsed feeds.

    Entry summaries of ``feed1`` are labelled class 1 and those of ``feed0``
    class 0 (the caller in this file passes the New York feed as ``feed1``).
    Only the first ``min(len(feed1 entries), len(feed0 entries))`` entries of
    each feed are used. The most frequent words reported by ``calcMostFreq``
    are dropped from the vocabulary, 20 randomly chosen documents are held
    out for testing, the rest train the classifier on word-count vectors, and
    the error rate on the held-out documents is printed.

    Uses ``random`` and ``array`` from the module namespace (presumably
    ``numpy.random`` and ``numpy.array`` via the file's
    ``from numpy import *``).

    Args:
        feed1: mapping with an ``'entries'`` list whose items have a
            ``'summary'`` string; treated as class 1.
        feed0: same structure; treated as class 0.

    Returns:
        ``(vocabList, p0V, p1V)``: the pruned vocabulary and the class-0 and
        class-1 log-probability vectors from ``trainNB0``.

    Note:
        The hold-out size is fixed at 20, so the two feeds must contribute
        more than 20 documents in total; otherwise drawing the test set or
        training on what is left fails.
    """
    # The feeds arrive already parsed, so the former function-local
    # ``import feedparser`` was unused and has been dropped.
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Same interleaved order as before: feed1 entry i, then feed0 entry i.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocaList(docList)
    # Remove the most frequent words from the vocabulary.
    frequentWords = set(pairW[0] for pairW in calcMostFreq(vocabList, fullText))
    vocabList = [word for word in vocabList if word not in frequentWords]

    # list(...) so that entries can be removed on Python 3 as well, where
    # range() is not a list.
    trainingSet = list(range(2 * minLen)); testSet = []
    for _ in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the randomly chosen document index from training to test.
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bagOfWords2VecMN(vocabList, docList[docIndex]) for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pClass1 = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pClass1) != classList[docIndex]:
            errorCount += 1
    # Single formatted string: identical output text on Python 2 and 3.
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V

In [75]:
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = localWords(ny,sf)

the error rate is:  0.5


# 분석 : 지역적으로 사용되는 단어 표현

In [76]:
def getTopWords(ny,sf):
    """Print the words with the highest log probability for each feed.

    Runs ``localWords(ny, sf)`` (which also prints an error rate), keeps
    every vocabulary word whose log probability in a class is above -6.0 and
    prints those words, most probable first, under an "SF" banner (class 0
    vector) and then under an "NY" banner (class 1 vector).

    Args:
        ny: parsed feed passed to ``localWords`` as class 1.
        sf: parsed feed passed to ``localWords`` as class 0.

    Returns:
        None; the words are only printed.
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # Walk the vocabulary and both probability vectors in lockstep.
    for word, logP0, logP1 in zip(vocabList, p0V, p1V):
        if logP0 > -6.0: topSF.append((word, logP0))
        if logP1 > -6.0: topNY.append((word, logP1))

    sections = (
        ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**", topSF),
        ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**", topNY),
    )
    for banner, pairs in sections:
        print(banner)
        # Highest log probability first.
        for item in sorted(pairs, key=lambda pair: pair[1], reverse=True):
            print(item[0])

In [77]:
getTopWords(ny,sf)

the error rate is:  0.35
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
guy
hello
some
nice
friends
enjoy
love
work
over
not
don
reading
job
come
decent
fun
off
other
female
time
having
here
others
great
club
join
cuddle
male
want
years
before
our
first
little
white
queer
earth
find
please
email
however
been
women
hoping
partner
person
hiking
bit
walk
night
people
santa
working
ladies
down
cultures
but
single
chat
book
really
all
chinese
month
children
send
mistakes
include
woman
mainstream
very
putting
asian
video
link
odd
even
what
giving
deeply
indian
learned
full
never
appeara
met
along
healthy
shift
social
usually
rafael
members
family
clara
county
ask
two
live
symphony
more
company
insights
hour
movies
learn
dirnks
give
share
hilton
pond
how
sunday
occasionally
intelligent
staying
funny
light
responsibility
pleasure
talk
minded
exclusive
disabled
committed
hell
still
perfect
anonymously
fit
late
willing
them
seeking
auto
practice
veggie
lunch
name
opera
raised
everyone
