# Sentiment-LDA

In [1]:
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import sentiwordnet as swn
st = PorterStemmer()

In [2]:
MAX_VOCAB_SIZE = 50000

In [3]:
def sampleFromDirichlet(alpha):
    return np.random.dirichlet(alpha)

def sampleFromCategorical(theta):
    theta = theta / np.sum(theta)
    return np.random.multinomial(1, theta).argmax()

def word_indices(wordOccurenceVec):
    for idx in wordOccurenceVec.nonzero()[0]:
        for i in range(int(wordOccurenceVec[idx])):
            yield idx

In [4]:
class SentimentLDAGibbsSampler:
    
    def __init__(self, numTopics, alpha, beta, gamma, numSentiments=2):
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.numTopics = numTopics
        self.numSentiments = numSentiments
        
    def processSingleReview(self, review, d=None):
        letters_only = re.sub('[^a-zA-Z]', ' ', review)
        words = letters_only.lower().split()
        stops = set(stopwords.words('english'))
        meaningful_words = [st.stem(w) for w in words if w not in stops]
        return ' '.join(meaningful_words)
    
    def processReviews(self, reviews, saveAs=None, saveOverride=False):
        import os
        import dill
        if not saveOverride and saveAs and os.path.isfile(saveAs):
            [wordOccurenceMatrix, self.vectorizer] = dill.load(open(saveAs, 'r'))
            return wordOccurenceMatrix
        processed_reveiws = []
        for i, review in enumerate(reviews):
            if (i + 1) % 1000 == 0:
                print(' Review {} of {}'.format(i + 1, len(reviews)))
            processed_reviews.append(self.processSingleReview(review, i))
        self.vectorizer = CountVectorizer(analyzer='word',
                                          tokenizer=None,
                                          preprocessor=None,
                                          stop_words='english',
                                          max_features=MAX_VOCAB_SIZE)
        train_data_features = self.vectorizer.fit_transform(processed_reviews)
        wordOccurenceMatrix = train_data_features.toarray()
        if saveAs:
            dill.dump([wordOccurenceMatrix, self.vectorizer], open(saveAs, 'w'))
        return wordOccurenceMatrix
    
    def _initialize_(self, reviews, saveAs=None, saveOverride=False):
        self.wordOccurenceMatrix = self.processReviews(reviews, saceAs, saveOverride)
        numDocs, vocabSize = self.wordOccurenceMatrix.shape
        
        # Pseudocounts
        self.n_dt = np.zeros((numDocs, self.numTopics))
        self.n_dfs = np.zeros((numDocs, self.numTopics, self.numSentiments))
        self.n_d = np.zeros((numDocs))
        self.n_vts = np.zeros((vocabSize, self.numTopics, self.numSentiments))
        self.n_ts = np.zeros((self.numTopics, self.numSentiments))
        self.topics = {}
        self.sentiments = {}
        self.priorSentiment = {}
        
        alphaVec = self.alpha * np.ones(self.numTopics)
        gammaVec = self.gamma * np.ones(self.numSentiments)
        
        # 감정 사전 (SentiwordNet)을 사용하여 사전 확률을 조작 중.
        for i, word in enumerate(self.vectorizer.get_feature_names()):
            synsets = swn.senti_synsets(word)
            posScore = np.mean([s.pos_score() for s in synsets])
            negScore = np.mean([s.neg_score() for s in synsets])
            if posScore >= 0.1 and posScore > negScore:
                self.priorSentiment[i] = 1
            elif negScore >= 0.1 and negScore > posScore:
                self.priorSentiment[i] = 0
        
        for d in range(numDocs):
            topicDistribution = sampleFromDirichlet(alphaVec)
            sentimentDistribution = np.zeros((self.numTopics, self.numSentiments))
            for t in range(self.numTopics):
                sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)
            for i, w in enumerate(word_indices(self.wordOccurenceMatrix[d, :])):
                t = sampleFromCategorical(topicDistribution)
                s = sampleFromCategorical(sentimentDistribution[t, :])
                
                self.topics[(d, i)] = t
                self.sentiments[(d, i)] = s
                self.n_dt[d, t] += 1
                self.n_dts[d, t, s] += 1
                self.n_d[d] += 1
                self.n_vts[w, t, s] += 1
                self.n_ts[t, s] += 1
                
    def conditionalDistribution(self, d, v):
        probabilites_ts = np.ones((self.numTopics, self.numSentiments))
        firstFactor = (self.n_dt[d] + self.alpha) / \
                (self.n_d[d] + self.numTopics * self.alpha)
        secondFactor = (self.n_dts[d, :, :] + self.gamma) / \
                (self.n_dt[d, :] + self.numSentiments * self.gamma)[:, np.newaxis]
        thirdFactor = (self.n_vts[v, :, :] + self.beta) / \
                (self.n_ts + self.n_vts.shape[0] * self.beta)
        probabilites_ts *= firstFactor[:, np.newaxis]
        probabilites_ts *= secondFactor * thirdFactor
        probabilites_ts /= np.sum(probabilites_ts)
        return probabilites_ts
    
#     def getTopKWordsByLikelihood(self, K):
#         pseudocounts = np.copy(self.n_vts)
#         normalizer = np.summ(pseudocounts, (1, 2))
#         pseudocounts /= normalizer[:, np.newaxis, np.newaxis]
#         for t in range(self.numTopics):
#             for s in range(self.numSentiments):
#                 topWordIndices = pseudocounts[:, t, s].argsort()[-1:-(K+1):-1]
#                 vocab = self.vectorizer.get_feature_names()
#                 print(t, s, [vocab[i] for i in topWordIndices])
                
#     def getTopWords(self, K):
#         pseudocounts = np.copy(self.n_vts)
#         normalizer = np.summ(pseudocounts, (0))
#         pseudocounts /= normalizer[np.newaxis, :, :]
#         for t in range(self.numTopics):
#             for s in range(self.numSentiments):
#                 topWordIndices = pseudocounts[:, t, s].argsort()[-1:-(K+1):-1]
#                 vocab = self.vectorizer.get_feature_names()
#                 print(t, s, [vocab[i] for i in topWordIndices])
                
    def run(self, reviews, maxIters=30, saveAs=None, saveOverride=False):
        self._initialize_(reviews, saveAs, saveOverride)
        numDocs, vocabSize = self.wordOccurenceMatrix.shape
        for iteration in range(maxIters):
            print('Starting iteration {} of {}'.format(iteration + 1, maxIters))
            for d in range(numDocs):
                for i, v in enumerate(word_indices(self.wordOccurenceMatrix[d, :])):
                    t = self.topics[(d, i)]
                    s = self.sentiments[(d, i)]
                    self.n_dt[d, t] -= 1
                    self.n_d[d] -= 1
                    self.n_dts[d, t, s] -= 1
                    self.n_vts[v, t, s] -= 1
                    self.n_ts[t, s] -= 1
                    
                    probabilites_ts = self.conditionalDistribution(d, v)
                    if v in self.priorSentiment:
                        s = self.priorSentiment[v]
                        t = sampleFromCategorical(probabilites_ts[:, s])
                    else:
                        ind = sampleFromCategorical(probabilites_ts.flatten())
                        t, s = np.unravel_index(ind, probabilites_ts.shape)
                    
                    self.topics[(d, i)] = t
                    self.sentiments[(d, i)] = s
                    self.n_dt[d, t] += 1
                    self.n_d[d] += 1
                    self.n_dts[d, t, s] += 1
                    self.n_vts[v, t, s] += 1
                    self.n_ts[t, s] += 1

In [5]:
[i.neg_score() for i in swn.senti_synsets('sadness')]

[0.75, 0.625, 0.875]

In [6]:
import pandas as pd
df = pd.read_csv('abcnews-date-text-Copy1.csv')

In [7]:
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [8]:
df.shape

(1103663, 2)

In [9]:
df = df.sample(5000)

In [10]:
reviews = list(df['headline_text'].values)

`self.__init__`

In [11]:
# __init__
numTopics = 4
alpha = 1
beta = 1
gamma = 1
numSentiments=8

`self.run`

>`self._initialize`

>> `self.processReviews`

In [12]:
import os
# 한국어 stopwords는 없네여..
os.listdir('C:\\Users\\Affinity\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'.replace('\\', '/'))

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'README',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [13]:
def processSingleReview(review, d=None):
    letters_only = re.sub('[^a-zA-Z]', ' ', review)
    words = letters_only.lower().split()
    stops = set(stopwords.words('english'))
    meaningful_words = [st.stem(w) for w in words if w not in stops]
    return ' '.join(meaningful_words)
    
    
st = PorterStemmer() # 정밀하게 설계되어 정확도가 높음
                     # 영어 자연어 처리에서 어간 추출을 하고자 할 때 가장 준수한 선택
processed_reviews = []
for i, review in enumerate(reviews):
    if (i+1) % 100 == 0:
        print('Review {} of {}'.format(i+1, len(reviews)))
    processed_reviews.append(processSingleReview(review, i))

Review 100 of 5000
Review 200 of 5000
Review 300 of 5000
Review 400 of 5000
Review 500 of 5000
Review 600 of 5000
Review 700 of 5000
Review 800 of 5000
Review 900 of 5000
Review 1000 of 5000
Review 1100 of 5000
Review 1200 of 5000
Review 1300 of 5000
Review 1400 of 5000
Review 1500 of 5000
Review 1600 of 5000
Review 1700 of 5000
Review 1800 of 5000
Review 1900 of 5000
Review 2000 of 5000
Review 2100 of 5000
Review 2200 of 5000
Review 2300 of 5000
Review 2400 of 5000
Review 2500 of 5000
Review 2600 of 5000
Review 2700 of 5000
Review 2800 of 5000
Review 2900 of 5000
Review 3000 of 5000
Review 3100 of 5000
Review 3200 of 5000
Review 3300 of 5000
Review 3400 of 5000
Review 3500 of 5000
Review 3600 of 5000
Review 3700 of 5000
Review 3800 of 5000
Review 3900 of 5000
Review 4000 of 5000
Review 4100 of 5000
Review 4200 of 5000
Review 4300 of 5000
Review 4400 of 5000
Review 4500 of 5000
Review 4600 of 5000
Review 4700 of 5000
Review 4800 of 5000
Review 4900 of 5000
Review 5000 of 5000


In [14]:
processed_reviews[:10]

['forest plan delay blame conservationist',
 'sa dam search come empti',
 'mp give tick pulp mill tour',
 'vicroad revers stanc remembr day',
 'farmer group welcom new wheat varieti',
 'southern star nz',
 'hundr free subsidis train cours',
 'indigen youngster retrac step ancestor',
 'journalist defend hack media inquiri',
 'eagl lamb loss']

In [15]:
len(processed_reviews)

5000

In [16]:
vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=MAX_VOCAB_SIZE)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=50000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [17]:
train_data_features = vectorizer.fit_transform(processed_reviews)

In [18]:
train_data_features

<5000x6249 sparse matrix of type '<class 'numpy.int64'>'
	with 24703 stored elements in Compressed Sparse Row format>

In [301]:
wordOccurenceMatrix = train_data_features.toarray()
print(wordOccurenceMatrix, wordOccurenceMatrix.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] (5000, 6267)


In [365]:
wordOccurenceMatrix[0, :].nonzero()[-1]

array([1633, 2486, 2687], dtype=int64)

>> 가로 닫고~!

In [302]:
numDocs, vocabSize = wordOccurenceMatrix.shape
print(numDocs, vocabsize)

5000 2421


In [303]:
# pseudocounts
n_dt = np.zeros((numDocs, numTopics))
n_dts = np.zeros((numDocs, numTopics, numSentiments))
n_d = np.zeros((numDocs))
n_vts = np.zeros((vocabSize, numTopics, numSentiments))
n_ts = np.zeros((numTopics, numSentiments))
topics = {}
sentiments = {}
priorSentiment = {}

alphaVec = alpha * np.ones(numTopics)
gammaVec = gamma * np.ones(numSentiments)

n_dt.shape, n_dts.shape, n_d.shape, n_vts.shape, n_ts.shape, alphaVec.shape, gammaVec.shape

((5000, 4), (5000, 4, 8), (5000,), (6267, 4, 8), (4, 8), (4,), (8,))

In [304]:
len(vectorizer.get_feature_names())

6267

In [305]:
for i, word in enumerate(vectorizer.get_feature_names()):
    synsets = swn.senti_synsets(word)
    posScore = np.mean([s.pos_score() for s in synsets])
    negScore = np.mean([s.neg_score() for s in synsets])
    if np.isnan(posScore): posScore = 0
    if np.isnan(negScore): negScore = 0
    if posScore >= 0.1 and posScore > negScore:
        priorSentiment[i] = 1
    elif negScore >= 0.1 and negScore > posScore:
        priorSentiment[i] = 0

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [306]:
for d in range(numDocs):
    topicDistribution = sampleFromDirichlet(alphaVec)
    sentimentDistribution = np.zeros((numTopics, numSentiments))
    for t in range(numTopics):
        sentimentDistribution[t, :] = sampleFromDirichlet(gammaVec)
    for i, w in enumerate(word_indices(wordOccurenceMatrix[d, :])):
        t = sampleFromCategorical(topicDistribution)
        s = sampleFromCategorical(sentimentDistribution[t, :])
        
        topics[(d, i)] = t
        sentiments[(d, i)] = s
        n_dt[d, t] += 1
        n_dts[d, t, s] += 1
        n_d[d] += 1
        n_vts[w, t, s] += 1
        n_ts[t, s] += 1

> 가로 닫고~!

In [308]:
numDocs, vocabSize = wordOccurenceMatrix.shape

In [310]:
maxIters = 30

In [341]:
def conditionalDistribution(d, v):
    probabilities_ts = np.ones((numTopics, numSentiments))
    firstFactor = (n_dt[d] + alpha) / \
        (n_d[d] + numTopics * alpha)
    secondFactor = (n_dts[d, :, :] + gamma) / \
        (n_dt[d, :] + numSentiments * gamma)[:, np.newaxis]
    thirdFactor = (n_vts[v, :, :] + beta) / \
        (n_ts + n_vts.shape[0] * beta)
    probabilities_ts *= firstFactor[:, np.newaxis]
    probabilities_ts *= secondFactor * thirdFactor
    probabilities_ts /= np.sum(probabilities_ts)
    return probabilities_ts

for iteration in range(maxIters):
    print('Starting iteration {} of {}'.format(iteration + 1, maxIters))
    for d in range(numDocs):
        for i, v in enumerate(word_indices(wordOccurenceMatrix[d, :])):
            t = topics[(d, i)]
            s = sentiments[(d, i)]
            n_dt[d, t] -= 1
            n_d[d] -= 1
            n_dts[d, t, s] -= 1
            n_vts[v, t, s] -= 1
            n_ts[t, s] -= 1

            probabilities_ts = conditionalDistribution(d, v)
            if v in priorSentiment:
                s = priorSentiment[v]
                t = sampleFromCategorical(probabilities_ts[:, s])
            else:
                print('here')
                ind = sampleFromCategorical(probabilities_ts.flatten())
                t, s = np.unravel_index(ind, probabilities_ts.shape)

            topics[(d, i)] = t
            sentiments[(d, i)] = s
            n_dt[d, t] += 1
            n_d[d] += 1
            n_dts[d, t, s] += 1
            n_vts[v, t, s] += 1
            n_ts[t, s] += 1

Starting iteration 1 of 30
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
her

here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


KeyboardInterrupt: 