# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Sep  1 13:59:36 2022

@author: jinnie shin 
"""

import nltk 
import json 
import string 
import numpy as np 
import pandas as pd 
import get_text
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def word_sentiment(files):
    """
    Collect the sentiment-neutral word tokens of a corpus.

    The documents are lower-cased, tokenized with ``nltk.word_tokenize`` and
    every token is scored on its own with VADER. A token counts as neutral
    when its compound score lies strictly between -0.4 and 0.4.

    Parameters
    ----------
    files : str or iterable of str
        Document texts (e.g. one string per chapter). A single string is
        used as-is.

    Returns
    -------
    list of str
        The neutral tokens in corpus order, duplicates included.
    """
    # Join with a space: joining with '' can fuse the last token of one
    # document with the first token of the next into a single bogus token.
    corpus = files if isinstance(files, str) else ' '.join(files)
    tokens = nltk.word_tokenize(corpus.lower())
    sid = SentimentIntensityAnalyzer()

    # Score every distinct token once instead of (twice) per occurrence.
    compound_by_token = {
        token: sid.polarity_scores(token)['compound'] for token in set(tokens)
    }

    return [token for token in tokens if -0.4 < compound_by_token[token] < 0.4]

def sentiment_weights(files):
    """
    Build one sentiment-weight dictionary per chapter.

    For each chapter the mean absolute VADER compound score of its sentences
    is used as the chapter's sentiment intensity. Every term found by a
    default ``CountVectorizer`` fitted on the chapter's sentences is weighted
    by ``term count in the chapter * chapter intensity``. Terms that
    ``word_sentiment`` reports as neutral get weight 0.

    Parameters
    ----------
    files : iterable of str
        Chapter texts.

    Returns
    -------
    list of dict
        One ``{term: weight}`` mapping per chapter, in input order. A chapter
        without any sentence yields an empty mapping.
    """
    chapters = list(files)
    analyser = SentimentIntensityAnalyzer()
    # A set keeps the per-term neutrality check O(1); the list returned by
    # word_sentiment holds every neutral token occurrence of the corpus.
    neutral_words = set(word_sentiment(chapters))

    sentiment_wgt_dict = []
    for chapter in chapters:
        sentences = nltk.sent_tokenize(chapter)
        if not sentences:
            # Nothing to score or count; avoid a NaN mean and an empty fit.
            sentiment_wgt_dict.append({})
            continue

        chapter_score = float(np.mean(
            [abs(analyser.polarity_scores(sentence)['compound'])
             for sentence in sentences]
        ))

        vect = CountVectorizer(lowercase=True)
        X = vect.fit_transform(sentences)

        # get_feature_names() is gone from recent scikit-learn releases;
        # prefer its replacement and fall back where that does not exist yet.
        get_names = (getattr(vect, 'get_feature_names_out', None)
                     or vect.get_feature_names)

        # Scaling every cell by the chapter score and summing the column is
        # the same as scaling the column total, so no dense per-sentence
        # table is needed. Working on the totals also keeps a vocabulary term
        # spelled "sentiment" intact (a helper column of that name used to
        # overwrite it).
        term_totals = np.asarray(X.sum(axis=0)).ravel()

        chapter_weights = {}
        for term, total in zip(get_names(), term_totals):
            term = str(term)
            if term in neutral_words:
                chapter_weights[term] = 0
            else:
                chapter_weights[term] = float(total) * chapter_score

        sentiment_wgt_dict.append(chapter_weights)

    return sentiment_wgt_dict
    

def lemmatized_words(doc):
    """
    Tokenize ``doc`` with a CountVectorizer analyzer and lemmatize each token.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    generator of str
        WordNet-lemmatized tokens, produced lazily.
    """
    # Use a module-level ``stop_words`` when one is defined; otherwise fall
    # back to scikit-learn's English list instead of failing with NameError.
    try:
        active_stop_words = stop_words
    except NameError:
        active_stop_words = text.ENGLISH_STOP_WORDS

    lemmatizer = WordNetLemmatizer()
    analyzer = CountVectorizer(lowercase=True,max_df=0.65,min_df=0.1,stop_words=active_stop_words).build_analyzer()
    return (lemmatizer.lemmatize(w) for w in analyzer(doc))

def weighted_vectorize(files, *, max_df=0.352, min_df=0.1, weight_scale=10):
    """
    Count-vectorize the documents for the main model and apply sentiment weights.

    The per-document weight dictionaries from ``sentiment_weights`` are
    recomputed on every call (an earlier, disabled variant loaded them from
    'sentiment_wgt.json'). Each document's term counts are multiplied by
    ``weight * weight_scale`` for terms present in that document's weight
    dictionary, and by 1 for all other terms.

    Parameters
    ----------
    files : iterable of str
        Document texts (e.g. one string per chapter).
    max_df, min_df : float or int, keyword-only
        Document-frequency bounds passed to ``CountVectorizer``.
    weight_scale : float, keyword-only
        Factor applied to every sentiment weight before it scales a count.

    Returns
    -------
    weighted_X : numpy.ndarray
        Array of shape (n_documents, n_terms) with the weighted counts.
    ivoc : dict
        Column index -> term.
    vocab : set
        All terms kept by the vectorizer.
    """
    files = list(files)
    sentiment_wgt_dict = sentiment_weights(files)

    additional_stopword = []  # TODO: custom stopwords can be added.

    # Recent scikit-learn releases validate ``stop_words`` and only accept a
    # list (or 'english' / None), so the frozenset is converted.
    stop_words = sorted(text.ENGLISH_STOP_WORDS.union(additional_stopword))

    vect = CountVectorizer(lowercase=True, max_df=max_df, min_df=min_df,
                           stop_words=stop_words)
    X = vect.fit_transform(files)
    term_index = vect.vocabulary_
    ivoc = {index: term for term, index in term_index.items()}
    vocab = set(term_index)

    # Densify once, not once per row.
    counts = X.toarray()
    weights = np.ones(counts.shape)
    for row in range(counts.shape[0]):
        for term, value in sentiment_wgt_dict[row].items():
            # Terms removed by the stop list or the df bounds have no column
            # and are skipped.
            column = term_index.get(term)
            if column is not None:
                weights[row, column] = value * weight_scale

    weighted_X = counts * weights

    return weighted_X, ivoc, vocab

# these are supposed to be considered the auxiliary LDA 
def show_topics(lda,ivoc,number_words=10,topics=range(10)):
    """
    Print the highest-weighted terms of selected topics.

    ``lda.components_`` is walked topic by topic; for every zero-based topic
    index contained in ``topics`` one line is printed with the one-based
    topic number and the list of its ``number_words`` top terms, looked up
    through ``ivoc`` (column index -> term).
    """
    for topic_index, loadings in enumerate(lda.components_):
        if topic_index not in topics:
            continue
        # Column indices ordered from the largest loading to the smallest.
        ranked_columns = loadings.argsort()[::-1][:number_words]
        top_terms = [str(ivoc[column]) for column in ranked_columns]
        print(topic_index + 1, top_terms)
  
def print_top_words(model, feature_names, n_top_words):
    """
    Print every topic of ``model`` with its strongest terms and their weights.

    For each row of ``model.components_`` a header with the one-based topic
    number is printed, followed by one line listing the ``n_top_words``
    highest-weighted entries as ``<term> <weight rounded to 2 decimals> | ``.
    """
    for topic_number, loadings in enumerate(model.components_, start=1):
        print('\nTopic Number.%d:' % int(topic_number))

        # Indices of the largest loadings, strongest first.
        strongest = loadings.argsort()[:-n_top_words - 1:-1]
        entries = []
        for column in strongest:
            rounded = str(round(loadings[column], 2))
            entries.append(feature_names[column] + ' ' + rounded + ' | ')
        print(''.join(entries))
        
def sample_window(seq, window_size = 500, stride = 20):
    """
    Lazily yield overlapping slices of ``seq``.

    A window of at most ``window_size`` items starts at every ``stride``-th
    position; windows near the end of the sequence are shorter because the
    slice is simply cut off at the end of ``seq``.
    """
    window_starts = range(0, len(seq), stride)
    yield from (seq[start : start + window_size] for start in window_starts)
        
def merge(seq, stride = 4):
    """
    Lazily yield consecutive, non-overlapping groups of ``seq``.

    Each group is a slice of up to ``stride`` items; the final group holds
    whatever is left over.
    """
    total = len(seq)
    for group_start in range(0, total, stride):
        group_stop = group_start + stride
        yield seq[group_start:group_stop]
        
def clean_text(text):
    """
    Trim the first and/or last sentence of ``text`` and re-join the rest.

    The text is split with ``nltk.sent_tokenize``. The first sentence is kept
    only when it is entirely upper-case; the last sentence is kept only when
    the whole sentence is found inside ``string.punctuation``. The remaining
    sentences are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to trim. (Inside this function the name shadows the module-level
        ``text`` import.)

    Returns
    -------
    str
        The joined sentences, or '' when nothing is left (including empty
        input, which previously raised IndexError).
    """
    sentences = nltk.sent_tokenize(text)

    # NOTE(review): isupper() tests the whole sentence, not its first
    # character -- confirm that this is the intended check.
    if sentences and not sentences[0].isupper():
        sentences = sentences[1:]

    # NOTE(review): this is a substring test of the entire sentence against
    # string.punctuation, not a test of its final character -- confirm.
    if sentences and sentences[-1] not in string.punctuation:
        sentences = sentences[:-1]

    return ' '.join(sentences)



