In [None]:
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import nltk, string
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Fetch the NLTK data packages this notebook depends on: tokenizer models,
# WordNet, the POS tagger, the stop-word lists and the VADER lexicon.
for resource_name in (
    "punkt",
    "wordnet",
    "averaged_perceptron_tagger",
    "stopwords",
    "vader_lexicon",
):
    nltk.download(resource_name)

In [None]:
# Relative paths to the positive and negative word-list text files.
# NOTE(review): "POSTIVE" is a typo for "POSITIVE". The name is left unchanged
# because cells outside this section may reference it — rename everywhere in
# one pass if desired.
POSTIVE_WORDS_FILEPATH = "../../data/positive-words.txt"
NEGATIVE_WORDS_FILEPATH = "../../data/negative-words.txt"

## Sentiment Analysis - NLTK

### Step 1: Tokenization

In [None]:
def remove_punctuation(sentence: str) -> str:
    """Return *sentence* with every ASCII punctuation character removed.

    The characters dropped are exactly those listed in
    ``string.punctuation``; all other characters are kept in order.
    """
    return "".join(char for char in sentence if char not in string.punctuation)

def tokenize_chat_data(chat_dictionary, sentence=False):
    """Tokenize every chat message into sentences or into words.

    Args:
        chat_dictionary: Mapping whose values are the message strings.
            The keys are not used by this function.
        sentence: When true, split each message into sentences; otherwise
            (the default) split each message into word tokens.

    Returns:
        A list with one entry per message, in the mapping's iteration order.
        Each entry is a list of punctuation-free sentence strings (sentence
        mode) or a list of word tokens (word mode).
    """
    if sentence:
        tokenized_sentences = []
        for message in chat_dictionary.values():
            # The sentence tokenizer relies on punctuation to find sentence
            # boundaries, so split first and strip punctuation afterwards.
            # Stripping first would collapse each message into one sentence.
            cleaned = (remove_punctuation(part) for part in sent_tokenize(message))
            # Drop fragments that consisted only of punctuation/whitespace.
            tokenized_sentences.append([part for part in cleaned if part.strip()])
        return tokenized_sentences

    return [
        word_tokenize(remove_punctuation(message), "english")
        for message in chat_dictionary.values()
    ]

In [None]:
# Tokenize the chat twice: once per sentence and once per word.
# `chat_data_dictionary` is not defined in this section — presumably it comes
# from an earlier cell or a `%store -r`; verify before running in isolation.
token_sentence = tokenize_chat_data(chat_data_dictionary, sentence=True)
token_words = tokenize_chat_data(chat_data_dictionary)

#### Removing Stop Words

In [None]:
def remove_stopwords(tokenized_words_list):
    """Flatten tokenized messages into one list with English stop words removed.

    Args:
        tokenized_words_list: Iterable of token lists (one list per message).

    Returns:
        A single flat list of the tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # Load the stop-word corpus once and keep it in a set: re-reading it for
    # every token is needlessly slow, and set membership is O(1).
    english_stopwords = set(stopwords.words('english'))

    # NOTE(review): matching is case-sensitive, exactly as before; capitalised
    # tokens such as "The" are therefore kept. Lowercase upstream if that is
    # not intended.
    return [
        word
        for message_tokens in tokenized_words_list
        for word in message_tokens
        if word not in english_stopwords
    ]

In [None]:
# NOTE: despite its name, `stop_words` holds the tokens that REMAIN after the
# stop words were removed (flattened across all messages). The name is kept
# because later cells reference it.
stop_words = remove_stopwords(token_words)
# Occurrence count of each remaining token.
cleaned_final_words_counter = Counter(stop_words)

#### Word Frequency Distribution

In [None]:
def get_word_frequency(tokenized_words_list):
    """Flatten a list of token lists into one flat list of tokens.

    No counting happens here; the result is meant to be handed to a
    frequency counter. Order is preserved: tokens of the first message come
    first, then those of the second, and so on.
    """
    return [
        token
        for message_tokens in tokenized_words_list
        for token in message_tokens
    ]

In [None]:
# Build the frequency distribution from ALL word tokens (`token_words`),
# i.e. stop words are still included here.
word_frequency_words = get_word_frequency(token_words)
fdist = FreqDist(word_frequency_words)
# NOTE(review): the result of this call is discarded; as it is not the last
# expression of the cell it is presumably not displayed either.
fdist.most_common(30)
# Plot the 30 most frequent tokens as a non-cumulative frequency plot.
fdist.plot(30, cumulative = False)
plt.show()

#### Lexicon Normalization and Stemming
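Lemmatization reduces a word to its base (dictionary) form, for example plural to singular (cars -> car) or, when the part of speech is known, a comparative to its root (better -> good).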

In [None]:
def normalize_text(stop_word_list):
    """Lemmatize every word in *stop_word_list* with WordNet.

    Args:
        stop_word_list: Iterable of word strings (in this notebook, the
            tokens left after stop-word removal).

    Returns:
        A list of lemmas, one per input word, in the same order.
    """
    # One lemmatizer serves every word; constructing it per word is wasted work.
    lemmatizer = WordNetLemmatizer()
    # `lemmatize` is called without a part-of-speech argument, so NLTK's
    # default part of speech applies to every word.
    return [lemmatizer.lemmatize(word) for word in stop_word_list]

In [None]:
# Lemmatize the tokens that survived stop-word removal (`stop_words` is the
# filtered token list, not the stop words themselves).
normalized_text = normalize_text(stop_words)

In [None]:
# Lazily created VADER analyzer shared by every `sentiment_analyze` call.
_sentiment_analyzer = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer


def sentiment_analyze(sentiment_text):
    """Classify *sentiment_text* as negative, positive or neutral using VADER.

    Only the ``'neg'`` and ``'pos'`` entries of the polarity scores are
    compared; the other entries are ignored.

    Args:
        sentiment_text: The text to score.

    Returns:
        ``"Negative Sentiment"`` if the negative score is larger,
        ``"Positive Sentiment"`` if the positive score is larger, and
        ``"Neutral Sentiment"`` when the two are equal.
    """
    # Reuse one analyzer: building a new one for every message repeats its
    # setup work for no benefit.
    score = _get_sentiment_analyzer().polarity_scores(sentiment_text)

    if score['neg'] > score['pos']:
        return "Negative Sentiment"
    if score['neg'] < score['pos']:
        return "Positive Sentiment"
    return "Neutral Sentiment"

In [None]:
def generate_emotion_word_dataframe(chat_dict):
    """Build a DataFrame with one sentiment-labelled row per chat message.

    Args:
        chat_dict: Mapping of key -> message text. Only the part of each key
            before the first underscore is kept as the name (presumably the
            keys look like ``"<name>_<uid>"`` — verify against the caller).

    Returns:
        A DataFrame with the columns ``name``, ``message`` and ``sentiment``,
        with rows in the mapping's iteration order.
    """
    rows = [
        (name.split("_")[0], message, sentiment_analyze(message))
        for name, message in chat_dict.items()
    ]
    return pd.DataFrame(rows, columns=["name", "message", "sentiment"])

In [None]:
# Label every chat message, then echo the resulting table as the cell output.
sentiment_df = generate_emotion_word_dataframe(chat_data_dictionary)
sentiment_df

In [None]:
# Count messages per sentiment label: group by `sentiment`, count the rows,
# expose the `message` count under the name `count`, and drop the redundant
# `name` count column.
sentiment_count_df = sentiment_df.groupby(["sentiment"], as_index=False).count().rename(columns={'message':'count'}).drop(["name"], axis=1)

# One wedge per sentiment label, annotated with its percentage share.
# Three colours are supplied, matching the three labels that
# `sentiment_analyze` can return.
plt.pie(sentiment_count_df["count"], 
        labels = sentiment_count_df["sentiment"], 
        colors=["#5DADE2", "#3498DB", "#2874A6"], 
        autopct="%1.1f%%")

plt.tight_layout()
# The figure is written to disk before it is shown.
plt.savefig("../../data/sentiment_pie_chart.png")
plt.show()

### Saved Dataframes as CSV

In [None]:
# Persist the per-message sentiment table without the index column.
# The frame at notebook level is `sentiment_df`; `sentiment_analysis_df` only
# exists as a local inside `generate_emotion_word_dataframe`, so using it here
# raised a NameError.
# NOTE(review): this path uses "../data/" while the other paths in this
# notebook use "../../data/" — confirm which directory is intended.
sentiment_df.to_csv("../data/sentiment_analysis_nltk.csv", index=False)

### Stored variables
List of variables stored using the `%store` magic

In [None]:
# Store the raw chat data and the lemmatized token list with IPython's %store magic.
%store chat_data_dictionary 
%store normalized_text