# 1. Importing Libraries

# In [None]:
import logging
import os
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns
import wordcloud
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sqlalchemy import create_engine
from sqlalchemy import text as sql_text
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

lemmatizer = WordNetLemmatizer()
tokenizer= TreebankWordTokenizer()
skl_stopwords=_stop_words.ENGLISH_STOP_WORDS

# 2. Initializing MongoDB and PostgreSQL

# In [None]:

## MONGODB
HOST_MDB = 'mymongo'  # if in docker it would be the container name
PORT_MDB = 27017
# Connection string
conn_string_mdb = f"mongodb://{HOST_MDB}:{PORT_MDB}"
client = MongoClient(conn_string_mdb)

## POSTGRES
# NOTE(review): credentials are hard-coded; consider moving them to
# environment variables before this leaves a local/dev setup.
USERNAME_PG = 'postgres'
PASSWORD_PG = 'postgres'
HOST_PG = ''  # if in docker it would be the container name
PORT_PG = 5432
DATABASE_NAME_PG = 'posty_tweets'

# Connection string
conn_string_pg = f"postgresql://{USERNAME_PG}:{PASSWORD_PG}@{HOST_PG}:{PORT_PG}/{DATABASE_NAME_PG}"
pg = create_engine(conn_string_pg, client_encoding='utf8')

# Create the destination table if it does not exist yet. The statement runs
# inside an explicit transaction (committed on exit) because Engine.execute()
# is not available on SQLAlchemy 2.x. Column names mirror the keys of the
# sentiment scores: neg / neu / pos / compound.
with pg.begin() as connection:
    connection.execute(sql_text("""
    CREATE TABLE IF NOT EXISTS song_table (
        neg numeric,
        neu numeric,
        pos numeric,
        compound numeric,
        tweet TEXT,
        Sentiment_label TEXT
    );
    """))


# 3. Calling database and Collection

# In [None]:
def extract():
    """Read every document of the tweet collection into a DataFrame.

    Uses the module-level Mongo ``client`` and reads the ``test`` collection
    of the ``tweetcollector`` database.

    Returns:
        A pandas DataFrame built from the list of all documents returned by
        ``find()``.
    """
    collection = client["tweetcollector"]["test"]  # vaxdata
    # Materialise the cursor so pandas receives a plain list of documents.
    documents = list(collection.find())
    return pd.DataFrame(data=documents)

# 4. Text Cleaning and Processing

# Patterns are compiled once at import time instead of on every call.
_URL_RE = re.compile(r'http\S+')
_HASHTAG_RE = re.compile(r'#\S+')
# Anything outside U+0000-U+05C0 and U+2100-U+214F is dropped.
_DISALLOWED_CHARS_RE = re.compile(r'[^\u0000-\u05C0\u2100-\u214F]+')
_DIGITS_RE = re.compile(r'[0-9]+')
_WHITESPACE_RE = re.compile(r'\s')
_FILLER_RE = re.compile(r'(aah|aaaa|aa)')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '–' + '‘' + '’' + '“' + '”')


def text_cleaning(text, stopwords=None):
    """Strip URLs, hashtags, punctuation, digits and filler from a tweet.

    The steps run in this order:

    1. remove URLs (``http...`` up to the next whitespace);
    2. remove whole hashtags (``#word``);
    3. drop characters outside the allowed Unicode ranges;
    4. remove punctuation (this also removes ``@`` and quote characters,
       so a mention keeps the user name but loses the ``@``);
    5. replace each run of digits with a single space;
    6. replace every whitespace character with a plain space;
    7. remove the filler sequences ``aah`` / ``aaaa`` / ``aa``.

    URLs and hashtags must be removed *before* punctuation: once ``#``,
    ``:`` and ``/`` are stripped, the patterns can no longer recognise them.

    Args:
        text: The raw tweet text (a ``str``).
        stopwords: Accepted for backward compatibility only; it is not used
            and no stop-word filtering is performed.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed tokens may
        leave consecutive spaces behind.
    """
    text = _URL_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = _DISALLOWED_CHARS_RE.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGITS_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)
    text = _FILLER_RE.sub('', text)
    return text

# Words removed from every tweet before cleaning; compiled once, not per tweet.
_PROHIBITED_WORDS_RE = re.compile(
    '|'.join(map(re.escape, ['Antivax', 'antivax', 'People', 'Covid', 'people', 'covid']))
)
# Column order of the sentiment table (the label column is appended afterwards).
_SCORE_COLUMNS = ['tweet', 'neg', 'neu', 'pos', 'compound']
_SENTIMENT_LABELS = ('Positive', 'Negative', 'Neutral')


def _label_sentiment(compound):
    """Map a compound score to 'Positive' (> 0.5), 'Neutral' ([-0.5, 0.5]) or 'Negative'."""
    if compound > 0.5:
        return 'Positive'
    if -0.5 <= compound <= 0.5:
        return 'Neutral'
    return 'Negative'


def _score_tweets(raw_tweets):
    """Clean each tweet, score it, and return one row per tweet as a DataFrame."""
    analyzer = SentimentIntensityAnalyzer()  # initializing the model
    records = []
    for raw in raw_tweets:
        without_prohibited = _PROHIBITED_WORDS_RE.sub("", raw)
        cleaned = text_cleaning(without_prohibited, stopwords=skl_stopwords)
        scores = analyzer.polarity_scores(cleaned)
        scores['tweet'] = cleaned  # keep the scored text next to its scores
        records.append(scores)
    # Build the frame once from all rows; passing `columns` fixes the column
    # order and keeps the columns present even when there are no rows.
    return pd.DataFrame(records, columns=_SCORE_COLUMNS)


def _plot_wordclouds(sentiment_df):
    """Print a banner and show a word cloud for every sentiment label."""
    for position, label in enumerate(_SENTIMENT_LABELS, start=1):
        print('-'*110)
        print(position, '.', '#'*30, label, '#'*30)
        print('-'*110)
        labelled_tweets = sentiment_df.loc[sentiment_df['Sentiment_label'] == label, 'tweet']
        try:
            cloud = WordCloud(
                background_color="black",
                width=1600,
                height=800,
                stopwords=skl_stopwords).generate(' '.join(labelled_tweets))
        except ValueError:
            # Raised by WordCloud when there are no words to draw (e.g. no
            # tweet received this label); skip the plot instead of aborting.
            print('No words to plot for this label.')
            continue

        plt.figure(figsize=(15, 10), facecolor='k')
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()


def transform(tweets_df):
    """Clean the tweets, score their sentiment, label them and plot word clouds.

    Args:
        tweets_df: DataFrame with a ``tweet`` column holding the raw text.
            Missing values in that column are skipped. If the column is
            absent (for example the source collection was empty), an empty
            result is produced.

    Returns:
        A DataFrame with the columns ``tweet`` (cleaned text), ``neg``,
        ``neu``, ``pos``, ``compound`` and ``Sentiment_label``
        ('Positive', 'Neutral' or 'Negative').
    """
    if 'tweet' in tweets_df.columns:
        raw_tweets = tweets_df['tweet'].dropna().astype(str).tolist()
    else:
        logging.warning("No 'tweet' column found; nothing to transform")
        raw_tweets = []

    sentiment_df = _score_tweets(raw_tweets)

    # labeling sentiments
    sentiment_df['Sentiment_label'] = [
        _label_sentiment(compound) for compound in sentiment_df['compound']
    ]

    _plot_wordclouds(sentiment_df)
    logging.critical("\n---TRANSFORMATION COMPLETED---")
    return sentiment_df

def load(sentiments, table_name='song_table'):
    """Write the sentiment DataFrame to Postgres through the module-level ``pg`` engine.

    Args:
        sentiments: DataFrame to store.
        table_name: Destination table; defaults to ``song_table``.

    The table is replaced on every call (``if_exists='replace'``). The
    DataFrame index is not written, so only the DataFrame's own columns end
    up in the table.
    """
    sentiments.to_sql(table_name, con=pg, if_exists='replace', index=False)
    logging.critical("Sentiment loaded into postgres")
    
    
    
# Run the pipeline: extract from MongoDB -> transform/score -> load into Postgres.
TWEET = extract()
# Pass the extracted frame on explicitly; there is no module-level `tweets_df`.
sentiment_df = transform(TWEET)
load(sentiment_df)