#### Install and import the required libraries

In [None]:
pip install tensorflow==2.5.0 --ignore-installed

In [None]:
import psycopg2 as psy
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import tensorflow as tf
import matplotlib.pyplot as plt
import json
import os
import time
import statistics
import string

In [None]:
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from string import punctuation

In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is read from it
# during text preprocessing.
nltk.download("stopwords")

#### Establish connection with the PostgreSQL database

In [None]:
# Connection settings for the PostgreSQL database. Every value can be
# overridden through an environment variable so credentials do not have to
# live in the notebook; the literals are only fallbacks that keep existing
# setups working unchanged.
# SECURITY: the fallback password is committed in plain text -- rotate it and
# remove the default once DBL_DB_PASS is provided by the environment.
database_host = os.environ.get("DBL_DB_HOST", "localhost")
database_name = os.environ.get("DBL_DB_NAME", "dbl_data_challenge")
database_user = os.environ.get("DBL_DB_USER", "admin")
database_pass = os.environ.get("DBL_DB_PASS", "vZtbqKNXGz27cQCH")

In [None]:
# Open the PostgreSQL session from the settings defined above and obtain the
# cursor that the queries further down execute on.
connection_settings = {
    "host": database_host,
    "database": database_name,
    "user": database_user,
    "password": database_pass,
}
con = psy.connect(**connection_settings)
cur = con.cursor()

#### Training a neural network to be able to evaluate sentiment of tweets / conversations

In [None]:
# Load the labelled tweets from Tweets.csv; the 'text' and 'airline_sentiment'
# columns of this frame are used for training below.
df_kaggel = pd.read_csv('Tweets.csv')
# Notebook display of the (rows, columns) tuple.
df_kaggel.shape

#### Text preprocessing

In [None]:
# Translation table that deletes every character of string.punctuation in a
# single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per processed text.
_STOPWORD_SETS = {}


def get_text_processing(text):
    """Remove punctuation and English stop words from a string.

    text: the raw string to clean.

    Returns the remaining whitespace-separated words joined by single spaces.
    Words keep their original casing; only the stop-word comparison is done
    on the lower-cased word.
    """
    stpword = _STOPWORD_SETS.get('english')
    if stpword is None:
        # A frozenset gives constant-time membership tests; the plain list
        # returned by NLTK would be scanned linearly for every word.
        stpword = _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
    no_punctuation = text.translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in no_punctuation.split() if word.lower() not in stpword)

In [None]:
# Work on an independent frame holding only the tweet text and its label, so
# df_kaggel itself is left unchanged by the cleaning step.
training_columns = ['text', 'airline_sentiment']
df = df_kaggel[training_columns].copy()
# Strip punctuation and stop words from every tweet.
df['text'] = df['text'].apply(get_text_processing)
df.head()

#### Creating dummies

In [None]:
# One-hot encode the sentiment label into one indicator column per label value.
df_dummies = pd.get_dummies(df['airline_sentiment'])
# drop(..., inplace=True) returns None, which used to leave df_ranked empty.
# Take the returned frame instead; df itself is no longer modified, so the
# cell can be re-run without failing on the missing column.
df_ranked = df.drop(columns=['airline_sentiment'])
# Text column followed by the indicator columns, aligned on the shared index.
df_complete = pd.concat([df_ranked, df_dummies], axis=1)

#### Train/test split

In [None]:
# Features are the cleaned tweet texts; targets are every other column of
# df_complete (the sentiment indicator columns).
feature_column = "text"
X = df_complete[feature_column].values
y = df_complete.drop(feature_column, axis=1).values
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

#### Vectorization

In [None]:
# Learn the vocabulary from the training texts only, then encode both splits
# as token-count matrices using that vocabulary.
vect = CountVectorizer()
vect.fit(X_train)
X_train = vect.transform(X_train)
X_test1 = vect.transform(X_test)

In [None]:
# Re-weight the token counts with tf-idf (fitted on the training split only)
# and convert both sparse results to dense arrays for the network.
tfidf = TfidfTransformer()
tfidf.fit(X_train)
X_train = tfidf.transform(X_train).toarray()
X_test1 = tfidf.transform(X_test1).toarray()

#### Creating the model

In [None]:
# Feed-forward classifier: three ReLU hidden layers, each followed by 50%
# dropout, and a 3-unit softmax output layer.
hidden_layer_sizes = (12673, 4000, 500)
model = Sequential()
for layer_units in hidden_layer_sizes:
    model.add(Dense(units=layer_units, activation="relu"))
    model.add(Dropout(0.5))
model.add(Dense(units=3, activation="softmax"))

opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# Callback that halts training once the validation loss has gone 2 epochs
# without improving.
early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=2)

#### Fitting the model

In [None]:
# Train on the dense tf-idf features and validate on the held-out split after
# every epoch.
# NOTE(review): with epochs=2 and patience=2 the early-stopping callback has
# no chance to fire before training ends -- raise epochs if early stopping is
# meant to be effective.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=2,
    validation_data=(X_test1, y_test),
    verbose=1,
    # Keras documents `callbacks` as a list of callback instances.
    callbacks=[early_stop],
)

#### Evaluating the model

In [None]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy requested in model.compile.
model_score = model.evaluate(
    X_test1,
    y_test,
    batch_size=64,
    verbose=1,
)
test_accuracy = model_score[1]
print("Test accuracy:", test_accuracy)

#### Model predictions

In [None]:
# Clean the whole labelled corpus in place and encode it with the same two
# fitted stages the network was trained on: token counts, then tf-idf weights.
# The tf-idf stage used to be skipped here, so predictions were made on raw
# counts although the model had only ever seen tf-idf features.
df_kaggel['text'] = df_kaggel['text'].apply(get_text_processing)
test_ds = df_kaggel['text']
test_feature = tfidf.transform(vect.transform(test_ds.values))

In [None]:
# Per-class scores for every tweet, reduced to the index of the highest score.
class_scores = model.predict(test_feature)
predict = np.argmax(class_scores, axis=1)
predict

In [None]:
# Store the predicted class index of every tweet next to its original label.
df_kaggel['our_sent'] = predict

In [None]:
# Translate the numeric class indices 0/1/2 into their sentiment names.
dict_rating_categories = dict(enumerate(('negative', 'neutral', 'positive')))
df_kaggel['our_sent'] = df_kaggel['our_sent'].map(dict_rating_categories)

#### Modelling and quick sentiment functions

In [None]:
def _predict_class_indices(text):
    """Return the predicted class index for every entry of a Series of raw texts.

    The texts are cleaned with get_text_processing and then encoded with the
    fitted CountVectorizer followed by the fitted TfidfTransformer, so the
    network receives the same kind of features it was trained on (the tf-idf
    stage used to be skipped at prediction time).
    """
    cleaned = text.apply(get_text_processing)
    if cleaned.empty:
        # Nothing to classify; avoid calling the model on a zero-row batch.
        return np.array([], dtype=int)
    test_feature = tfidf.transform(vect.transform(cleaned.values))
    return np.argmax(model.predict(test_feature), axis=1)


def our_model(text, df):
    """
    Add an 'our_sent' column holding the predicted sentiment to df.

    text: pandas Series of raw tweet texts; it comes from the df passed to
        the function, so it has one entry per row of df.
    df: DataFrame that receives the 'our_sent' column. It is modified in
        place and also returned.
    """
    dict_rating_categories = {0: 'negative', 1: 'neutral', 2: 'positive'}
    df['our_sent'] = _predict_class_indices(text)
    df['our_sent'] = df['our_sent'].map(dict_rating_categories)
    return df


def quick_sentiment(text):
    """
    Returns sentiment for 1-2 tweets

    text: a single string or a sequence of strings.

    Returns a numpy array with one sentiment name ('negative', 'neutral' or
    'positive') per input text; the array is empty for empty input.
    """
    dict_rating_categories = {0: 'negative', 1: 'neutral', 2: 'positive'}
    predict = _predict_class_indices(pd.Series(text))
    # A list comprehension (unlike np.vectorize) also copes with zero predictions.
    return np.array([dict_rating_categories[index] for index in predict], dtype=str)

#### Testing performance

In [None]:
# Re-run the classifier over the labelled corpus; this rewrites
# df_kaggel['our_sent'] with sentiment names.
our_model(df_kaggel["text"], df_kaggel)

true_label = df_kaggel["airline_sentiment"]
predicted_label = df_kaggel["our_sent"]


def _count_confusions(actual, predicted):
    """Count rows whose true label is `actual` while our label is `predicted`."""
    return int(((true_label == actual) & (predicted_label == predicted)).sum())


# Naming scheme: <true label>_is_<our label>.
pos_is_neg = _count_confusions("positive", "negative")
pos_is_neu = _count_confusions("positive", "neutral")
neu_is_pos = _count_confusions("neutral", "positive")
neu_is_neg = _count_confusions("neutral", "negative")
neg_is_neu = _count_confusions("negative", "neutral")
neg_is_pos = _count_confusions("negative", "positive")

# Misclassifications grouped by direction: predicted more negative than the
# true label versus predicted more positive than the true label.
worse_sentiment = pos_is_neg + pos_is_neu + neu_is_neg
better_sentiment = neu_is_pos + neg_is_neu + neg_is_pos

In [None]:
# Notebook display: number of tweets predicted more negative / more positive
# than their true label.
worse_sentiment, better_sentiment

#### Importing conversations from the database

In [None]:
# Join every tweet with its ABA-group row, ordered by group and then by level
# within the group.
aba_query = """
    SELECT tweets.full_text, tweets.user_id_str, tweets.timestamp_ms, aba_groups.id_str, aba_groups.aba_level, aba_groups.aba_id, aba_groups.conversation_id
    FROM tweets, aba_groups
    WHERE tweets.id_str = aba_groups.id_str
    ORDER BY aba_groups.aba_id, aba_groups.aba_level
"""
cur.execute(aba_query)

#### Evaluating sentiment for ABA conversations

In [None]:
# Column names follow the order of the SELECT list of the ABA query.
aba_columns = [
    'full_text',
    'user_id_str',
    'timestamp_ms',
    'id_str',
    'aba_level',
    'aba_id',
    'conversation_id',
]
df_aba = pd.DataFrame(data=cur.fetchall(), columns=aba_columns)

In [None]:
# Classify every ABA tweet and report how long it took, in seconds.
start = time.time()
df_aba_sent = our_model(df_aba['full_text'], df_aba)
end = time.time()

elapsed_seconds = end - start
print(elapsed_seconds)

In [None]:
# NOTE(review): this replaces the frame computed in the previous cell with the
# contents of aba_sentiment.csv; nothing in the visible notebook writes that
# file -- confirm where it comes from.
df_aba_sent = pd.read_csv('aba_sentiment.csv')
# 'Unnamed: 0' is dropped right after loading; the remaining columns are kept.
df_aba_sent = df_aba_sent.drop(columns=['Unnamed: 0'])

# Turn the sentiment names into numeric scores so they can be averaged.
dict_rating_categories_2 = {'negative': -1, 'neutral': 0, 'positive': 1}
df_aba_sent['our_sent'] = df_aba_sent['our_sent'].map(dict_rating_categories_2)
df_aba_sent

In [None]:
def _classify_sentiment_change(sent_1, sent_3):
    """Label how sentiment moved between the level-1 and the level-3 tweets.

    sent_1: sentiment score of the level-1 tweet (-1, 0 or 1), or None when
        the conversation has no level-1 tweet.
    sent_3: mean sentiment score of the level-3 tweets, or None when the
        conversation has no level-3 tweet.

    Returns 'no_change', 'minor_positive_change', 'positive_change',
    'minor_negative_change' or 'negative_change'; None when a score is
    missing or matches none of the rules (for example NaN).
    """
    if sent_1 is None or sent_3 is None:
        return None

    if sent_1 == -1:
        if sent_3 < -.5:
            return 'no_change'
        elif -.5 <= sent_3 <= 0:
            return 'minor_positive_change'
        elif sent_3 > 0:
            return 'positive_change'
    elif sent_1 == 1:
        if 0 <= sent_3 <= .5:
            return 'minor_negative_change'
        elif sent_3 < 0:
            return 'negative_change'
        elif sent_3 > .5:
            return 'no_change'
    elif sent_1 == 0:
        if .34 < sent_3 < .67:
            return 'minor_positive_change'
        # Mirror image of the branch above. The bounds used to be written as
        # -.34 < sent_3 < -.67, which no number satisfies.
        elif -.67 < sent_3 < -.34:
            return 'minor_negative_change'
        elif -.34 <= sent_3 <= .34:
            return 'no_change'
        elif sent_3 >= .67:
            return 'positive_change'
        # Was `<= .67` (sign missing); together with the impossible range
        # above, every negative shift was labelled 'negative_change'.
        elif sent_3 <= -.67:
            return 'negative_change'
    return None


# One entry per aba_id, in groupby (sorted aba_id) order.
change_list = []

for aba_id, data in df_aba_sent.groupby("aba_id"):
    # Reset per conversation so a group lacking a level-1 or level-3 tweet
    # cannot inherit the scores of the previous conversation.
    sent_1 = None
    sent_3 = None

    # `data` already holds the rows of this aba_id, so the frame does not
    # need to be filtered again for every group.
    for level, row in data.groupby("aba_level"):
        if level == 1:
            # item() insists on exactly one level-1 row, like float(Series) did.
            sent_1 = float(row['our_sent'].item())
        elif level == 3:
            sent_3 = float(statistics.mean(row['our_sent']))

    # Always append (None when unclassifiable) so the list stays aligned with
    # the conversations instead of silently shifting later entries.
    change_list.append(_classify_sentiment_change(sent_1, sent_3))

In [None]:
# Lookup used to turn the column back into text: numeric scores become
# sentiment names, while the change labels map onto themselves so they pass
# through the same .map() call unchanged.
change_labels = (
    'positive_change',
    'negative_change',
    'no_change',
    'minor_positive_change',
    'minor_negative_change',
)
dict_rating_categories_2_reverse = {-1: 'negative', 0: 'neutral', 1: 'positive'}
dict_rating_categories_2_reverse.update({label: label for label in change_labels})

# Overwrite the score of every level-2 row with the change labels, assigned
# positionally in row order.
is_level_2 = df_aba_sent['aba_level'] == 2
df_aba_sent.loc[is_level_2, 'our_sent'] = change_list

df_aba_sent['our_sent'] = df_aba_sent['our_sent'].map(dict_rating_categories_2_reverse)

In [None]:
# Persist the labelled ABA conversations; the frame index is written as the
# first, unnamed CSV column.
df_aba_sent.to_csv('df_aba_sent.csv')

#### Evaluating sentiment for root conversations

In [None]:
# Join every tweet with its root-group row, ordered by group and then by level
# within the group.
root_query = """
    SELECT tweets.full_text, tweets.timestamp_ms, tweets.user_id_str, root_groups.id_str, root_groups.root_level, root_groups.root_id
    FROM tweets, root_groups
    WHERE tweets.id_str = root_groups.id_str
    ORDER BY root_groups.root_id, root_groups.root_level
"""
cur.execute(root_query)

In [None]:
# Column names follow the order of the SELECT list of the root query.
root_columns = [
    'full_text',
    'timestamp_ms',
    'user_id_str',
    'id_str',
    'root_level',
    'root_id',
]
df_root = pd.DataFrame(data=cur.fetchall(), columns=root_columns)

In [None]:
# Classify every root-conversation tweet and report the elapsed wall-clock time.
start = time.time()
df_root_sent = our_model(df_root['full_text'], df_root)
elapsed_seconds = time.time() - start

print(f"This took: {elapsed_seconds} seconds")

In [None]:
# Notebook display; our_model added the 'our_sent' column to df_root in place.
df_root

#### Saving the dataframe to local storage in csv format

In [None]:
# Persist the labelled root conversations; the frame index is written as the
# first, unnamed CSV column.
df_root_sent.to_csv('df_root_sent.csv')

In [None]:
# Release the database resources: the cursor first, then the connection it
# was created from.
cur.close()
con.close()