In [None]:
pip install tensorflow==2.5.0 --ignore-installed

In [None]:
import psycopg2 as psy
import json
import os
import time
import pandas as pd
import re
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import re
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import statistics
import time

# Training up a neural network to evaluate sentiment

In [None]:
# Load the labelled tweets from the working directory; the 'text' and
# 'airline_sentiment' columns of this frame are used by the cells below.
df_kaggel = pd.read_csv('Tweets.csv')
# Show (rows, columns) as a quick sanity check on the load.
df_kaggel.shape

In [None]:
# Sentiment analysis preprocessing: one frame per sentiment class.
sentiment_column = df_kaggel['airline_sentiment']

df_positive = df_kaggel[sentiment_column == 'positive']
df_neutral = df_kaggel[sentiment_column == 'neutral']
df_negative = df_kaggel[sentiment_column == 'negative']

In [None]:
# Sampling datasets: draw 8000 neutral and 8000 negative tweets with
# replacement and stack them with all positive tweets.
# random_state pins the draw so a fresh "Restart & Run All" trains on the same
# rows (the train/test split below is seeded with 42 as well).
# NOTE(review): sampling with replacement happens before the train/test split,
# so duplicated rows can land on both sides of that split.
df_neutral_over = df_neutral.sample(8000, replace=True, random_state=42)
df_negative_over = df_negative.sample(8000, replace=True, random_state=42)
df = pd.concat([df_positive, df_neutral_over, df_negative_over], axis=0)

In [None]:
#text preprocessing
_STOPWORDS_CACHE = {}
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def get_text_processing(text, stop_words=None):
    """Strip punctuation and stop words from a single piece of text.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stop_words : collection of str, optional
        Lower-case words to remove. Defaults to NLTK's English stop-word list,
        which is loaded once and reused instead of being re-read on every call.

    Returns
    -------
    str
        The remaining words joined by single spaces. Original casing is kept;
        only the stop-word comparison is case-insensitive.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    # Punctuation is removed before the stop-word check, so the words are
    # compared without their apostrophes.
    no_punctuation = text.translate(_PUNCTUATION_TABLE)
    return ' '.join(
        word for word in no_punctuation.split() if word.lower() not in stop_words
    )

In [None]:
# Keep only the tweet text and its label. .copy() makes this an independent
# frame, so the column assignment below does not write into a slice of the
# concatenated frame (which raises SettingWithCopyWarning).
df = df[['text', 'airline_sentiment']].copy()
df['text'] = df['text'].apply(get_text_processing)
df.head()

In [None]:
#creating dummies
# One-hot encode the sentiment label (one column per class).
df_dummies = pd.get_dummies(df['airline_sentiment'])
# drop() without inplace returns a new frame. The previous inplace=True call
# returned None (so df_ranked was None) and mutated df, which made this cell
# fail with a KeyError when run a second time.
df_ranked = df.drop(columns=['airline_sentiment'])
# Text column followed by the one-hot label columns.
df_complete = pd.concat([df_ranked, df_dummies], axis=1)

In [None]:
# Train/test split: the features are the cleaned tweet text, the targets are
# every remaining column of df_complete (the one-hot sentiment labels).

X = df_complete["text"].to_numpy()
y = df_complete.drop(columns="text").to_numpy()

# 70% train / 30% test with a fixed seed.
split = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Vectorization: turn the tweet text into token-count features.

vect = CountVectorizer()
# The vocabulary is fitted on the training tweets only; X_train is rebound
# from the text array to its count matrix.
X_train = vect.fit_transform(X_train)
# The held-out tweets are encoded with that same vocabulary. The result gets a
# new name (X_test1), so X_test still holds the raw text.
X_test1 = vect.transform(X_test)


In [None]:
# Re-weight the token counts with TF-IDF; the transformer is fitted on the
# training matrix and then applied unchanged to the test matrix.
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(X_train)
X_test1 = tfidf.transform(X_test1)
# Convert both matrices to dense arrays before they are passed to the network.
X_train = X_train.toarray()
X_test1 = X_test1.toarray()

In [None]:
#building a model

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

# Fully connected classifier: three ReLU hidden layers, each followed by 50%
# dropout, and a 3-unit softmax output (one unit per sentiment class).
# NOTE(review): the first layer width (12673) is hard-coded; presumably it was
# chosen to match the vocabulary size - confirm.
model = Sequential(
    [
        Dense(units=12673, activation="relu"),
        Dropout(0.5),
        Dense(units=4000, activation="relu"),
        Dropout(0.5),
        Dense(units=500, activation="relu"),
        Dropout(0.5),
        Dense(units=3, activation="softmax"),
    ]
)

opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

# Stop training once the validation loss has not improved for 2 epochs.
early_stop = EarlyStopping(monitor="val_loss", patience=2, mode="min", verbose=1)


In [None]:
# Fitting the model on the dense TF-IDF features.
# Keras documents `callbacks` as a list of callback instances, so the single
# EarlyStopping object is wrapped in a list.
# NOTE(review): with epochs=2 and an EarlyStopping patience of 2, early
# stopping cannot trigger before training ends. The held-out split is also
# used as validation data here and scored again in the evaluation cell.

model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=2,
    validation_data=(X_test1, y_test),
    verbose=1,
    callbacks=[early_stop],
)

In [None]:
# Model evaluation on the held-out split.
# X_test still holds the raw tweet text; X_test1 is its TF-IDF encoding, which
# is the representation the network was trained on.

model_score = model.evaluate(X_test1, y_test, batch_size=64, verbose=1)
# model_score is [loss, accuracy] because the model was compiled with
# metrics=["accuracy"].
print("Test accuracy:", model_score[1])

In [None]:
# Model predictions for the full labelled dataset.
# The text column of df_kaggel is overwritten with its cleaned version.
df_kaggel['text'] = df_kaggel['text'].apply(get_text_processing)
test_ds  = df_kaggel['text']
# Apply the same CountVectorizer -> TfidfTransformer chain used for training;
# feeding raw counts would give the network inputs on a scale it never saw.
test_feature = tfidf.transform(vect.transform(test_ds.to_numpy()))

In [None]:
# Class probabilities per tweet, reduced to the index of the most likely class.
predict = np.argmax(model.predict(test_feature), axis=1)
predict

In [None]:
# Store the predicted class index of every tweet next to its original label.
df_kaggel['our_sent'] = predict

In [None]:
# Translate predicted class indices into sentiment names.
# NOTE(review): the index order is assumed to match the column order of the
# one-hot labels used for training - confirm.
dict_rating_categories = {0: 'negative', 1: 'neutral', 2: 'positive'}
# Map from `predict` rather than from the column itself, so re-running this
# cell does not map the already-translated strings to NaN.
df_kaggel['our_sent'] = pd.Series(predict, index=df_kaggel.index).map(dict_rating_categories)

In [None]:
# Predicted class index -> sentiment name.
# NOTE(review): the order is assumed to match the column order of the one-hot
# labels the model was trained on - confirm.
_SENTIMENT_LABELS = {0: 'negative', 1: 'neutral', 2: 'positive'}


def _predict_sentiment_ids(text):
    """Return the predicted class index for every entry of a Series of texts.

    The texts are cleaned with get_text_processing and encoded with the fitted
    CountVectorizer (`vect`) followed by the fitted TfidfTransformer (`tfidf`),
    i.e. the same chain the model was trained on. An empty input returns an
    empty integer array without calling the model.
    """
    if len(text) == 0:
        return np.array([], dtype=int)
    cleaned = text.apply(get_text_processing)
    features = tfidf.transform(vect.transform(cleaned.to_numpy()))
    return np.argmax(model.predict(features), axis=1)


def our_model(text, df):
    """Add an 'our_sent' column with the predicted sentiment of each text.

    Parameters
    ----------
    text : pandas.Series
        The texts to score; it comes from the df passed to the function and
        must have one entry per row of ``df``.
    df : pandas.DataFrame
        Frame that receives the 'our_sent' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'our_sent' set to 'negative', 'neutral'
        or 'positive'.
    """
    class_ids = _predict_sentiment_ids(text)
    df['our_sent'] = [_SENTIMENT_LABELS[int(class_id)] for class_id in class_ids]
    return df


def quick_sentiment(text):
    """Returns sentiment for 1-2 tweets.

    Parameters
    ----------
    text : str or sequence of str
        A single tweet or a few tweets.

    Returns
    -------
    numpy.ndarray
        One sentiment name ('negative', 'neutral' or 'positive') per tweet;
        empty when no tweets are given.
    """
    # dtype=object keeps the entries as plain Python strings and avoids the
    # dtype guess pandas would otherwise make for an empty input.
    text = pd.Series(text, dtype=object)
    class_ids = _predict_sentiment_ids(text)
    return np.array(
        [_SENTIMENT_LABELS[int(class_id)] for class_id in class_ids], dtype=str
    )

# Evaluating conversations

In [None]:
# Database connection settings, read from the environment so that no
# credentials are stored in the notebook. The non-secret settings fall back to
# the values used during development.
database_host = os.environ.get("DBL_DATABASE_HOST", "localhost")
database_name = os.environ.get("DBL_DATABASE_NAME", "dbl_data_challenge")
database_user = os.environ.get("DBL_DATABASE_USER", "admin")
# The password has no fallback: set DBL_DATABASE_PASS before starting the
# kernel. It is None when the variable is missing.
database_pass = os.environ.get("DBL_DATABASE_PASS")

In [None]:
# Open the database connection and a cursor; both are reused by the query
# cells further down.
con = psy.connect(host=database_host, database=database_name, user=database_user, password=database_pass)
cur = con.cursor()

In [None]:
# Fetch every tweet that belongs to an ABA group: tweetv2 rows are matched to
# aba_groups rows on id_str, and the result is ordered by group id and then by
# level within the group.
cur.execute("""
    SELECT tweetv2.full_text, tweetv2.user_id_str, tweetv2.timestamp_ms, aba_groups.id_str, aba_groups.aba_level, aba_groups.aba_id, aba_groups.conversation_id
    FROM tweetv2, aba_groups
    WHERE tweetv2.id_str = aba_groups.id_str
    ORDER BY aba_groups.aba_id, aba_groups.aba_level
""")

# Evaluating sentiment for ABA conversations

In [None]:
# Column names in the same order as the SELECT list of the ABA query.
aba_columns = [
    'full_text',
    'user_id_str',
    'timestamp_ms',
    'id_str',
    'aba_level',
    'aba_id',
    'conversation_id',
]
df_aba = pd.DataFrame(cur.fetchall(), columns=aba_columns)

In [None]:
# Score every ABA tweet and report how long it took, in seconds.
start = time.time()
df_aba_sent = our_model(text=df_aba['full_text'], df=df_aba)
end = time.time()

print(end-start)

In [None]:
# Reload previously saved ABA predictions when the file is present.
# This notebook never writes 'aba_sentiment.csv' itself, so on a fresh
# checkout the cell falls back to the labels our_model() stored in df_aba.
ABA_SENTIMENT_CACHE = 'aba_sentiment.csv'
if os.path.exists(ABA_SENTIMENT_CACHE):
    # 'Unnamed: 0' is presumably the index column saved by an earlier to_csv().
    df_aba_sent = pd.read_csv(ABA_SENTIMENT_CACHE).drop(['Unnamed: 0'], axis=1)
else:
    # Copy so that df_aba keeps its text labels and this cell can be re-run.
    df_aba_sent = df_aba.copy()

# Encode the labels numerically so that sentiments can be averaged.
dict_rating_categories_2 = {'negative':-1, 'neutral':0, 'positive':1}
df_aba_sent['our_sent'] = df_aba_sent['our_sent'].map(dict_rating_categories_2)

df_aba_sent

In [None]:
def _classify_sentiment_change(sent_1, sent_3):
    """Label how sentiment moved between level 1 and level 3 of a conversation.

    Parameters
    ----------
    sent_1 : float
        Sentiment of the level-1 tweet: -1 (negative), 0 (neutral) or 1 (positive).
    sent_3 : float
        Mean sentiment of the level-3 tweets, in [-1, 1].

    Returns
    -------
    str or None
        One of 'positive_change', 'minor_positive_change', 'no_change',
        'minor_negative_change', 'negative_change'; None when the inputs cannot
        be classified (e.g. NaN or an unexpected sent_1 value).
    """
    if sent_1 == -1:
        if sent_3 < -.5:
            return 'no_change'
        if sent_3 <= 0:
            return 'minor_positive_change'
        if sent_3 > 0:
            return 'positive_change'
    elif sent_1 == 1:
        if sent_3 < 0:
            return 'negative_change'
        if sent_3 <= .5:
            return 'minor_negative_change'
        if sent_3 > .5:
            return 'no_change'
    elif sent_1 == 0:
        if sent_3 >= .67:
            return 'positive_change'
        if sent_3 > .34:
            return 'minor_positive_change'
        if sent_3 >= -.34:
            return 'no_change'
        # The original test was `-.34 < sent_3 < -.67`, which can never be true.
        if sent_3 > -.67:
            return 'minor_negative_change'
        # The original test was `sent_3 <= .67` (missing minus sign).
        if sent_3 <= -.67:
            return 'negative_change'
    return None


# One entry per aba_id, in groupby order. Exactly one entry is appended for
# every group so the list stays aligned with the groups.
change_list = []

for aba_id, aba_group in df_aba_sent.groupby("aba_id"):
    level_1 = aba_group.loc[aba_group["aba_level"] == 1, "our_sent"]
    level_3 = aba_group.loc[aba_group["aba_level"] == 3, "our_sent"]

    if level_1.empty or level_3.empty:
        # Without both ends of the conversation there is nothing to compare;
        # previously the values of the preceding group were silently reused.
        change_list.append(None)
        continue

    # Level 1 is expected to hold a single tweet; its first row is used.
    sent_1 = float(level_1.iloc[0])
    sent_3 = float(statistics.mean(level_3))
    change_list.append(_classify_sentiment_change(sent_1, sent_3))

In [None]:
# Numeric scores map back to sentiment names; the change labels map to
# themselves so they survive the .map() call below.
_change_labels = [
    'positive_change',
    'negative_change',
    'no_change',
    'minor_positive_change',
    'minor_negative_change',
]
dict_rating_categories_2_reverse = {
    -1: 'negative',
    0: 'neutral',
    1: 'positive',
    **{label: label for label in _change_labels},
}

# Level-2 rows receive the change labels, in the order they were collected.
is_level_2 = df_aba_sent['aba_level'] == 2
df_aba_sent.loc[is_level_2, 'our_sent'] = change_list

df_aba_sent['our_sent'] = df_aba_sent['our_sent'].map(dict_rating_categories_2_reverse)

In [None]:
# Save the labelled ABA conversations to the working directory.
df_aba_sent.to_csv('df_aba_sent.csv')

# Evaluating sentiment for root conversations

In [None]:
# Fetch every tweet that belongs to a root group: tweetv2 rows are matched to
# root_groups rows on id_str, and the result is ordered by group id and then
# by level within the group.
cur.execute("""
    SELECT tweetv2.full_text, tweetv2.timestamp_ms, tweetv2.user_id_str, root_groups.id_str, root_groups.root_level, root_groups.root_id
    FROM tweetv2, root_groups
    WHERE tweetv2.id_str = root_groups.id_str
    ORDER BY root_groups.root_id, root_groups.root_level
""")

In [None]:
# Column names in the same order as the SELECT list of the root query.
root_columns = [
    'full_text',
    'timestamp_ms',
    'user_id_str',
    'id_str',
    'root_level',
    'root_id',
]
df_root = pd.DataFrame(cur.fetchall(), columns=root_columns)

In [None]:
# Score every root-conversation tweet and report how long it took, in seconds.
start = time.time()
df_root_sent = our_model(text=df_root['full_text'], df=df_root)
end = time.time()

print(end-start)

In [None]:
# Display the root tweets; our_model() added the 'our_sent' column to this
# same frame, so the predictions show up here as well.
df_root

In [None]:
# Save the labelled root conversations to the working directory.
df_root_sent.to_csv('df_root_sent.csv')