In [1]:
import pymysql
from textblob import TextBlob
from wordcloud import WordCloud
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import plotly.express as px

# Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [2]:
import os

# Connection settings. The host and password come from the environment so
# that no secret is stored in the notebook; user and schema names are fixed.
user, database = 'group2', 'group2db'
host_name = os.environ.get('HOST_NAME')
passwd = os.environ.get('AWS_PASSWORD')

In [None]:
# Open a connection to the MySQL database hosted on AWS.
# `password` / `database` are the current PyMySQL keyword names; `passwd` /
# `db` are only kept as deprecated aliases.
con = pymysql.connect(host=host_name, user=user, password=passwd, database=database)

In [None]:
# Load every row of the `tweets_locations2` table into a pandas DataFrame.
tweet_df = pd.read_sql_query(sql='''select * from tweets_locations2''', con=con)

In [None]:
# Display the table as loaded from the database.
tweet_df

In [None]:
# Drop the tweet identifier column.
# `columns=` already selects the axis, so the redundant `axis=1` is removed;
# errors='ignore' keeps this cell re-runnable once the column is already gone.
tweet_df = tweet_df.drop(columns=['tweet_id'], errors='ignore')

In [None]:
# Print the DataFrame summary produced by pandas `info()`.
tweet_df.info()

In [None]:
import string
# Show the punctuation characters that `remove_punc` strips from the tweets.
string.punctuation

In [None]:
# Function to clean the tweets.
# The translation table deletes every character in string.punctuation; it is
# built once here rather than for each tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(message):
    """Return ``message`` with all ``string.punctuation`` characters removed.

    Args:
        message: The text to clean (a ``str``).

    Returns:
        The input text with punctuation characters deleted; every other
        character, including whitespace, is kept unchanged.
    """
    return message.translate(_PUNCTUATION_TABLE)
# Strip punctuation from every tweet, overwriting the `tweet` column.
tweet_df['tweet'] = tweet_df['tweet'].apply(remove_punc)
# Display the frame after cleaning.
tweet_df

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop words followed by extra tokens specific to this corpus.
extra_stop_words = ['from', 'subject', 're', 'edu', 'use','will','aap','co','day','user','tweet','today','week','year', 'https','httpstco', 'rt', 'RT']
stop_words = stopwords.words('english') + extra_stop_words

In [None]:
# Remove stop words and short words (fewer than 2 characters).
def preprocess(text):
    """Tokenise ``text`` and filter out unwanted tokens.

    Args:
        text: The raw tweet text.

    Returns:
        A list of the tokens yielded by ``gensim.utils.simple_preprocess``
        that are at least two characters long and are not in ``stop_words``.
    """
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) >= 2 and word not in stop_words
    ]
# Tokenise and filter each tweet, then re-join the surviving tokens into a
# single space-separated string.
tweet_df['tweet'] = tweet_df['tweet'].apply(lambda raw: " ".join(preprocess(raw)))
tweet_df

In [None]:
# Function to get the subjectivity
def getSubjectivity(text):
    """Return the TextBlob subjectivity score computed for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Function to get the polarity
def getPolarity(text):
    """Return the TextBlob polarity score computed for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Add the Subjectivity and Polarity columns, each computed from the cleaned
# tweet text by its scoring function.
for column, scorer in (('Subjectivity', getSubjectivity), ('Polarity', getPolarity)):
    tweet_df[column] = tweet_df['tweet'].apply(scorer)

In [None]:
# Display the frame with the new Subjectivity and Polarity columns.
tweet_df

In [None]:
# Map a polarity score to a negative / neutral / positive label.
def getAnalysis(score):
    """Classify a polarity ``score`` by its sign.

    Args:
        score: Numeric polarity value.

    Returns:
        'Negative' when the score is below zero, "Neutral" when it equals
        zero, and 'Positive' otherwise.
    """
    if score < 0:
        return 'Negative'
    return "Neutral" if score == 0 else 'Positive'

# Label every tweet from its Polarity score using getAnalysis.
tweet_df['Analysis'] = tweet_df['Polarity'].apply(getAnalysis)

# Display the frame with the new Analysis column.
tweet_df

In [None]:
# Save the scored tweets to a CSV file in the working directory.
# NOTE(review): with the to_csv defaults the DataFrame index is written as an
# extra unnamed first column — confirm downstream readers expect it.
tweet_df.to_csv("TwitterSentimentData.csv")

In [None]:
# Bar chart of how many tweets fall into each sentiment class.
# The column is passed by keyword: newer seaborn releases accept only `data`
# positionally, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='Analysis', data=tweet_df).set_title('Twitter Sentiment Analysis')

In [None]:
# Positive sentiment word cloud (tweets with polarity above 0).
wc = WordCloud(max_words = 900, width = 1600, height = 800).generate(" ".join(tweet_df[tweet_df['Polarity'] > 0.0]['tweet']))
# Only one figure is created: the extra plt.figure(figsize=(20, 20)) call that
# used to precede this one was never drawn on and rendered as an empty figure.
plt.figure(figsize=(20, 10), facecolor='white', edgecolor='blue')

plt.imshow(wc)

In [None]:
# Neutral sentiment word cloud (tweets whose polarity is exactly 0).
plt.figure(figsize=(20, 20))
neutral_tweets = tweet_df.loc[tweet_df['Polarity'] == 0.0, 'tweet']
wc = WordCloud(
    background_color='white',
    max_words=500,
    width=1600,
    height=800,
).generate(" ".join(neutral_tweets))
plt.imshow(wc)

In [None]:
# Negative sentiment word cloud (tweets with polarity below 0).
plt.figure(figsize=(20, 20))
negative_tweets = tweet_df.loc[tweet_df['Polarity'] < 0.0, 'tweet']
wc = WordCloud(
    background_color='white',
    max_words=500,
    width=1600,
    height=800,
).generate(" ".join(negative_tweets))
plt.imshow(wc)

In [None]:
# Flatten every tweet's space-separated tokens into one list of words.
list_of_words = [
    word
    for tokens in tweet_df['tweet'].str.split(' ')
    for word in tokens
]

In [None]:
# Number of distinct words across all tweets.
unique_words = set(list_of_words)
total_words = len(unique_words)
total_words

In [None]:
# Split the data into train and test sets.

X = tweet_df['tweet'].str.split(' ')
y = tweet_df['Polarity']

from sklearn.model_selection import train_test_split
# A fixed random_state makes the 75/25 split identical every time this cell
# runs, regardless of how much of the global NumPy random stream has already
# been consumed by earlier cells or re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
# X_train

In [None]:
# X_test

In [None]:
# Shape of the training inputs.
X_train.shape

In [None]:
# Shape of the test inputs.
X_test.shape

In [None]:
# Shape of the training targets.
y_train.shape

In [None]:
# Fit the tokenizer vocabulary on the training tweets only, then encode both
# splits as sequences of integer word indices.
tokenizer = Tokenizer(num_words=total_words)
tokenizer.fit_on_texts(X_train)

train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [None]:
# Show the second training document next to its integer encoding.
print('The encoding for document\n', X_train[1:2], 'is:', train_sequences[1])

In [None]:
# Bring every encoded tweet in both splits to the same fixed length.
MAX_SEQUENCE_LENGTH = 29
padded_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
padded_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Print the first three padded training sequences, numbered from 1.
for doc_number, padded_doc in enumerate(padded_train[:3], start=1):
    print('The padded encoding for document:', doc_number, 'is', padded_doc)

In [None]:
# Display the training targets (the Polarity values selected by the split).
y_train

In [None]:
# One-hot encode the targets for the 2-unit softmax output.
# y_train / y_test hold the continuous TextBlob polarity score, while
# to_categorical expects integer class ids. Passing the floats directly lets
# them be truncated to integers, so almost every tweet collapses into class 0.
# The score is therefore binarised explicitly first:
#   class 1 = positive polarity (> 0), class 0 = neutral or negative.
y_train_cat = to_categorical((y_train > 0).astype(int), 2)
y_test_cat = to_categorical((y_test > 0).astype(int), 2)

In [None]:
# Shape of the one-hot encoded training targets.
y_train_cat.shape

In [None]:
# Shape of the one-hot encoded test targets.
y_test_cat.shape

In [None]:
# Display the one-hot encoded training targets.
y_train_cat

In [None]:
# Sequential classifier: embedding -> LSTM -> dense layer with dropout ->
# 2-way softmax output.
model = Sequential([
    Embedding(total_words, output_dim = 512),
    LSTM(256),
    Dense(128, activation = 'relu'),
    Dropout(0.3),
    Dense(2, activation='softmax'),
])

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])
model.summary()

In [None]:
# Train the model for 2 epochs, holding out 5% of the training data for
# validation.
model.fit(padded_train, y_train_cat, validation_split=0.05, epochs = 2)

In [None]:
# Run the trained model on the padded test sequences.
pred = model.predict(padded_test)

In [None]:
# pred

In [None]:
# Convert each row of model outputs into a predicted class: argmax returns the
# index of the largest value, i.e. the class with the highest probability.
prediction = [np.argmax(class_scores) for class_scores in pred]

In [None]:
# prediction

In [None]:
# Recover the true class index of every test sample from its one-hot row.
original = [np.argmax(one_hot_row) for one_hot_row in y_test_cat]

In [None]:
# original

In [None]:
# Accuracy score on the test data: share of predictions matching the true class.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(original, prediction)

In [None]:
# Display the test accuracy.
accuracy

In [None]:
# Plot the confusion matrix of true classes (rows) against predicted classes
# (columns).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(original, prediction)
# fmt='d' prints each cell count as a plain integer; the default annotation
# format switches to scientific notation for larger counts.
ax = sns.heatmap(cm, annot = True, fmt = 'd')
ax.set_xlabel('Predicted class')
ax.set_ylabel('Actual class')