# Twitter Sentiment Analysis - AIR Project

**1. Importing libraries and installing tweepy**

In [None]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import shuffle

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout, Flatten, SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from wordcloud import WordCloud
import time
import itertools
import re
import seaborn as sns

In [None]:
!pip install tweepy

In [None]:
import tweepy

In [None]:
import os

# Twitter API credentials are read from environment variables so that no secret is
# stored in the notebook. A missing variable raises KeyError before any request is made.
# NOTE: any keys that were ever committed in this cell should be treated as leaked and rotated.
consumerKey = os.environ["TWITTER_CONSUMER_KEY"]
consumerSecret = os.environ["TWITTER_CONSUMER_SECRET"]
accessToken = os.environ["TWITTER_ACCESS_TOKEN"]
accessTokenSecret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

#create authentication object
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)
#Set access token and access token secret
authenticate.set_access_token(accessToken, accessTokenSecret)
#Create the API object while passing in the auth info
api = tweepy.API(authenticate)

#extract 10 tweets
posts = api.user_timeline(screen_name = "BillGates", count = 10, lang = "en", tweet_mode="extended")
for tweet in posts:
    print(tweet.full_text + '\n')

**2. Loading Data**

In [None]:
# The CSV ships without a header row, so the column names are supplied while reading.
df = pd.read_csv(
    '../input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding = 'latin',
    header = None,
    names = ['sentiment', 'ID', 'date', 'query', 'username', 'tweet'],
)
df.head(10)

Only keep the columns needed for sentiment analysis and change sentiment to positive or negative.

In [None]:
# Numeric codes used in the raw 'sentiment' column and their readable names.
sent = {0: 'Negative', 4: 'Positive'}

def label_decoder(label):
    """Return the readable sentiment name for a raw code (KeyError for unknown codes)."""
    return sent[label]

# Keep only the label and the text, then replace the numeric codes with their names.
data = df.drop(columns = ['ID', 'date', 'query', 'username'])
data['sentiment'] = data['sentiment'].apply(label_decoder)
data.head()

In [None]:
# Bar chart of how many tweets carry each sentiment label.
plt.figure()
# Pass the column by keyword: newer seaborn releases no longer treat a positional
# Series as the x variable.
sns.countplot(x = 'sentiment', data = data)
print(data['sentiment'].value_counts())

**3. Preprocessing**

In [None]:
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')
# Original catch-all cleaning pattern, kept (now as a raw string) in case other cells
# reference it. preprocess() no longer applies it in one go, because stripping every
# non-alphanumeric character first made the emoticon rules unreachable.
text_cleaning_re = r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+'

# Set membership is O(1); the list returned by nltk is scanned linearly.
_STOP_WORD_SET = frozenset(stop_words)
_MENTION_URL_RE = re.compile(r'@\S+|https?:\S+')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9]+')

# Emoticon rules. The guards require that the emoticon is not glued to a letter or
# digit on either side, so strings such as "8days", "8pm" or "note:please" are left alone.
_EYES = r"(?<![a-z0-9])[8:=;]['`\-]?"
_NOT_IN_WORD = r"(?![a-z0-9])"
_EMOTICON_RULES = [
    (re.compile(r'<3+(?![0-9])'), '<heart>'),
    (re.compile(_EYES + r"[)d]+" + _NOT_IN_WORD), '<smile>'),
    (re.compile(_EYES + r"\(+" + _NOT_IN_WORD), '<sadface>'),
    (re.compile(_EYES + r"[\/|l*]" + _NOT_IN_WORD), '<neutralface>'),
    (re.compile(_EYES + r"p+" + _NOT_IN_WORD), '<lolface>'),
]
_EMOTICON_TOKENS = frozenset(token for _, token in _EMOTICON_RULES)

def preprocess(text, stem = False):
    """Normalise a tweet into a space-separated string of lowercase tokens.

    Steps, in order:
      1. lowercase the text and drop @mentions and URLs;
      2. replace emoticons with placeholder tokens (<heart>, <smile>, <sadface>,
         <neutralface>, <lolface>) while the punctuation is still present;
      3. strip every remaining non-alphanumeric character;
      4. drop English stop words and, if ``stem`` is true, stem each token.

    Parameters:
        text: the raw tweet; it is converted with ``str()`` first.
        stem: apply the Snowball stemmer to every kept token when true.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    text = _MENTION_URL_RE.sub(' ', str(text).lower())
    for pattern, placeholder in _EMOTICON_RULES:
        # Pad with spaces so the placeholder always becomes a token of its own.
        text = pattern.sub(' ' + placeholder + ' ', text)

    tokens = []
    for chunk in text.split():
        if chunk in _EMOTICON_TOKENS:
            words = [chunk]
        else:
            # Punctuation inside a chunk splits it, e.g. "don't" -> "don", "t".
            words = _NON_ALNUM_RE.sub(' ', chunk).split()
        for word in words:
            if word not in _STOP_WORD_SET:
                tokens.append(stemmer.stem(word) if stem else word)
    return ' '.join(tokens)

Test a tweet to see the difference before and after preprocessing

In [None]:
# Display the tweet stored under index label 24821 as it currently is in `data`.
data.tweet[24821]

In [None]:
# Same tweet after cleaning, with stemming switched on (second argument).
preprocess(data.tweet[24821], True)

Apply preprocessing to all the data

In [None]:
# Clean every tweet with the default settings (no stemming) and store it back in place.
data['tweet'] = data['tweet'].apply(preprocess)
data.head()

Pull data about specific topics and see how many tweets are positive versus negative for each topic.

In [None]:
# A fixed random_state makes the shuffled order, and therefore the rows shown below,
# reproducible across kernel restarts.
data = shuffle(data, random_state = 5)
# The keywords are OR-ed into one regular expression and matched as substrings.
searchWords = '|'.join(['sport', 'player', 'game', 'ball', 'score', 'winner'])
filtered = data[data['tweet'].str.contains(searchWords, case=False)]
pd.set_option('display.max_colwidth', 0)
# .get(..., 0) avoids a KeyError when one sentiment does not occur in the subset.
sentiment_counts = filtered['sentiment'].value_counts()
print("Amount of positive tweets:", sentiment_counts.get('Positive', 0))
print("Amount of negative tweets:", sentiment_counts.get('Negative', 0))
filtered.head(10)

In [None]:
# Tweets mentioning work-related keywords (substring match, case-insensitive).
searchWords = '|'.join(['job', 'work', 'weekday', 'commute'])
filtered1 = data[data['tweet'].str.contains(searchWords, case=False)]
# .get(..., 0) avoids a KeyError when one sentiment does not occur in the subset.
sentiment_counts = filtered1['sentiment'].value_counts()
print("Amount of positive tweets:", sentiment_counts.get('Positive', 0))
print("Amount of negative tweets:", sentiment_counts.get('Negative', 0))
pd.set_option('display.max_colwidth', 0)
filtered1.head(10)

In [None]:
# Tweets mentioning politics-related keywords (substring match, case-insensitive).
searchWords = '|'.join(['Obama', 'Trump', 'politics', 'president', 'election'])
filtered2 = data[data['tweet'].str.contains(searchWords, case=False)]
# .get(..., 0) avoids a KeyError when one sentiment does not occur in the subset.
sentiment_counts = filtered2['sentiment'].value_counts()
print("Amount of positive tweets:", sentiment_counts.get('Positive', 0))
print("Amount of negative tweets:", sentiment_counts.get('Negative', 0))
pd.set_option('display.max_colwidth', 0)
filtered2.head(10)

In [None]:
processedtext = list(data.tweet)
# Select by label instead of by position: `data` is shuffled in an earlier cell, so the
# first/last 800000 rows are no longer purely negative/positive.
data_pos = list(data.loc[data.sentiment == 'Positive', 'tweet'])
data_neg = list(data.loc[data.sentiment == 'Negative', 'tweet'])

In [None]:
# Word cloud built from every tweet in data_pos, joined into one long string.
positive_text = " ".join(data_pos)
word_cloud = WordCloud(max_words = 1000, width = 1000, height = 600, collocations = False)
word_cloud = word_cloud.generate(positive_text)
plt.figure(figsize = (20,20))
plt.imshow(word_cloud)

In [None]:
# Word cloud built from every tweet in data_neg, joined into one long string.
negative_text = " ".join(data_neg)
wc = WordCloud(max_words = 1000, width = 1000, height = 600, collocations = False)
wc = wc.generate(negative_text)
plt.figure(figsize = (20,20))
plt.imshow(wc)

**4. Split data into training and testing sets**

In [None]:
# Fraction of the rows that goes into the training split; the rest becomes the test split.
Train_size = 0.8
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it
# was meant to cap the tokenizer vocabulary.
max_words = 100000
# Every tweet is padded/truncated to this many token ids before being fed to the model.
max_length = 30

In [None]:
# Reproducible split (fixed random_state); the test share is whatever Train_size leaves over.
train_data, test_data = train_test_split(data, test_size = 1 - Train_size, random_state = 5)
for split_name, split_frame in (('training', train_data), ('testing', test_data)):
    print('size of ' + split_name + ' data :', len(split_frame))

In [None]:
# Peek at the first rows of the training split.
train_data.head()

In [None]:
# The vocabulary is learnt from the training tweets only, so the test split stays unseen.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.tweet)

word_index = tokenizer.word_index
# Keras assigns word ids starting at 1 (0 is reserved for padding), so the largest id is
# len(word_index). The embedding matrix and the Embedding layer are sized with
# vocab_size and therefore need one extra row to hold that last id.
vocab_size = len(word_index) + 1
print('VOCAB_SIZE :', vocab_size)

In [None]:
def _to_padded_ids(texts):
    """Turn texts into token-id sequences padded/truncated to max_length."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen = max_length)

x_train = _to_padded_ids(train_data.tweet)
x_test = _to_padded_ids(test_data.tweet)

print('training x shape :', x_train.shape)
print('testing x shape :', x_test.shape)

In [None]:
# Map the sentiment names to integer class ids, fitted on the training labels only.
encoder = LabelEncoder()
encoder.fit(train_data.sentiment.tolist())

y_train = encoder.transform(train_data.sentiment.tolist())
y_test = encoder.transform(test_data.sentiment.tolist())

# Column vectors of shape (n_samples, 1). Using -1 lets numpy infer the row count, so
# the cell keeps working when the dataset size or the split ratio changes.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print('y_train shape :', y_train.shape)
print('y_test shape :', y_test.shape)

Import word embeddings and create representations of word vectors for the tweets in our dataset. 

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
# Path of the GloVe vector file to load; its dimensionality must match EMBEDDING_DIM.
GLOVE_EMB = './glove.6B.300d.txt'
EMBEDDING_DIM = 300
# Initial learning rate passed to the Adam optimizer.
LR = 1e-3
BATCH_SIZE = 1024
EPOCHS = 12
# NOTE(review): not used by any cell visible in this notebook, and the leading '...'
# does not look like a valid path component — confirm the intended checkpoint location.
MODEL_PATH = '.../output/kaggle/working/best_model.hdf5'

In [None]:
# Maps each GloVe token to its vector (float32 numpy array).
embeddings_index = {}
# The context manager closes the file even if parsing fails. The encoding is given
# explicitly so the result does not depend on the platform default.
with open(GLOVE_EMB, encoding = 'utf-8') as file:
    for line in file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coef = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coef

print('Found {} word vectors'.format(len(embeddings_index)))

In [None]:
# Row r holds the GloVe vector of the word whose tokenizer id is r; words without a
# pretrained vector keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

**5. Begin Training**

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable = False), so
# the pretrained vectors are not updated during training.
embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                           EMBEDDING_DIM,
                                           weights = [embedding_matrix],
                                           input_length = max_length,
                                           trainable = False)

In [None]:
# `shape` must be a tuple: (max_length) is just an int, (max_length,) is a 1-tuple.
sequence_input = Input(shape = (max_length,), dtype = 'int32')
embedding_sequences = embedding_layer(sequence_input)
# Regularise the embedded sequence, extract local n-gram features with a 1D convolution,
# then summarise the sequence in both directions with an LSTM.
x = SpatialDropout1D(0.2)(embedding_sequences)
x = Conv1D(64, 5, activation = 'relu')(x)
x = Bidirectional(LSTM(64, dropout = 0.2, recurrent_dropout = 0.2))(x)
# Fully connected head ending in a single sigmoid unit.
x = Dense(512, activation = 'relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation = 'relu')(x)
outputs = Dense(1, activation = 'sigmoid')(x)
model = tf.keras.Model(sequence_input, outputs)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(optimizer = Adam(learning_rate = LR),
             loss ='binary_crossentropy',
             metrics = ['accuracy'])

# Multiply the learning rate by 0.1 when val_loss stops improving, never going below 1e-4.
reduction = ReduceLROnPlateau(factor = 0.1,
                                min_lr = 0.0001,
                                monitor = 'val_loss',
                                verbose = 1)
# NOTE(review): validation_data is the test split (x_test, y_test), so every val_* metric,
# and the learning-rate schedule driven by val_loss, is computed on the test set.
history = model.fit(x_train,
                   y_train,
                   batch_size = BATCH_SIZE,
                   epochs = EPOCHS,
                   validation_data = (x_test, y_test),
                   callbacks = [reduction])

In [None]:
# Per-epoch curves recorded by model.fit.
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']
epochs = range(len(acc))

# Accuracy: training (green) against validation (red).
plt.figure(figsize = (9,4))
for curve, colour, caption in ((acc, 'green', 'Training Accuracy'),
                               (val_acc, 'red', ' Validation Accuracy')):
    plt.plot(curve, color = colour, label = caption)
plt.legend()

# Loss: training (green) against validation (red).
plt.figure(figsize= (9,4))
for curve, colour, caption in ((loss, 'green', 'Training Loss'),
                               (val_loss, 'red', ' Validation Loss')):
    plt.plot(curve, color = colour, label = caption)
plt.legend()

**6. Accuracy and Predictions**

In [None]:
def decode_sentiment(score):
    """Map a score to 'Positive' when it is strictly above 0.5, otherwise 'Negative'."""
    if score > 0.5:
        return 'Positive'
    return 'Negative'
        
# Score the whole test split in large batches, then turn each score into a label.
scores = model.predict(x_test, verbose = 1, batch_size = 10000)
y_pred_D = list(map(decode_sentiment, scores))

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion Matrix',
                          cmap = plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters:
        cm: square confusion matrix (rows = true labels, columns = predicted labels).
        classes: tick labels, in the same order as the rows/columns of ``cm``.
        normalize: when true, each row is divided by its total so that the cells show
            the fraction of that true class (rows that sum to zero stay zero).
            Previously this argument was accepted but ignored.
        title: title of the plot.
        cmap: matplotlib colour map used for the cells.
    """
    if normalize:
        cm = np.asarray(cm, dtype = 'float64')
        row_totals = cm.sum(axis = 1, keepdims = True)
        # `where` leaves empty rows at 0 instead of producing NaN.
        cm = np.divide(cm, row_totals, out = np.zeros_like(cm), where = row_totals != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap =cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes,rotation = 30)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
             horizontalalignment= 'center',
             color = "white" if cm[i,j]>thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('predicted label')

# Fix the label order explicitly and use the same list for the matrix and for the tick
# labels. unique() returns labels in order of first appearance, which need not match
# the sorted order confusion_matrix uses, so the axes could be mislabelled.
class_labels = sorted(set(test_data.sentiment.tolist()) | set(y_pred_D))
cnf_matrix = confusion_matrix(test_data.sentiment.tolist(), y_pred_D, labels = class_labels)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes = class_labels, title = 'confusion matrix')
plt.show()

In [None]:
# Report the accuracy of the final model on the held-out data (last epoch of
# val_accuracy), not the best accuracy ever reached on the training data.
accuracy = val_acc[-1]
print('Accuracy of model :', accuracy)

Precision, recall, and f1-score of the model

In [None]:
# Per-class precision, recall and f1-score of the predicted labels on the test split.
print(classification_report(list(test_data.sentiment), y_pred_D))

In [None]:
def final_sentiment(score):
    """Turn a model score into a three-way label.

    score > 0.6        -> 'Positive'
    0.4 < score <= 0.6 -> 'Neutral'
    score <= 0.4       -> 'Negative'

    The neutral band includes 0.6 itself; with the former ``score < 0.6`` test a score
    of exactly 0.6 fell through to 'Negative'.
    """
    if score > 0.6:
        return 'Positive'
    elif score > 0.4:
        return 'Neutral'
    else:
        return 'Negative'
        
def pred(text):
    """Score one raw tweet with the trained model.

    The text goes through the same ``preprocess`` step as the training data before it
    is tokenized, so the model sees inputs in the form it was trained on.

    Parameters:
        text: the raw tweet text.

    Returns:
        dict with "label" (result of final_sentiment), "score" (model output as a plain
        float) and "elapsed_time" (seconds spent in this call).
    """
    start_at = time.time()
    cleaned = preprocess(text)
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=max_length)
    # The model outputs one row with one sigmoid value; take it as a Python float.
    score = float(model.predict(padded)[0][0])
    label = final_sentiment(score)

    return {"label": label, "score": score,
       "elapsed_time": time.time()-start_at}

**7. Test model on actual tweets**

Pull tweets from a certain user (Elon Musk)

In [None]:
# Fetch the latest tweets of the account; extended mode exposes the untruncated full_text.
posts = api.user_timeline(screen_name = "elonmusk", count = 5, lang = "en", tweet_mode="extended")

# Running totals: sum of scores, number of tweets, and a count per predicted label.
sums = 0.
counter = 0
pos = 0
neg = 0
neu = 0

print('--------Tweets: ')
print('\n')

for tweet in posts:
    print(tweet.full_text)
    prediction = pred(tweet.full_text)

    sums += prediction.get('score')
    counter += 1

    temp2 = prediction.get('label')
    if temp2 == 'Positive':
        pos += 1
    elif temp2 == 'Negative':
        neg += 1
    else:
        neu += 1
    print(prediction)
    print('\n')

print('---------------------------------------------------------------------------------------')
print('There are '+ str(pos)+' positive tweets, '+str(neg) +' negative tweets, and '+str(neu) +' neutral tweets')
if counter == 0:
    # Without this guard an empty timeline would raise ZeroDivisionError below.
    print('No tweets were retrieved, so no average score can be computed.')
else:
    avg = sums/counter
    print('The average score of all the retrieved tweets is ' +str(avg))
    # Same bands as for single tweets; an average of exactly 0.6 counts as neutral
    # instead of falling through to negative.
    if avg > 0.6:
        print('The final grade is therefore positive.')
    elif avg > 0.4:
        print('The final grade is therefore neutral.')
    else:
        print('The final grade is therefore negative.')

Pull tweets about a certain topic (sports)

In [None]:
# Twitter search combines alternatives with the OR operator; '|' is regex syntax and is
# not a search operator.
search_term = ' OR '.join(['sport', 'player', 'game', 'ball', 'score', 'winner'])
tweet_amount = 5
# Running totals: sum of scores, number of tweets, and a count per predicted label.
sums = 0.
counter = 0
pos = 0
neg = 0
neu = 0
tweets = tweepy.Cursor(api.search_tweets, q = search_term, lang = 'en').items(tweet_amount)
print('--------Tweets: ')
print('\n')
for tweet in tweets:
    print(tweet.text)
    prediction = pred(tweet.text)

    sums += prediction.get('score')
    counter += 1

    temp2 = prediction.get('label')
    if temp2 == 'Positive':
        pos += 1
    elif temp2 == 'Negative':
        neg += 1
    else:
        neu += 1
    print(prediction)
    print('\n')
print('---------------------------------------------------------------------------------------')
print('There are '+ str(pos)+' positive tweets, '+str(neg) +' negative tweets, and '+str(neu) +' neutral tweets')
if counter == 0:
    # Without this guard an empty result set would raise ZeroDivisionError below.
    print('No tweets were retrieved, so no average score can be computed.')
else:
    avg = sums/counter
    print('The average score of all the retrieved tweets is ' +str(avg))
    # Same bands as for single tweets; an average of exactly 0.6 counts as neutral
    # instead of falling through to negative.
    if avg > 0.6:
        print('The final grade is therefore positive')
    elif avg > 0.4:
        print('The final grade is therefore neutral')
    else:
        print('The final grade is therefore negative')

Pull tweets with certain queries (cars)

In [None]:
import os

# The bearer token is read from the environment so that no secret is stored in the
# notebook; a missing variable raises KeyError before any request is made.
# NOTE: any token that was ever committed in this cell should be treated as leaked and rotated.
client = tweepy.Client(bearer_token = os.environ['TWITTER_BEARER_TOKEN'])
query = '#cars -is:retweet lang:en'
tweets = client.search_recent_tweets(query=query, tweet_fields=['context_annotations', 'created_at'], max_results=10)
# Running totals: sum of scores, number of tweets, and a count per predicted label.
sums = 0.
counter = 0
pos = 0
neg = 0
neu = 0
# tweets.data is None when the search matches nothing; iterate over an empty list then.
for tweet in (tweets.data or []):
    print(tweet.text)
    prediction = pred(tweet.text)

    sums += prediction.get('score')
    counter += 1

    temp2 = prediction.get('label')
    if temp2 == 'Positive':
        pos += 1
    elif temp2 == 'Negative':
        neg += 1
    else:
        neu += 1
    print(prediction)
    print('\n')
print('---------------------------------------------------------------------------------------')
print('There are '+ str(pos)+' positive tweets, '+str(neg) +' negative tweets, and '+str(neu) +' neutral tweets')
if counter == 0:
    # Without this guard an empty result set would raise ZeroDivisionError below.
    print('No tweets were retrieved, so no average score can be computed.')
else:
    avg = sums/counter
    print('The average score of all the retrieved tweets is ' +str(avg))
    # Same bands as for single tweets; an average of exactly 0.6 counts as neutral
    # instead of falling through to negative.
    if avg > 0.6:
        print('The final grade is therefore positive')
    elif avg > 0.4:
        print('The final grade is therefore neutral')
    else:
        print('The final grade is therefore negative')