**About the Data:** <br>
The data consists of `5000` tweets which are about `'Global Warming'`. I have performed `Topic Modelling` on the tweets text and interpreted the topics by analysing the distribution of words in the topics. After interpreting the topics, I further assigned a `'Theme'` to each topic by conducting my own research according to the distributed words. <br>

In [None]:
import pandas as pd
import numpy as np

import nltk

import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary


from sklearn.feature_extraction.text import CountVectorizer

**Read the data and check basic properties**

In [None]:
# Load the tweets CSV (expected next to the notebook) into a DataFrame
data = pd.read_csv(filepath_or_buffer='global_warming_tweets.csv')

In [None]:
# Preview the data: the first rows, a divider line, then the underlying array of values
print(data.head(), '----------------------------', data.values, sep='\n')

In [None]:
# Check that the data holds the expected number of observations

num_of_data = 5000

# Prints True when the row count matches, False otherwise
print(len(data) == num_of_data)

### Text preprocessing

**Remove duplicates**

In [None]:
# Print the number of duplicate tweets

# duplicated() flags every tweet that repeats an earlier one; count the flags
num_of_duplicates = sum(1 for is_duplicate in data.tweet.duplicated() if is_duplicate)

print(num_of_duplicates)


In [None]:
# Drop rows whose tweet text repeats an earlier row and print the number of rows left.
# Duplicates are identified on the `tweet` column only, the same column the
# duplicate count is computed on, so the two numbers agree.

new_data = data.drop_duplicates(subset='tweet')
print(len(new_data))


**Create Preprocessing functions** <br>
#### Remove @mention, URLs, and stop words

In [None]:
import re
import string
from nltk.corpus import stopwords

In [None]:
# Stop list = NLTK's English stop words followed by every ASCII punctuation character

stop_list = [*stopwords.words('english'), *string.punctuation]

print(stop_list)


In [None]:
# Tokenize and lemmatize the tweets kept in new_data

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

WNL = WordNetLemmatizer()

# One list of word tokens per tweet
tokenized_data = list(map(word_tokenize, new_data.tweet))

# Same nested shape, with every token replaced by its lemma
lemma_data = [[WNL.lemmatize(token) for token in tweet_tokens] for tweet_tokens in tokenized_data]

print(lemma_data)


In [None]:
# A function created to remove @mentions and http(s):// or www URLs

def remove(s):
    """Strip Twitter @mentions and URLs from the text ``s``.

    Args:
        s: Tweet text.

    Returns:
        ``s`` with every ``@mention``, every ``http://`` or ``https://`` URL
        and every run of non-space characters starting with ``www`` deleted.
        Surrounding whitespace is left untouched.
    """
    new_sentence = re.sub(r'@\w+', '', s)
    # 'https?' makes the trailing 's' optional so both schemes are matched
    # ('http?' only made the second 'p' optional and skipped https:// links).
    new_sentence = re.sub(r'https?://\S+', '', new_sentence)
    new_sentence = re.sub(r'www\S+', '', new_sentence)
    return new_sentence


In [None]:
# Define text processing function

def process_alltweets(listtweets):
    """Clean, tokenize, filter and lemmatize every tweet in ``listtweets``.

    Each tweet is lower-cased, passed through ``remove``, tokenized with
    ``word_tokenize``, stripped of tokens found in ``stop_list``, lemmatized
    with ``WNL`` and re-joined into one space-separated string.

    Args:
        listtweets: Iterable of raw tweet strings.

    Returns:
        A new list holding one processed string per input tweet, in input order.
    """
    # Collect results in a list local to this call. Appending to a shared
    # module-level list would make a second call return the first call's
    # tweets as well, silently doubling the corpus.
    processed = []
    for sentence in listtweets:
        cleaned = remove(sentence.lower())
        tokens = [token for token in word_tokenize(cleaned) if token not in stop_list]
        processed.append(' '.join(WNL.lemmatize(token) for token in tokens))
    return processed


In [None]:
# Read the raw CSV line by line (one tweet per line) and process every tweet.
# NOTE(review): this reads the file directly, so the de-duplicated `new_data`
# frame is not what gets processed here. Confirm that is intended.
tweets_token = []
# The context manager closes the file handle as soon as the lines are read
with open('global_warming_tweets.csv', encoding='utf-8') as file:
    # Skip the header row; slicing also copes with an empty file
    tweets = file.readlines()[1:]
processed_tweets = process_alltweets(tweets)
print(processed_tweets)

In [None]:
# Create the vectorizer and the document-term count matrix

# Reuse the tweets that were already processed instead of running
# process_alltweets() a second time over the same input.
tweets_corpus = processed_tweets
# The documents go to fit_transform(); the CountVectorizer constructor only
# takes configuration options, so it is created with its defaults.
cv = CountVectorizer()
count_vect = cv.fit_transform(tweets_corpus)
print(count_vect)


In [None]:
# Save the vocabulary and the dense result of toarray() here

# get_feature_names() no longer exists in recent scikit-learn releases;
# use get_feature_names_out() when the installed version provides it.
if hasattr(cv, 'get_feature_names_out'):
    vocab = list(cv.get_feature_names_out())
else:
    vocab = cv.get_feature_names()
print("Number of features :\n", len(vocab))
print('---------------------------')
cv_matrix = count_vect.toarray()
print(cv_matrix)


In [None]:
# Wrap the count matrix in a DataFrame whose columns are labelled with the vocabulary

vect_df = pd.DataFrame(data=cv_matrix, columns=vocab)
print(vect_df)


In [None]:
# Total count of each word: sum every column over all rows (sum() defaults to axis=0)

word_counts = vect_df.sum()
print(word_counts)


In [None]:
# Rank the words from most to least frequent and keep the first 20

sorted_word_counts = word_counts.sort_values(ascending=False)
top_20_words = sorted_word_counts.iloc[:20]
print(top_20_words)


**Top 20 words with frequency in descending order**

1. climate, 6124
2. change, 5572
3. global, 5272
4. warming, 5108
5. rt, 1660
6. link, 1642
7. via, 856
8. new, 526
9. snow, 438
10. news, 430
11. bill, 424
12. tcot, 368
13. energy, 348
14. science, 342
15. scientist, 320
16. green, 320
17. say, 316
18. report, 306
19. people, 288
20. earth, 284

In [None]:

from nltk.tokenize import word_tokenize

# Split every processed tweet string back into a list of tokens
processed_tweets_token = list(map(word_tokenize, processed_tweets))
print(processed_tweets_token)


**Create gensim dictionary**

In [None]:
# Build the gensim dictionary (token <-> integer id) from the tokenized tweets

dictionary = Dictionary(documents=processed_tweets_token)

sample_mappings = list(dictionary.items())[:15]
print('Sample word to number mappings:', sample_mappings)
print('Total Vocabulary Size:', len(dictionary))


In [None]:
# Prune the dictionary in place: drop words that occur in fewer than 5 tweets
# (no_below) or in more than half of all tweets (no_above).

dictionary.filter_extremes(no_above=0.5, no_below=5)

print(f'Total Vocabulary Size after filters: {len(dictionary)}')

**Bag of words model**

In [None]:
# Bag-of-words model: every tweet becomes a list of (wordid, frequency) tuples

corpus_bag_of_words = list(map(dictionary.doc2bow, processed_tweets_token))

# Show the first tweet as a bag of words and as tokens, separated by blank lines
print(corpus_bag_of_words[0], '\n', processed_tweets_token[0], sep='\n')


In [None]:
# Train a gensim LDA model on the bag-of-words corpus

num_topics = 5
ldamodel = LdaModel(
    corpus=corpus_bag_of_words,
    id2word=dictionary,
    num_topics=num_topics,
    passes=5,
    random_state=0,  # fixed seed so repeated runs give the same topics
)

In [None]:
# Confirm which class the trained model is an instance of

model_class = type(ldamodel)
print(model_class)

In [None]:
# Show the 10 highest-weighted words of every topic

topic_rows = ldamodel.show_topics(num_topics=num_topics, num_words=10, formatted=True)
for topic_id, weighted_words in topic_rows:
    print(f'{topic_id}: {weighted_words}')


**Refine the results of the topics**

In [None]:
# Rebuild the stop list, this time with extra corpus-specific tokens appended

new_stopwords = ['global', 'warming', '\'s', 'rt', '..', '...', '``', "''", '--', 'link', 'via']
stop_list = [*stopwords.words('english'), *string.punctuation, *new_stopwords]

print(stop_list)

In [None]:
# Updated function, that was created previously, which includes the processing step to expand 'n\'t' to 'not'

def remove(s):
    """Strip @mentions and URLs from ``s`` and rewrite ``n't`` as ``not``.

    Args:
        s: Tweet text.

    Returns:
        ``s`` with every ``@mention``, every ``http://`` or ``https://`` URL
        and every run of non-space characters starting with ``www`` deleted,
        and each occurrence of ``n't`` replaced by ``not`` (no space is
        inserted, so "don't" becomes "donot").
    """
    new_sentence = re.sub(r'@\w+', '', s)
    # 'https?' makes the trailing 's' optional so both schemes are matched
    # ('http?' only made the second 'p' optional and skipped https:// links).
    new_sentence = re.sub(r'https?://\S+', '', new_sentence)
    new_sentence = re.sub(r'www\S+', '', new_sentence)
    new_sentence = re.sub(r'n\'t', 'not', new_sentence) # this is the new line of code
    return new_sentence


In [None]:
def process_alltweets(listtweets):
    """Clean, tokenize, filter and lemmatize every tweet in ``listtweets``.

    Each tweet is lower-cased, passed through ``remove``, tokenized with
    ``word_tokenize``, stripped of tokens found in ``stop_list``, lemmatized
    with ``WNL`` and re-joined into one space-separated string.

    Args:
        listtweets: Iterable of raw tweet strings.

    Returns:
        A new list holding one processed string per input tweet, in input order.
    """
    # Collect results in a list local to this call. Appending to a shared
    # module-level list would make a second call return the first call's
    # tweets as well, silently doubling the corpus.
    processed = []
    for sentence in listtweets:
        cleaned = remove(sentence.lower())
        tokens = [token for token in word_tokenize(cleaned) if token not in stop_list]
        processed.append(' '.join(WNL.lemmatize(token) for token in tokens))
    return processed

In [None]:
# Re-read the raw CSV and process every tweet again with the updated
# stop list and remove() function.
# NOTE(review): this reads the file directly, so the de-duplicated `new_data`
# frame is not what gets processed here. Confirm that is intended.
tweets_token = []
# The context manager closes the file handle as soon as the lines are read
with open('global_warming_tweets.csv', encoding='utf-8') as file:
    # Skip the header row; slicing also copes with an empty file
    tweets = file.readlines()[1:]
processed_tweets = process_alltweets(tweets)
print(processed_tweets)

In [None]:

from nltk.tokenize import word_tokenize

# Split every re-processed tweet string back into a list of tokens
processed_tweets_token = list(map(word_tokenize, processed_tweets))
print(processed_tweets_token)


In [None]:
# Rebuild the gensim dictionary (token <-> integer id) from the re-tokenized tweets

dictionary = Dictionary(documents=processed_tweets_token)

sample_mappings = list(dictionary.items())[:15]
print('Sample word to number mappings:', sample_mappings)
print('Total Vocabulary Size:', len(dictionary))


In [None]:
# Prune the dictionary in place: drop words that occur in fewer than 5 tweets
# (no_below) or in more than half of all tweets (no_above).

dictionary.filter_extremes(no_above=0.5, no_below=5)

print(f'Total Vocabulary Size after filters: {len(dictionary)}')

In [None]:
# Bag-of-words model: every tweet becomes a list of (wordid, frequency) tuples

corpus_bag_of_words = list(map(dictionary.doc2bow, processed_tweets_token))

# Show the first tweet as a bag of words and as tokens, separated by blank lines
print(corpus_bag_of_words[0], '\n', processed_tweets_token[0], sep='\n')


In [None]:
# Create a gensim LDA model by using the bag of words created.
# This cell only trains the model; nothing is printed before `ldamodel` is
# reassigned, so no output here can describe a previously trained model.

num_topics = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus_bag_of_words, num_topics=num_topics, id2word=dictionary, passes=5, random_state=0)

In [None]:
# Confirm which class the trained model is an instance of

model_class = type(ldamodel)
print(model_class)

In [None]:
# Show the 10 highest-weighted words of every topic

topic_rows = ldamodel.show_topics(num_topics=num_topics, num_words=10, formatted=True)
for topic_id, weighted_words in topic_rows:
    print(f'{topic_id}: {weighted_words}')


**Analysis: Interpretation of topics to assign Themes**

**Topic 0 :** <br>
- 0.022*"new" + 0.020*"science" + 0.019*"energy" + 0.018*"obama" + 0.012*"earth" + 0.012*"cause" + 0.011*"study" + 0.011*"say" + 0.011*"agency" + 0.010*"news" <br>
<font color = 'blue'>`Theme`: New Renewable Energy for America Initiative by Barack Obama </font> <br>


**Topic 1 :** <br>
- 0.020*"news" + 0.013*"un" + 0.012*"great" + 0.010*"good" + 0.009*"science" + 0.009*"issue" + 0.009*"health" + 0.008*"environmental" + 0.007*"public" + 0.007*"solution" <br>
<font color = 'blue'>`Theme`: UNEP : United Nations Environment Programme </font> <br>

**Topic 2 :** <br>
- 0.014*"new" + 0.014*"scientist" + 0.014*"report" + 0.013*"u.s." + 0.010*"help" + 0.009*"allergy" + 0.009*"blame" + 0.008*"call" + 0.008*"india" + 0.007*"tcot" <br>
<font  color = 'blue'>`Theme`: The United States and India - Moving Forward Together on Climate Change </font><br>

**Topic 3 :** <br>
- 0.038*"bill" + 0.014*"graham" + 0.013*"law" + 0.012*"stop" + 0.011*"california" + 0.011*"state" + 0.010*"senate" + 0.009*"get" + 0.009*"put" + 0.008*"senator" <br>
<font color = 'blue'>`Theme`: Senator Lindsey Graham Pulls Support for Major Senate Climate Bill </font><br>

**Topic 4 :** <br>
- 0.027*"snow" + 0.019*"people" + 0.018*"tcot" + 0.016*"gore" + 0.014*"dc" + 0.014*"al" + 0.012*"believe" + 0.011*"world" + 0.009*"storm" + 0.009*"conference" <br>
<font color = 'blue'>`Theme`: Al Gore : An Inconvenient Truth </font><br>

**Interpretation of topics based on research**

**Topic 0**
- Words such as 'new', 'energy', 'obama' and 'earth' indicate that Topic 0 is related to a new energy plan which is initiated by Barack Obama to save the Earth from climate problems. Therefore, Topic 0 can be interpreted as a New Renewable Energy Initiative implemented by Barack Obama to tackle climate change that can be found in news based on some research done. 

**Topic 1**
- 'un' represents the abbreviation of United Nations and the link between 'un' and other words such as 'great', 'issue', 'environmental' and 'public' can lead to the interpretation of Topic 1 as an Environment Programme by United Nations to raise awareness among the public to help in environmental issues. 

**Topic 2**
- Because words such as 'u.s.', 'help' and 'india' are assigned to Topic 2, it can be interpreted as a topic about an opportunity for the United States and India to cooperate in tackling climate change; at the same time, it could be a new topic reported by scientists.

**Topic 3**
- The words assigned to Topic 3, such as 'bill', 'graham', 'law', 'senate' and 'senator', show that Topic 3 could be related to news about legislation, specifically a bill that a senator named Graham has stopped supporting.

**Topic 4**
- Based on some research, 'al' and 'gore' turned out to be the name of an environmentalist (Al Gore). Moreover, words such as 'snow', 'world' and 'storm' could be related to climate change, which links them to a documentary (An Inconvenient Truth) made by the environmentalist.

**Analysis of Topic Distribution**

In [None]:
# Get the topic distribution of every document and show it for the first 10

document_topics = ldamodel.get_document_topics(corpus_bag_of_words)
for doc_index in range(10):
    print("\n Document :", doc_index)
    print(document_topics[doc_index])
    

**Compute Perplexity and Coherence Score** 

In [None]:
from gensim.models import CoherenceModel

In [None]:
# Perplexity: log_perplexity() returns a bound (per gensim's documentation,
# a per-word likelihood bound); raising 2 to its negation gives the perplexity.

log_perplexity = ldamodel.log_perplexity(corpus_bag_of_words)
perplexity = pow(2, -log_perplexity)
print('Perplexity: ',perplexity)

# Coherence score of the topics, using the 'c_v' measure

coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=processed_tweets_token,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ',coherence_lda)
