<a href="https://colab.research.google.com/github/lillanieder/data-projects/blob/main/2_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

In [None]:
import pandas as pd

# Load the raw tweet table; the text to analyse lives in the `tweet_retweet` column.
df = pd.read_csv("tweet_data.csv")

# Build the list of unique, lowercased tweet texts.
# - dropna(): a missing cell is read as float NaN, which has no .lower() and
#   would otherwise abort the whole cell with an AttributeError.
# - astype(str): guards against any non-string cell for the same reason.
# - Duplicates are removed *before* lowercasing, so tweets that differ only in
#   letter case are still kept as separate documents.
tweets = (
    df["tweet_retweet"]
    .dropna()
    .astype(str)
    .drop_duplicates()
    .str.lower()
    .to_list()
)

# Tokenization

In [None]:
import nltk
nltk.download('stopwords')

from nltk import TweetTokenizer
from nltk.corpus import stopwords
import string

# Tokenize every tweet with NLTK's TweetTokenizer. A single tokenizer instance
# is reused for all documents instead of constructing a new one per tweet.
tweet_tokenizer = TweetTokenizer()
raw_tokenized_docs = [tweet_tokenizer.tokenize(doc) for doc in tweets]

# Stop words: NLTK's English list plus the corpus-specific filler 'via'.
stop_words = stopwords.words('english')
stop_words.extend(['via'])

# Set copy of the stop words for constant-time membership tests in the filter
# below; `stop_words` itself stays a list.
stop_word_set = set(stop_words)

# NOTE: `word not in punctuation_chars` is a *substring* test against this
# string, not a per-character test. It drops every single punctuation
# character, the literal '...', and any other token that happens to be a
# contiguous slice of the string (e.g. '()'); tokens such as '!!' survive.
punctuation_chars = string.punctuation + '...'

# Preprocess tokens: drop stop words, punctuation tokens, and anything
# containing a bullet character; lowercase whatever remains.
tokenized_docs = [
    [word.lower() for word in doc
     if word not in stop_word_set
     and word not in punctuation_chars
     and '•' not in word]
    for doc in raw_tokenized_docs
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Analysis

In [None]:
from gensim.corpora import Dictionary

from collections import defaultdict
import itertools

# Build the gensim vocabulary (token <-> integer id) and the bag-of-words
# corpus: one list of (token_id, count) pairs per document.
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Sum each token's per-document counts into a corpus-wide total, keyed by the
# token text (looked up from its id).
total_word_count = defaultdict(int)
for bow in corpus:
    for token_id, count in bow:
        total_word_count[dictionary.get(token_id)] += count

# Rank the (token, total) pairs from most to least frequent.
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

# Print the 10 most frequent tokens across all documents with their totals.
for token, total in sorted_word_count[:10]:
    print(token, total)

climate 3128
change 3027
global 2822
warming 2764
rt 902
link 751
new 310
#climate 224
snow 223
#tcot 222


In [None]:
import gensim

# Detect frequent two-token collocations and wrap the trained detector in a
# Phraser for applying it to documents. `min_count` and `threshold` control how
# readily a pair is merged (see the gensim Phrases documentation).
bigram = gensim.models.Phrases(tokenized_docs, min_count=10, threshold=5)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Re-tokenize every document with detected pairs merged into one token
# (e.g. 'climate_change'), then rebuild the vocabulary and bag-of-words corpus.
bigrams = [bigram_mod[doc] for doc in tokenized_docs]
bi_dictionary = Dictionary(bigrams)
bi_corpus = [bi_dictionary.doc2bow(doc) for doc in bigrams]

# Corpus-wide totals, keyed by token *id* here (the id is resolved to text only
# when printing). This rebinds `total_word_count` / `sorted_word_count`.
total_word_count = defaultdict(int)
for bow in bi_corpus:
    for token_id, count in bow:
        total_word_count[token_id] += count

# Rank the (token_id, total) pairs from most to least frequent.
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda pair: pair[1], reverse=True)

# Print the 20 most frequent tokens (unigrams and merged bigrams) with totals.
for token_id, total in sorted_word_count[:20]:
    print(bi_dictionary.get(token_id), total)

climate_change 2624
global_warming 2477
rt 873
link 751
climate 330
change 211
new 203
warming 189
snow 185
science 179
bill 169
#tcot 162
us 160
global 157
report 149
news 148
energy 147
#climate_change 146
a_ 136
dc 122


In [None]:
from pprint import pprint
from gensim.models.ldamodel import LdaModel

# Fit a 5-topic LDA model on the bigram-aware bag-of-words corpus.
lda_model = LdaModel(
    corpus=bi_corpus,
    id2word=bi_dictionary,   # maps token ids back to text for readable topics
    num_topics=5,
    random_state=1,          # fixed seed so repeated runs are comparable
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',            # let gensim learn the document-topic prior
    per_word_topics=True,
)

# Show each topic as its id plus a weighted list of its top terms.
pprint(lda_model.print_topics())

[(0,
  '0.036*"global_warming" + 0.010*"weather" + 0.010*"un" + 0.010*"snow" + '
  '0.008*"volcanoes" + 0.008*"think" + 0.006*"times" + 0.006*"would" + '
  '0.006*"cause" + 0.006*"make"'),
 (1,
  '0.113*"global_warming" + 0.037*"rt" + 0.014*"climate" + 0.014*"change" + '
  '0.012*"may" + 0.011*"new" + 0.011*"birds" + 0.011*"bats" + 0.011*"#tcot" + '
  '0.011*"lizards"'),
 (2,
  '0.015*"science" + 0.012*"bill" + 0.011*"scientists" + 0.011*"ice" + '
  '0.009*"global_warming" + 0.009*"one" + 0.007*"get" + 0.007*"obama" + '
  '0.007*"hoax" + 0.006*"5"'),
 (3,
  '0.047*"climate_change" + 0.016*"global" + 0.009*"important" + 0.008*"says" '
  '+ 0.007*"questions" + 0.006*"people" + 0.005*"@newsongreen" + 0.005*"first" '
  '+ 0.005*"like" + 0.005*"earth_day"'),
 (4,
  '0.132*"climate_change" + 0.010*"news" + 0.010*"energy" + 0.009*"us" + '
  '0.008*"green" + 0.008*"report" + 0.007*"warming" + 0.006*"role" + '
  '0.006*"say" + 0.006*"cl"')]


In [None]:
from gensim.models import TfidfModel

# TF-IDF model over the bigram corpus. SMART scheme 'ntc' per the gensim docs:
# raw term frequency, idf document weighting, cosine-normalised document vectors.
tfidf = TfidfModel(bi_corpus, smartirs='ntc')

# A term has one TF-IDF weight per document it occurs in. Accumulate the sum
# and the number of occurrences so every term can be summarised by its mean
# weight. (Plain assignment inside the loop would keep only the weight from
# the last document containing the term, which depends on document order.)
weight_sums = defaultdict(float)
doc_counts = defaultdict(int)
for doc in tfidf[bi_corpus]:
    for term_id, weight in doc:
        weight_sums[term_id] += weight
        doc_counts[term_id] += 1

# Mean TF-IDF weight per term, keyed by the term text and rounded for display.
tfidf_weights = defaultdict(float)
for term_id, weight_sum in weight_sums.items():
    tfidf_weights[bi_dictionary[term_id]] = round(weight_sum / doc_counts[term_id], 4)

# Ascending sort: the 20 terms shown are those with the LOWEST mean weight,
# i.e. the least distinctive terms in the corpus.
sorted_tfidf_weights = sorted(tfidf_weights.items(), key=lambda w: w[1], reverse=False)
sorted_tfidf_weights[:20]

[('global_warming', 0.0344),
 ('climate_change', 0.0376),
 ('link', 0.0821),
 ('rt', 0.1022),
 ('change', 0.1215),
 ('climate', 0.122),
 ('report', 0.1416),
 ('dc', 0.1461),
 ('science', 0.1493),
 ('u', 0.15),
 ('press', 0.1533),
 ('us', 0.1548),
 ('warming', 0.1579),
 ('talk', 0.1623),
 ('a_', 0.1685),
 ('global', 0.1696),
 ('energy', 0.1708),
 ('fight_climate', 0.1717),
 ('world', 0.1733),
 ('#tcot', 0.174)]