## Preprocessing and Feature Extraction

In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from textblob import TextBlob 

from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import nltk

from argparse import Namespace
from functools import partial
from pathlib import Path
from pprint import pprint

import os
import pickle
import random
import re
import string

## Sentiment Analysis:

Steps taken:
- Converting emojis to text
- We decide to remove all the mentions and hashtagged words, as these will be analysed separately
- Remove Links, as these don't contribute to SA
- Conducting the SA on our preprocessed data

In [2]:
df = pd.read_csv("/Users/harveymiller/Documents/GitHub/text-success/Data/tweets_ukraine_monthly.csv")

Creating a new column so that we can see the adjusted tweet and original versiom

In [3]:
df.insert(loc=6,
          column='Adjusted Tweet',
          value=df['rendered_content'])

Converting emojis to text

In [4]:
import emot
import emoji

def demote(text):
    text = emoji.demojize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(demote)

Removing mentions and hashtagged words

In [5]:
import re

def remove_mentions_hashtags(text):
    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_mentions_hashtags)

Removing links

In [6]:
import re

def remove_links(text):
    text = re.sub('http://\S+|https://\S+', '', text)
    text = re.sub('http[s]?://\S+', '', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'bit.ly/\S+', '', text) # remove bitly links
    text = text.strip('[link]') # remove [links]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_links)

Insert Polarity Score Column

In [7]:
df.insert(loc=7,
          column='Polarity Score',
          value=df['Adjusted Tweet'])

Sentiment Analysis using NLTK's VADER

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sentiment_analysis(text):  
    text = sia.polarity_scores(text)
    return text

df['Polarity Score'] = df['Polarity Score'].apply(sentiment_analysis)
    

Creating columns for:
- Negative Score
- Neutral Score
- Positive Score
- Compound Score [-1,1]

In [9]:
df.insert(loc=8,
          column='Negative Score',
          value=df['Polarity Score'])

df.insert(loc=9,
          column='Neutral Score',
          value=df['Polarity Score'])

df.insert(loc=10,
          column='Positive Score',
          value=df['Polarity Score'])

df.insert(loc=11,
          column='Compound Score',
          value=df['Polarity Score'])

In [10]:
def negative_score(text):
    text = text['neg']
    return text

df['Negative Score'] = df['Negative Score'].apply(negative_score)

In [11]:
def neutral_score(text):
    text = text['neu']
    return text

df['Neutral Score'] = df['Neutral Score'].apply(neutral_score)

In [12]:
def positive_score(text):
    text = text['pos']
    return text

df['Positive Score'] = df['Positive Score'].apply(positive_score)

In [13]:
def compound_score(text):
    text = text['compound']
    return text

df['Compound Score'] = df['Compound Score'].apply(compound_score)

In [None]:
df.head()

Sentiment Analysis using TextBlob

In [25]:
df.insert(loc=12,
          column='Polarity Score_textblob',
          value=df['Adjusted Tweet'])

In [None]:
df.insert(loc=13,
          column='Subjectivity Score_textblob',
          value=df['Adjusted Tweet'])

In [33]:
from textblob import TextBlob

#Create a function to get the subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Create a function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

#df['Polarity Score_textblob'] = df['Polarity Score_textblob'].apply(getPolarity)
df['Subjectivity Score_textblob'] = df['Subjectivity Score_textblob'].apply(getSubjectivity)


In [35]:
df.head()

Unnamed: 0,id,date,user,user_followers,raw_content,rendered_content,Adjusted Tweet,Polarity Score,Negative Score,Neutral Score,...,Polarity Score_textblob,Subjectivity Score_textblob,likes,retweets,replies,quoteCount,hashtags,lang,media,mentionedUsers
0,1486661338390831105,2022-01-27 11:24:18+00:00,WorldToBe1,10769,#Russia-#Ukraine debate sparks fiery exchange ...,#Russia-#Ukraine debate sparks fiery exchange ...,"['debate', 'sparks', 'fiery', 'exchange', '201...","{'neg': 0.094, 'neu': 0.906, 'pos': 0.0, 'comp...",0.094,0.906,...,0.0,0.0,0,0,0,0,"['Russia', 'Ukraine', 'CNN', 'USA', 'EU', 'NAT...",en,,
1,1486105614803775490,2022-01-25 22:36:03+00:00,embeegle,2246,Can you say pipeline? A larger cut coming to ...,Can you say pipeline? A larger cut coming to ...,"['say', 'pipeline', 'larger', 'cut', 'coming',...","{'neg': 0.174, 'neu': 0.826, 'pos': 0.0, 'comp...",0.174,0.826,...,0.0,0.5,1,1,0,0,"['PutinsPuppet', 'ukrainewar']",en,,
2,1486056169013661697,2022-01-25 19:19:34+00:00,Vlad_Mykhnenko,1068,Foreign exchange markets somehow are not betti...,Foreign exchange markets somehow are not betti...,"['foreign', 'exchange', 'markets', 'somehow', ...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,...,-0.125,0.125,1,0,0,0,"['russianinvasion', 'ukrainewar']",en,[Photo(previewUrl='https://pbs.twimg.com/media...,
3,1486019310069989376,2022-01-25 16:53:07+00:00,RaVe_74,5309,#borisjohnson's expertise in foreign affairs -...,#borisjohnson's expertise in foreign affairs -...,"['expertise', 'foreign', 'affairs', 'trying', ...","{'neg': 0.215, 'neu': 0.785, 'pos': 0.0, 'comp...",0.215,0.785,...,-0.125,0.125,3,1,0,0,"['borisjohnson', 'freenazanin', 'BrexitShamble...",en,,
4,1485989417084985347,2022-01-25 14:54:20+00:00,miamicool,1035,"Seems that #ukrainewar just became that ""line ...","Seems that #ukrainewar just became that ""line ...","['seems', 'became', 'line', 'sand', 'whole', '...","{'neg': 0.401, 'neu': 0.599, 'pos': 0.0, 'comp...",0.401,0.599,...,-0.4,0.525,0,0,0,0,['ukrainewar'],en,,


## Further manipulating the tweet

Steps taken:
- Lowercase
- Punctuation
- Tokenization
- Stopword filtering
- Stemming

Changing all text to lowercase

In [15]:
def lowercase(text):    
    text = text.lower()
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(lowercase)

Removing all Punctuation

In [16]:
import string

def punctuation_remove(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(punctuation_remove)

Tokenizing

In [17]:
from nltk import word_tokenize

def tokenize(text):
    text = word_tokenize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(tokenize)

Stopword Filtering

In [18]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = stopwords.words('english')
    text = [word for word in text if word not in stop_words]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_stopwords)

Stemming

In [19]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def stem(text):
    stemmed = [porter.stem(word) for word in text]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(stem)

In [22]:
df.head()

Unnamed: 0,id,date,user,user_followers,raw_content,rendered_content,Adjusted Tweet,Polarity Score,Negative Score,Neutral Score,Positive Score,Compound Score,likes,retweets,replies,quoteCount,hashtags,lang,media,mentionedUsers
0,1486661338390831105,2022-01-27 11:24:18+00:00,WorldToBe1,10769,#Russia-#Ukraine debate sparks fiery exchange ...,#Russia-#Ukraine debate sparks fiery exchange ...,"['debate', 'sparks', 'fiery', 'exchange', '201...","{'neg': 0.094, 'neu': 0.906, 'pos': 0.0, 'comp...",0.094,0.906,0.0,-0.34,0,0,0,0,"['Russia', 'Ukraine', 'CNN', 'USA', 'EU', 'NAT...",en,,
1,1486105614803775490,2022-01-25 22:36:03+00:00,embeegle,2246,Can you say pipeline? A larger cut coming to ...,Can you say pipeline? A larger cut coming to ...,"['say', 'pipeline', 'larger', 'cut', 'coming',...","{'neg': 0.174, 'neu': 0.826, 'pos': 0.0, 'comp...",0.174,0.826,0.0,-0.2732,1,1,0,0,"['PutinsPuppet', 'ukrainewar']",en,,
2,1486056169013661697,2022-01-25 19:19:34+00:00,Vlad_Mykhnenko,1068,Foreign exchange markets somehow are not betti...,Foreign exchange markets somehow are not betti...,"['foreign', 'exchange', 'markets', 'somehow', ...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,1,0,0,0,"['russianinvasion', 'ukrainewar']",en,[Photo(previewUrl='https://pbs.twimg.com/media...,
3,1486019310069989376,2022-01-25 16:53:07+00:00,RaVe_74,5309,#borisjohnson's expertise in foreign affairs -...,#borisjohnson's expertise in foreign affairs -...,"['expertise', 'foreign', 'affairs', 'trying', ...","{'neg': 0.215, 'neu': 0.785, 'pos': 0.0, 'comp...",0.215,0.785,0.0,-0.8172,3,1,0,0,"['borisjohnson', 'freenazanin', 'BrexitShamble...",en,,
4,1485989417084985347,2022-01-25 14:54:20+00:00,miamicool,1035,"Seems that #ukrainewar just became that ""line ...","Seems that #ukrainewar just became that ""line ...","['seems', 'became', 'line', 'sand', 'whole', '...","{'neg': 0.401, 'neu': 0.599, 'pos': 0.0, 'comp...",0.401,0.599,0.0,-0.9732,0,0,0,0,['ukrainewar'],en,,


# Topic modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(df['adjusted_tweet']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()