## Preprocessing and Feature Extraction

In [9]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from textblob import TextBlob 

from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import nltk

from argparse import Namespace
from functools import partial
from pathlib import Path
from pprint import pprint

import os
import pickle
import random
import re
import string

Sentiment Analysis:

Steps taken:
- Converting emojis to text
- Lowercase
- We decide to remove all the mentions and hashtagged words, as these will be analysed separately
- Remove Links, as these don't contribute to SA
- Removing Punctuation
- Tokenization
- Stopword filtering
- Stemming
- Conducting the SA on our preprocessed data

In [10]:
df = pd.read_csv("/Users/harveymiller/Documents/GitHub/text-success/Data/tweets_ukraine_monthly.csv")

Creating a new column so that we can see the adjusted tweet and original versiom

In [11]:
df.insert(loc=6,
          column='Adjusted Tweet',
          value=df['rendered_content'])

In [12]:
df.head()

Unnamed: 0,id,date,user,user_followers,raw_content,rendered_content,Adjusted Tweet,likes,retweets,replies,quoteCount,hashtags,lang,media,mentionedUsers
0,1486661338390831105,2022-01-27 11:24:18+00:00,WorldToBe1,10769,#Russia-#Ukraine debate sparks fiery exchange ...,#Russia-#Ukraine debate sparks fiery exchange ...,#Russia-#Ukraine debate sparks fiery exchange ...,0,0,0,0,"['Russia', 'Ukraine', 'CNN', 'USA', 'EU', 'NAT...",en,,
1,1486105614803775490,2022-01-25 22:36:03+00:00,embeegle,2246,Can you say pipeline? A larger cut coming to ...,Can you say pipeline? A larger cut coming to ...,Can you say pipeline? A larger cut coming to ...,1,1,0,0,"['PutinsPuppet', 'ukrainewar']",en,,
2,1486056169013661697,2022-01-25 19:19:34+00:00,Vlad_Mykhnenko,1068,Foreign exchange markets somehow are not betti...,Foreign exchange markets somehow are not betti...,Foreign exchange markets somehow are not betti...,1,0,0,0,"['russianinvasion', 'ukrainewar']",en,[Photo(previewUrl='https://pbs.twimg.com/media...,
3,1486019310069989376,2022-01-25 16:53:07+00:00,RaVe_74,5309,#borisjohnson's expertise in foreign affairs -...,#borisjohnson's expertise in foreign affairs -...,#borisjohnson's expertise in foreign affairs -...,3,1,0,0,"['borisjohnson', 'freenazanin', 'BrexitShamble...",en,,
4,1485989417084985347,2022-01-25 14:54:20+00:00,miamicool,1035,"Seems that #ukrainewar just became that ""line ...","Seems that #ukrainewar just became that ""line ...","Seems that #ukrainewar just became that ""line ...",0,0,0,0,['ukrainewar'],en,,


Converting emojis to text

In [13]:
import emot
import emoji

def demote(text):
    text = emoji.demojize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(demote)

Changing all text to lowercase

In [14]:
def lowercase(text):    
    text = text.lower()
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(lowercase)

Removing mentions and hashtagged words

In [15]:
import re

def remove_mentions_hashtags(text):
    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_mentions_hashtags)

Removing links

In [16]:
import re

def remove_links(text):
    text = re.sub('http://\S+|https://\S+', '', text)
    text = re.sub('http[s]?://\S+', '', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'bit.ly/\S+', '', text) # remove bitly links
    text = text.strip('[link]') # remove [links]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_links)

Removing all Punctuation

In [17]:
import string

def punctuation_remove(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(punctuation_remove)

Tokenizing

In [18]:
from nltk import word_tokenize

def tokenize(text):
    text = word_tokenize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(tokenize)

Stopword Filtering

In [19]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = stopwords.words('english')
    text = [word for word in text if word not in stop_words]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_stopwords)

Stemming

In [20]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def stem(text):
    stemmed = [porter.stem(word) for word in text]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(stem)

Detokenization

In [30]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

def detokenize(text):
    TreebankWordDetokenizer().detokenize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(detokenize)

Insert Polarity Score Column

In [None]:
df.insert(loc=7,
          column='Polarity Score',
          value=df['Adjusted Tweet'])

In [34]:
df.head()

Unnamed: 0,id,date,user,user_followers,raw_content,rendered_content,Adjusted Tweet,Polarity Score,likes,retweets,replies,quoteCount,hashtags,lang,media,mentionedUsers
0,1486661338390831105,2022-01-27 11:24:18+00:00,WorldToBe1,10769,#Russia-#Ukraine debate sparks fiery exchange ...,#Russia-#Ukraine debate sparks fiery exchange ...,"[debate, sparks, fiery, exchange, 2014, backed...","[debate, sparks, fiery, exchange, 2014, backed...",0,0,0,0,"['Russia', 'Ukraine', 'CNN', 'USA', 'EU', 'NAT...",en,,
1,1486105614803775490,2022-01-25 22:36:03+00:00,embeegle,2246,Can you say pipeline? A larger cut coming to ...,Can you say pipeline? A larger cut coming to ...,"[say, pipeline, larger, cut, coming, cost]","[say, pipeline, larger, cut, coming, cost]",1,1,0,0,"['PutinsPuppet', 'ukrainewar']",en,,
2,1486056169013661697,2022-01-25 19:19:34+00:00,Vlad_Mykhnenko,1068,Foreign exchange markets somehow are not betti...,Foreign exchange markets somehow are not betti...,"[foreign, exchange, markets, somehow, betting,...","[foreign, exchange, markets, somehow, betting,...",1,0,0,0,"['russianinvasion', 'ukrainewar']",en,[Photo(previewUrl='https://pbs.twimg.com/media...,
3,1486019310069989376,2022-01-25 16:53:07+00:00,RaVe_74,5309,#borisjohnson's expertise in foreign affairs -...,#borisjohnson's expertise in foreign affairs -...,"[expertise, foreign, affairs, trying, managed,...","[expertise, foreign, affairs, trying, managed,...",3,1,0,0,"['borisjohnson', 'freenazanin', 'BrexitShamble...",en,,
4,1485989417084985347,2022-01-25 14:54:20+00:00,miamicool,1035,"Seems that #ukrainewar just became that ""line ...","Seems that #ukrainewar just became that ""line ...","[seems, became, line, sand, whole, world, war,...","[seems, became, line, sand, whole, world, war,...",0,0,0,0,['ukrainewar'],en,,


Sentiment Analysis using NLTK's VADER

In [33]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sentiment_analysis(text):  
    text = sia.polarity_scores(text)
    return text

df['Polarity Score'] = df['Polarity Score'].apply(sentiment_analysis)
    

AttributeError: 'list' object has no attribute 'encode'