## Preprocessing and Feature Extraction

In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from textblob import TextBlob 

from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import nltk

from argparse import Namespace
from functools import partial
from pathlib import Path
from pprint import pprint

import os
import pickle
import random
import re
import string

In [2]:
df = pd.read_csv("/Users/harveymiller/Documents/GitHub/text-success/Data/Ukraine Tweets.csv")

## Sentiment Analysis:

Steps taken:
- Converting emojis to text
- We decide to remove all the mentions and hashtagged words, as these will be analysed separately
- Remove Links, as these don't contribute to SA
- Conducting the SA on our preprocessed data

Creating a new column so that we can see the adjusted tweet and original versiom

In [3]:
df.insert(loc=6,
          column='Adjusted Tweet',
          value=df['rendered_content'])

Converting emojis to text

In [4]:
import emot
import emoji

def demote(text):
    text = emoji.demojize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(demote)

Removing mentions and hashtagged words

In [5]:
import re

def remove_mentions_hashtags(text):
    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_mentions_hashtags)

Removing links

In [6]:
import re

def remove_links(text):
    text = re.sub('http://\S+|https://\S+', '', text)
    text = re.sub('http[s]?://\S+', '', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'bit.ly/\S+', '', text) # remove bitly links
    text = text.strip('[link]') # remove [links]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_links)

Insert Polarity Score Column

In [7]:
df.insert(loc=7,
          column='Polarity Score',
          value=df['Adjusted Tweet'])

Sentiment Analysis using NLTK's VADER

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sentiment_analysis(text):  
    text = sia.polarity_scores(text)
    return text

df['Polarity Score'] = df['Polarity Score'].apply(sentiment_analysis)
    

Creating columns for:
- Negative Score
- Neutral Score
- Positive Score
- Compound Score [-1,1]

In [9]:
df.insert(loc=8,
          column='Negative Score',
          value=df['Polarity Score'])

df.insert(loc=9,
          column='Neutral Score',
          value=df['Polarity Score'])

df.insert(loc=10,
          column='Positive Score',
          value=df['Polarity Score'])

df.insert(loc=11,
          column='Compound Score',
          value=df['Polarity Score'])

In [10]:
def negative_score(text):
    text = text['neg']
    return text

df['Negative Score'] = df['Negative Score'].apply(negative_score)

In [11]:
def neutral_score(text):
    text = text['neu']
    return text

df['Neutral Score'] = df['Neutral Score'].apply(neutral_score)

In [12]:
def positive_score(text):
    text = text['pos']
    return text

df['Positive Score'] = df['Positive Score'].apply(positive_score)

In [13]:
def compound_score(text):
    text = text['compound']
    return text

df['Compound Score'] = df['Compound Score'].apply(compound_score)

In [14]:
df.head()

Unnamed: 0,id,date,user,user_followers,user_created,rendered_content,Adjusted Tweet,Polarity Score,Negative Score,Neutral Score,Positive Score,Compound Score,likes,retweets,replies,quoteCount,hashtags,lang,media,mentionedUsers
0,1477420789863436289,2022-01-01 23:25:40+00:00,anno1540,8838,2014-06-12 17:05:22+00:00,"Lithuania will never abandon Ukraine, voluntee...","Lithuania will never abandon Ukraine, voluntee...","{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'comp...",0.0,0.661,0.339,0.5583,5,1,0,0,"['Lithuania', 'Ukraine']",en,,
1,1477414596424220679,2022-01-01 23:01:03+00:00,weather_odessa,119,2019-07-10 08:34:22+00:00,#odessa #odesa #ukraine #одесса\nNow: 4.2°C\nT...,#одесса\nNow: 4.2°C\nToday's Min: 4.2°C at ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0,0,0,0,0,"['odessa', 'odesa', 'ukraine', 'одесса']",en,,
2,1477414332376010752,2022-01-01 23:00:00+00:00,AlArabiya_Eng,927174,2009-02-28 08:31:32+00:00,After tough talk between Presidents Joe Biden ...,After tough talk between Presidents Joe Biden ...,"{'neg': 0.099, 'neu': 0.776, 'pos': 0.125, 'co...",0.099,0.776,0.125,0.2732,4,0,3,0,"['Russia', 'Ukraine']",en,,
3,1477409748572151809,2022-01-01 22:41:47+00:00,beatravelling,6329,2014-02-28 21:25:33+00:00,The beach can be nice in the fall too 😊🇺🇦\n\n#...,The beach can be nice in the fall too :smiling...,"{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'comp...",0.0,0.781,0.219,0.4215,0,0,0,0,"['lanzheron', 'langeron', 'beach', 'odessa', '...",en,,
4,1477409332820119552,2022-01-01 22:40:08+00:00,TornCurtain1991,677,2012-02-08 15:30:41+00:00,"A note: Stepan #Bandera, DOB 01011909, was lea...","A note: Stepan , DOB 01011909, was leader of O...","{'neg': 0.171, 'neu': 0.829, 'pos': 0.0, 'comp...",0.171,0.829,0.0,-0.7783,1,2,0,0,"['Bandera', 'Ukraine']",en,,


Sentiment Analysis using TextBlob

In [15]:
df.insert(loc=12,
          column='Polarity Score_textblob',
          value=df['Adjusted Tweet'])

In [16]:
df.insert(loc=13,
          column='Subjectivity Score_textblob',
          value=df['Adjusted Tweet'])

In [17]:
from textblob import TextBlob

#Create a function to get the subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Create a function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

df['Polarity Score_textblob'] = df['Polarity Score_textblob'].apply(getPolarity)
df['Subjectivity Score_textblob'] = df['Subjectivity Score_textblob'].apply(getSubjectivity)


In [18]:
df.head()

Unnamed: 0,id,date,user,user_followers,user_created,rendered_content,Adjusted Tweet,Polarity Score,Negative Score,Neutral Score,...,Polarity Score_textblob,Subjectivity Score_textblob,likes,retweets,replies,quoteCount,hashtags,lang,media,mentionedUsers
0,1477420789863436289,2022-01-01 23:25:40+00:00,anno1540,8838,2014-06-12 17:05:22+00:00,"Lithuania will never abandon Ukraine, voluntee...","Lithuania will never abandon Ukraine, voluntee...","{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'comp...",0.0,0.661,...,0.0,0.0,5,1,0,0,"['Lithuania', 'Ukraine']",en,,
1,1477414596424220679,2022-01-01 23:01:03+00:00,weather_odessa,119,2019-07-10 08:34:22+00:00,#odessa #odesa #ukraine #одесса\nNow: 4.2°C\nT...,#одесса\nNow: 4.2°C\nToday's Min: 4.2°C at ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,...,0.0,0.0,0,0,0,0,"['odessa', 'odesa', 'ukraine', 'одесса']",en,,
2,1477414332376010752,2022-01-01 23:00:00+00:00,AlArabiya_Eng,927174,2009-02-28 08:31:32+00:00,After tough talk between Presidents Joe Biden ...,After tough talk between Presidents Joe Biden ...,"{'neg': 0.099, 'neu': 0.776, 'pos': 0.125, 'co...",0.099,0.776,...,-0.194444,0.666667,4,0,3,0,"['Russia', 'Ukraine']",en,,
3,1477409748572151809,2022-01-01 22:41:47+00:00,beatravelling,6329,2014-02-28 21:25:33+00:00,The beach can be nice in the fall too 😊🇺🇦\n\n#...,The beach can be nice in the fall too :smiling...,"{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'comp...",0.0,0.781,...,0.6,1.0,0,0,0,0,"['lanzheron', 'langeron', 'beach', 'odessa', '...",en,,
4,1477409332820119552,2022-01-01 22:40:08+00:00,TornCurtain1991,677,2012-02-08 15:30:41+00:00,"A note: Stepan #Bandera, DOB 01011909, was lea...","A note: Stepan , DOB 01011909, was leader of O...","{'neg': 0.171, 'neu': 0.829, 'pos': 0.0, 'comp...",0.171,0.829,...,-0.1,0.033333,1,2,0,0,"['Bandera', 'Ukraine']",en,,


## Further manipulating the tweet

Steps taken:
- Lowercase
- Punctuation
- Tokenization
- Stopword filtering
- Stemming

Changing all text to lowercase

In [19]:
def lowercase(text):    
    text = text.lower()
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(lowercase)

Removing all Punctuation

In [20]:
import string

def punctuation_remove(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(punctuation_remove)

Tokenizing

In [21]:
from nltk import word_tokenize

def tokenize(text):
    text = word_tokenize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(tokenize)

Stopword Filtering

In [22]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = stopwords.words('english')
    text = [word for word in text if word not in stop_words]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_stopwords)

Stemming

In [23]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def stem(text):
    stemmed = [porter.stem(word) for word in text]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(stem)

## Months since creation of account relative to tweet

In [25]:
#Finding date of account creation in months
df.insert(loc=5,
          column='Date of Creation in months',
          value=df['user_created'])

In [26]:
from datetime import *

#returning the months of account creation
def account_creation(text):
    text = datetime.strptime(text, "%Y-%m-%d %H:%M:%S+00:00")
    year = str(text)[0:4]
    month = str(text)[5:7]
    total_months = (int(year)*12)+(int(month))
    return (total_months)

df['Date of Creation in months'] = df['Date of Creation in months'].apply(account_creation)

In [27]:
#Finding date of tweet in months
df.insert(loc=6,
          column='Date of Tweet in Months',
          value=df['date'])

In [28]:
from datetime import *

#return the year of tweet
df['Date of Tweet in Months'] = df['Date of Tweet in Months'].apply(account_creation)

In [29]:
#Calculating Months since creation of account relative to tweet
df.insert(loc=7,
          column='Months Since Creation of Account',
          value= (df['Date of Tweet in Months']-df['Date of Creation in months']))

## Time of Day

In [30]:
#create a column for hours:
df.insert(loc=2,
          column='hour of tweet',
          value=df['date'])

In [31]:
from datetime import *

#return the hour of the tweet
def hour(text):
    text = datetime.strptime(text, "%Y-%m-%d %H:%M:%S+00:00")
    hour = str(text.time())[0:2]
    return int(hour)

df['hour of tweet'] = df['hour of tweet'].apply(hour)

In [32]:
#insert column for the time of day
df.insert(loc=3,
          column='time of day',
          value=df['hour of tweet'])

In [33]:
def time_of_day(text):  
    if ((text < 4) & (text > 8 )):
        return 'Early Morning'
    elif ((text < 8) & (text > 12 )):
        return 'Morning'
    elif ((text < 12) & (text > 16 )):
        return 'Noon'
    elif ((text < 16) & (text > 20 )):
        return 'Eve'
    elif ((text < 20) & (text > 24 )):
        return 'Night'
    elif ((text < 24) & (text > 4 )):
        return 'Late Night'
    

df['time of day'] = df['time of day'].apply(time_of_day)


## Video, GIF and Photo Count

Photo Count

In [35]:
#Creating a photo count column...
df.insert(loc=23,
          column='Photo Count',
          value=df['media'])

In [36]:
#Counting number of Photos in media column
#No need to tokenize
def photo_count(text):
    text = str(text)
    text = text.count('Photo')
    return text

df['Photo Count'] = df['Photo Count'].apply(photo_count)


Video Count

In [37]:
#Creating a video count column...
df.insert(loc=24,
          column='Video Count',
          value=df['media'])

In [38]:
#We need to tokenize the media column so that we can count how many videos there are...
from nltk import word_tokenize

def tokenize(text):
    text = str(text)
    text = word_tokenize(text)
    return text

df['Video Count'] = df['Video Count'].apply(tokenize)

In [39]:
#Counting number of Videos in media column
def video_count(text):
    text = text.count('Video')
    return text

df['Video Count'] = df['Video Count'].apply(video_count)

Gif Count

In [40]:
#Creating a GIF count column...
df.insert(loc=25,
          column='GIF Count',
          value=df['media'])

In [41]:
#We need to tokenize the media column so that we can count how many GIFs there are...
from nltk import word_tokenize

def tokenize(text):
    text = str(text)
    text = word_tokenize(text)
    return text

df['GIF Count'] = df['GIF Count'].apply(tokenize)

In [42]:
#Counting number of GIFs in media column
def gif_count(text):
    text = text.count('Gif')
    return text

df['GIF Count'] = df['GIF Count'].apply(gif_count)

## Creating our final Dataframe

Steps taken:
- Drop: media, date, user, polarity score, user_created, Date of Creation in months, Date of Tweet in Months, rendered_content.	

In [46]:
df.drop(['media','date','user','Polarity Score','user_created','Date of Creation in months','Date of Tweet in Months','rendered_content'], axis=1)

Unnamed: 0,id,hour of tweet,time of day,user_followers,Months Since Creation of Account,Adjusted Tweet,Negative Score,Neutral Score,Positive Score,Compound Score,...,likes,retweets,replies,quoteCount,Photo Count,Video Count,GIF Count,hashtags,lang,mentionedUsers
0,1477420789863436289,23,Late Night,8838,91,"[lithuania, never, abandon, ukraine, volunteer...",0.000,0.661,0.339,0.5583,...,5,1,0,0,0,0,0,"['Lithuania', 'Ukraine']",en,
1,1477414596424220679,23,Late Night,119,30,"[одесса, 42°c, todays, min, 42°c, 002852, toda...",0.000,1.000,0.000,0.0000,...,0,0,0,0,0,0,0,"['odessa', 'odesa', 'ukraine', 'одесса']",en,
2,1477414332376010752,23,Late Night,927174,155,"[tough, talk, presidents, joe, biden, vladimir...",0.099,0.776,0.125,0.2732,...,4,0,3,0,0,0,0,"['Russia', 'Ukraine']",en,
3,1477409748572151809,22,Late Night,6329,95,"[beach, nice, fall, smilingfacewithsmilingeyes...",0.000,0.781,0.219,0.4215,...,0,0,0,0,0,0,0,"['lanzheron', 'langeron', 'beach', 'odessa', '...",en,
4,1477409332820119552,22,Late Night,677,119,"[note, stepan, dob, 01011909, leader, organiza...",0.171,0.829,0.000,-0.7783,...,1,2,0,0,0,0,0,"['Bandera', 'Ukraine']",en,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60128,1609034657231675393,3,,2559,101,"[announces, strengthen, armed, –, happened, uk...",0.000,0.813,0.187,0.3182,...,0,0,1,0,0,0,0,"['UkraineRussiaWar️', 'Russia', 'Ukraine', 'wa...",en,
60129,1609032640664838145,3,,2250,114,"[living, govt, bombs, neighbouring, country, s...",0.000,1.000,0.000,0.0000,...,1,0,0,0,0,0,0,"['Venezuela', 'RussiaUkraineWar', 'Ukraine️', ...",en,
60130,1609025141333438465,3,,69667,173,"[alexander, mercouris, comments, ray, mcgovern...",0.000,1.000,0.000,0.0000,...,89,47,2,1,2,0,0,"['Russia', 'Ukraine', 'NATO', 'Lavrov', 'Ukrai...",en,
60131,1609020670486405121,2,,299,138,"[nice, overview, capabilities, bradley, ifvs, ...",0.000,0.823,0.177,0.4215,...,1,1,0,0,0,0,0,"['Ukraine️', 'Ukrainian', 'UkraineRussiaWar️',...",en,
