## Preprocessing and Feature Extraction

In [96]:
import pandas as pd
import numpy as np

import snscrape.modules.twitter as sntwitter

import nltk
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import collections
from argparse import Namespace
from functools import partial
from pathlib import Path
from pprint import pprint

import os
import pickle
import random
import re
import string

In [97]:
df = pd.read_csv("https://raw.githubusercontent.com/lifeonthefence/text-success/main/Data/Ukraine%20Tweets.csv")

## Initial Preprocessing

Check for duplicates

In [98]:
df = df.drop_duplicates(subset = 'id')

Precounting of features: Length, Hashtags, URLs and Mentions

In [99]:
df['tweet_length'] = df['rendered_content'].apply(len)

In [100]:
df['num_mentions'] = df['rendered_content'].apply(lambda x: x.count('@'))

In [101]:
df['num_hashtags'] = df['rendered_content'].apply(lambda x: x.count('#'))

In [102]:
df['num_urls'] = df['rendered_content'].apply(lambda x: x.count('https'))

Steps taken:
- Converting emojis to text
- We decide to remove all the mentions and hashtagged words, as these will be analysed separately
- Remove Links, as these don't contribute to SA
- Conducting the SA on our preprocessed data

Creating a new column so that we can see the adjusted tweet and original versiom

In [103]:
df.insert(loc=6,
          column='Adjusted Tweet',
          value=df['rendered_content'])

Converting emojis to text

In [104]:
import emoji

def demote(text):
    text = emoji.demojize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(demote)

Removing mentions and hashtagged words

In [105]:
import re

def remove_mentions_hashtags(text):
    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_mentions_hashtags)

Removing links

In [106]:
def remove_links(text):
    text = re.sub('http://\S+|https://\S+', '', text)
    text = re.sub('http[s]?://\S+', '', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'bit.ly/\S+', '', text) # remove bitly links
    text = text.strip('[link]') # remove [links]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_links)

In [107]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [108]:
nltk.download('punkt')

SENT_DETECTOR = nltk.data.load('tokenizers/punkt/english.pickle')

! python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [109]:
pip install NLP-python

Note: you may need to restart the kernel to use updated packages.


Removing Numbers

In [111]:
df['Adjusted Tweet'] = df['Adjusted Tweet'].str.replace('\d+', '')

  df['Adjusted Tweet'] = df['Adjusted Tweet'].str.replace('\d+', '')


Removing Non-English Characters, Accents and Remaining Punctuation 

In [112]:
df['Adjusted Tweet'] = df['Adjusted Tweet'].str.replace(r'[^\x00-\x7F]+', '')

  df['Adjusted Tweet'] = df['Adjusted Tweet'].str.replace(r'[^\x00-\x7F]+', '')


## Sentiment Analysis:

Insert Polarity Score Column

In [113]:
df.insert(loc=7,
          column='Polarity Score',
          value=df['Adjusted Tweet'])

Sentiment Analysis using NLTK's VADER

In [114]:
### Uncomment to download lexicon for the first time 
#import nltk
#nltk.download('vader_lexicon')

In [115]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sentiment_analysis(text):  
    text = sia.polarity_scores(text)
    return text

df['Polarity Score'] = df['Polarity Score'].apply(sentiment_analysis)
    

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Creating columns for:
- Negative Score
- Neutral Score
- Positive Score
- Compound Score [-1,1]

In [116]:
df.insert(loc=8,
          column='Negative Score',
          value=df['Polarity Score'])

df.insert(loc=9,
          column='Neutral Score',
          value=df['Polarity Score'])

df.insert(loc=10,
          column='Positive Score',
          value=df['Polarity Score'])

df.insert(loc=11,
          column='Compound Score',
          value=df['Polarity Score'])

df['Negative Score'] = df['Negative Score'].apply(lambda x: x['neg'])
df['Neutral Score'] = df['Neutral Score'].apply(lambda x: x['neu'])
df['Positive Score'] = df['Positive Score'].apply(lambda x: x['pos'])
df['Compound Score'] = df['Compound Score'].apply(lambda x: x['compound'])

In [117]:
df.head()

Unnamed: 0,id,date,user,user_followers,user_created,rendered_content,Adjusted Tweet,Polarity Score,Negative Score,Neutral Score,...,replies,quoteCount,hashtags,lang,media,mentionedUsers,tweet_length,num_mentions,num_hashtags,num_urls
0,1477420789863436289,2022-01-01 23:25:40+00:00,anno1540,8838,2014-06-12 17:05:22+00:00,"Lithuania will never abandon Ukraine, voluntee...","Lithuania will never abandon Ukraine, voluntee...","{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'comp...",0.0,0.661,...,0,0,"['Lithuania', 'Ukraine']",en,,,132,0,2,0
1,1477414596424220679,2022-01-01 23:01:03+00:00,weather_odessa,119,2019-07-10 08:34:22+00:00,#odessa #odesa #ukraine #одесса\nNow: 4.2°C\nT...,#\nNow: .C\nToday's Min: .C at ::\nToday's ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,...,0,0,"['odessa', 'odesa', 'ukraine', 'одесса']",en,,,188,0,4,0
2,1477414332376010752,2022-01-01 23:00:00+00:00,AlArabiya_Eng,927174,2009-02-28 08:31:32+00:00,After tough talk between Presidents Joe Biden ...,After tough talk between Presidents Joe Biden ...,"{'neg': 0.099, 'neu': 0.776, 'pos': 0.125, 'co...",0.099,0.776,...,3,0,"['Russia', 'Ukraine']",en,,,277,0,2,0
3,1477409748572151809,2022-01-01 22:41:47+00:00,beatravelling,6329,2014-02-28 21:25:33+00:00,The beach can be nice in the fall too 😊🇺🇦\n\n#...,The beach can be nice in the fall too :smiling...,"{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'comp...",0.0,0.781,...,0,0,"['lanzheron', 'langeron', 'beach', 'odessa', '...",en,,,122,0,5,0
4,1477409332820119552,2022-01-01 22:40:08+00:00,TornCurtain1991,677,2012-02-08 15:30:41+00:00,"A note: Stepan #Bandera, DOB 01011909, was lea...","A note: Stepan , DOB , was leader of Organizat...","{'neg': 0.18, 'neu': 0.82, 'pos': 0.0, 'compou...",0.18,0.82,...,0,0,"['Bandera', 'Ukraine']",en,,,278,0,2,0


Sentiment Analysis

In [118]:
df.insert(loc=12,
          column='Polarity Score_textblob',
          value=df['Adjusted Tweet'])

In [119]:
df.insert(loc=13,
          column='Subjectivity Score_textblob',
          value=df['Adjusted Tweet'])

In [120]:
! pip install -U textblob
! python -m textblob.download_corpora

from textblob import TextBlob

#Create a function to get the subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Create a function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

df['Polarity Score_textblob'] = df['Polarity Score_textblob'].apply(getPolarity)
df['Subjectivity Score_textblob'] = df['Subjectivity Score_textblob'].apply(getSubjectivity)


[nltk_data] Downloading package brown to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


## Further manipulating the tweet

Steps taken:
- Lowercase
- Punctuation
- Tokenization
- Stopword filtering
- Lemmatisation
- Number removal

Changing all text to lowercase

In [121]:
df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(lambda x: x.lower())

Removing all Punctuation

In [122]:
import string

def punctuation_remove(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(punctuation_remove)

Number removal

In [123]:
def remove_numbers(text):
    no_numbers = re.sub(r'\d+', '', text)
    return no_numbers

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_numbers)

Tokenizing

In [124]:
#nltk.download('punkt')

In [125]:
from nltk import word_tokenize

def tokenize(text):
    text = word_tokenize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(tokenize)

Stopword Filtering

In [126]:
nltk.download('stopwords')
    
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = stopwords.words('english')
    text = [word for word in text if word not in stop_words]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Lemmatisation

In [127]:
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def lemmatise(text): 
    lemma = [wnl.lemmatize(word) for word in text]
    return lemma

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(lemmatise)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zofiachoinska/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Months since creation of account relative to tweet

In [128]:
#Finding date of account creation in months
df.insert(loc=5,
          column='Date of Creation in months',
          value=df['user_created'])

In [129]:
from datetime import *

#returning the months of account creation
def account_creation(text):
    text = datetime.strptime(text, "%Y-%m-%d %H:%M:%S+00:00")
    year = str(text)[0:4]
    month = str(text)[5:7]
    total_months = (int(year)*12)+(int(month))
    return (total_months)

df['Date of Creation in months'] = df['Date of Creation in months'].apply(account_creation)

In [130]:
#Finding date of tweet in months
df.insert(loc=6,
          column='Date of Tweet in Months',
          value=df['date'])

In [131]:
#return the year of tweet
df['Date of Tweet in Months'] = df['Date of Tweet in Months'].apply(account_creation)

In [132]:
#Calculating Months since creation of account relative to tweet
df.insert(loc=7,
          column='Months Since Creation of Account',
          value= (df['Date of Tweet in Months']-df['Date of Creation in months']))

## Time of Day

In [133]:
#create a column for hours:
df.insert(loc=2,
          column='hour of tweet',
          value=df['date'])

In [134]:
from datetime import datetime

#return the hour of the tweet
def hour(text):
    text = datetime.strptime(text, "%Y-%m-%d %H:%M:%S+00:00")
    hour = str(text.time())[0:2]
    return int(hour)

df['hour of tweet'] = df['hour of tweet'].apply(hour)

In [135]:
#insert column for the time of day
df.insert(loc=3,
          column='time of day',
          value=df['hour of tweet'])

In [136]:
#calculating the time of day
def time_of_day(text):  
    if ((text > 4) and (text < 8 )):
        return 'Early Morning'
    elif ((text > 8) and (text < 12 )):
        return 'Morning'
    elif ((text > 12) and (text < 16 )):
        return 'Noon'
    elif ((text > 16) and (text < 20 )):
        return 'Eve'
    elif ((text > 20) and (text < 24 )):
        return 'Night'
    elif ((text > 0) and (text < 4 )):
        return 'Late Night'
    
df['time of day'] = df['time of day'].apply(time_of_day)


In [137]:
#Creating counts using one hot encoding

#Early Morning Count
df.insert(loc=4,
          column='Early Morning Count',
          value=df['time of day'])

def early_morning_count(text):
    if text == 'Early Morning':
        return 1
    else:
        return 0

df['Early Morning Count'] = df['Early Morning Count'].apply(early_morning_count)

#Morning Count
df.insert(loc=5,
          column='Morning Count',
          value=df['time of day'])

def morning_count(text):
    if text == 'Morning':
        return 1
    else:
        return 0

df['Morning Count'] = df['Morning Count'].apply(morning_count)

#Noon count
df.insert(loc=6,
          column='Noon Count',
          value=df['time of day'])

def noon_count(text):
    if text == 'Noon':
        return 1
    else:
        return 0

df['Noon Count'] = df['Noon Count'].apply(noon_count)

#Eve count
df.insert(loc=7,
          column='Eve Count',
          value=df['time of day'])

def eve_count(text):
    if text == 'Eve':
        return 1
    else:
        return 0

df['Eve Count'] = df['Eve Count'].apply(eve_count)

#Night count
df.insert(loc=8,
          column='Night Count',
          value=df['time of day'])

def night_count(text):
    if text == 'Night':
        return 1
    else:
        return 0

df['Night Count'] = df['Night Count'].apply(night_count)

#Late Night count
df.insert(loc=9,
          column='Late Night Count',
          value=df['time of day'])

def late_night_count(text):
    if text == 'Late Night':
        return 1
    else:
        return 0

df['Late Night Count'] = df['Late Night Count'].apply(late_night_count)

## Video, GIF and Photo Count

Photo Count

In [138]:
#Creating a photo count column...
df.insert(loc=23,
          column='Photo Count',
          value=df['media'])

In [139]:
#Counting number of Photos in media column
#No need to tokenize
def photo_count(text):
    text = str(text)
    text = text.count('Photo')
    return text

df['Photo Count'] = df['Photo Count'].apply(photo_count)


Video Count

In [140]:
#Creating a video count column...
df.insert(loc=24,
          column='Video Count',
          value=df['media'])

In [141]:
#We need to tokenize the media column so that we can count how many videos there are...
from nltk import word_tokenize

def tokenize(text):
    text = str(text)
    text = word_tokenize(text)
    return text

df['Video Count'] = df['Video Count'].apply(tokenize)

In [142]:
#Counting number of Videos in media column
def video_count(text):
    text = text.count('Video')
    return text

df['Video Count'] = df['Video Count'].apply(video_count)

Gif Count

In [143]:
#Creating a GIF count column...
df.insert(loc=25,
          column='GIF Count',
          value=df['media'])

In [144]:
#We need to tokenize the media column so that we can count how many GIFs there are...
from nltk import word_tokenize

def tokenize(text):
    text = str(text)
    text = word_tokenize(text)
    return text

df['GIF Count'] = df['GIF Count'].apply(tokenize)

In [145]:
#Counting number of GIFs in media column
def gif_count(text):
    text = text.count('Gif')
    return text

df['GIF Count'] = df['GIF Count'].apply(gif_count)

## Topic Modelling

In [146]:
df['Adjusted Tweet']

0        [lithuania, never, abandon, ukraine, volunteer...
1        [c, today, min, c, today, max, c, month, min, ...
2        [tough, talk, president, joe, biden, vladimir,...
3        [beach, nice, fall, smilingfacewithsmilingeyes...
4        [note, stepan, dob, leader, organization, ukra...
                               ...                        
60128    [announces, strengthen, armed, happened, ukrai...
60129    [living, govt, bomb, neighbouring, country, st...
60130    [alexander, mercouri, comment, ray, mcgoverns,...
60131    [nice, overview, capability, bradley, ifvs, ma...
60132    [vasyl, malyuk, head, sbu, secret, police, say...
Name: Adjusted Tweet, Length: 57862, dtype: object

In [147]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(x):
    return x

vectorizer = CountVectorizer(max_df = 0.9, min_df = 25, lowercase = False, tokenizer = dummy)
tf = vectorizer.fit_transform(df['Adjusted Tweet']).toarray()
tf_features_names = vectorizer.get_feature_names()



In [148]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 3

model = LatentDirichletAllocation(n_components=number_of_topics)

In [149]:
model.fit(tf)

LatentDirichletAllocation(n_components=3)

In [150]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [151]:
no_top_words = 10
display_topics(model, tf_features_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,ukraine,4874.3,ukraine,5368.5,russian,6692.7
1,war,4554.9,russia,3445.4,ukrainian,4481.3
2,amp,3651.4,russian,2597.3,ukraine,4230.2
3,u,3331.4,missile,1913.0,force,3588.8
4,people,2948.1,military,1476.6,war,1896.1
5,russia,2383.4,u,1377.5,putin,1776.9
6,help,1836.7,attack,1211.1,soldier,1736.4
7,world,1779.2,war,1138.0,troop,1461.3
8,like,1532.0,news,1093.3,russia,1376.2
9,support,1463.1,via,1048.1,region,1304.9


In [152]:
from sklearn.decomposition import NMF

model_2 = NMF(n_components=4, random_state=0, alpha=.1, l1_ratio=.5)

model_2.fit(tf)



NMF(alpha=0.1, l1_ratio=0.5, n_components=4, random_state=0)

In [153]:
display_topics(model_2, tf_features_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,ukraine,18.7,russian,15.0,war,15.7,russia,13.2
1,u,1.6,ukrainian,5.9,putin,1.4,amp,7.6
2,help,1.4,force,3.4,world,1.0,u,3.9
3,people,1.3,soldier,1.4,news,0.9,putin,1.7
4,please,1.1,military,1.4,end,0.8,military,1.2
5,link,1.1,troop,1.2,crime,0.8,country,1.2
6,follow,1.0,missile,1.1,one,0.6,say,1.1
7,article,1.0,region,1.1,video,0.6,people,1.1
8,force,0.8,army,1.0,day,0.5,world,1.1
9,day,0.7,attack,1.0,stop,0.5,weapon,0.8


Seems like these clustering algorithms do not pick up any interesting clusters

## TF-IDF 

In [154]:
my_list = df['Adjusted Tweet'].tolist()

In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [156]:
def dummy(x):
    return x

vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 50, lowercase = False, tokenizer = dummy)
tf = vectorizer.fit_transform(df['Adjusted Tweet']).toarray()
tf_features_names = vectorizer.get_feature_names()



In [157]:
df_tf_idf = pd.DataFrame(tf, columns = tf_features_names)

df_tf_idf

Unnamed: 0,abandoned,ability,able,absolute,absolutely,accept,access,accident,according,account,...,youtube,yr,z,zaporizhzhia,zaporozhye,zelenskiy,zelensky,zelenskys,zelenskyy,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Creating our final Dataframe

Drop unnecessary features

In [None]:
final_df = df.drop(['media','Polarity Score','user_created','Date of Creation in months','Date of Tweet in Months',
         'hashtags', 'lang', 'mentionedUsers'], axis=1)

In [None]:
final_df.to_csv('../Data/Processed Dataset.csv', index = False)