## Preprocessing and Feature Extraction

In [116]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv("../Data/Ukraine Tweets.csv")

## Initial Preprocessing

Check for duplicates

In [109]:
df = df.drop_duplicates(subset = 'id')

Precounting of features: Length, Hashtags, URLs and Mentions

In [110]:
df['tweet_length'] = df['rendered_content'].apply(len)

In [111]:
df['num_mentions'] = df['rendered_content'].apply(lambda x: x.count('@'))

In [112]:
df['num_hashtags'] = df['rendered_content'].apply(lambda x: x.count('#'))

In [113]:
df['num_urls'] = df['rendered_content'].apply(lambda x: x.count('https'))

Steps taken:
- Converting emojis to text
- We decide to remove all the mentions and hashtagged words, as these will be analysed separately
- Remove Links, as these don't contribute to SA
- Conducting the SA on our preprocessed data

Creating a new column so that we can see the adjusted tweet and original versiom

In [114]:
df.insert(loc=6,
          column='Adjusted Tweet',
          value=df['rendered_content'])

Converting emojis to text

In [115]:
import emoji

def demote(text):
    text = emoji.demojize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(demote)

Removing mentions and hashtagged words

In [116]:
import re

def remove_mentions_hashtags(text):
    text = re.sub("@[A-Za-z0-9_]+","", text)
    text = re.sub("#[A-Za-z0-9_]+","", text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_mentions_hashtags)

Removing links

In [117]:
def remove_links(text):
    text = re.sub('http://\S+|https://\S+', '', text)
    text = re.sub('http[s]?://\S+', '', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'bit.ly/\S+', '', text) # remove bitly links
    text = text.strip('[link]') # remove [links]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_links)

## Sentiment Analysis:

Insert Polarity Score Column

In [118]:
df.insert(loc=7,
          column='Polarity Score',
          value=df['Adjusted Tweet'])

Sentiment Analysis using NLTK's VADER

In [119]:
### Uncomment to download lexicon for the first time 
#import nltk
#nltk.download('vader_lexicon')

In [120]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sentiment_analysis(text):  
    text = sia.polarity_scores(text)
    return text

df['Polarity Score'] = df['Polarity Score'].apply(sentiment_analysis)
    

Creating columns for:
- Negative Score
- Neutral Score
- Positive Score
- Compound Score [-1,1]

In [121]:
df.insert(loc=8,
          column='Negative Score',
          value=df['Polarity Score'])

df.insert(loc=9,
          column='Neutral Score',
          value=df['Polarity Score'])

df.insert(loc=10,
          column='Positive Score',
          value=df['Polarity Score'])

df.insert(loc=11,
          column='Compound Score',
          value=df['Polarity Score'])

df['Negative Score'] = df['Negative Score'].apply(lambda x: x['neg'])
df['Neutral Score'] = df['Neutral Score'].apply(lambda x: x['neu'])
df['Positive Score'] = df['Positive Score'].apply(lambda x: x['pos'])
df['Compound Score'] = df['Compound Score'].apply(lambda x: x['compound'])

In [122]:
df.head()

Unnamed: 0,id,date,user,user_followers,user_created,rendered_content,Adjusted Tweet,Polarity Score,Negative Score,Neutral Score,...,replies,quoteCount,hashtags,lang,media,mentionedUsers,tweet_length,num_mentions,num_hashtags,num_urls
0,1477420789863436289,2022-01-01 23:25:40+00:00,anno1540,8838,2014-06-12 17:05:22+00:00,"Lithuania will never abandon Ukraine, voluntee...","Lithuania will never abandon Ukraine, voluntee...","{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'comp...",0.0,0.661,...,0,0,"['Lithuania', 'Ukraine']",en,,,132,0,2,0
1,1477414596424220679,2022-01-01 23:01:03+00:00,weather_odessa,119,2019-07-10 08:34:22+00:00,#odessa #odesa #ukraine #–æ–¥–µ—Å—Å–∞\nNow: 4.2¬∞C\nT...,#–æ–¥–µ—Å—Å–∞\nNow: 4.2¬∞C\nToday's Min: 4.2¬∞C at ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,...,0,0,"['odessa', 'odesa', 'ukraine', '–æ–¥–µ—Å—Å–∞']",en,,,188,0,4,0
2,1477414332376010752,2022-01-01 23:00:00+00:00,AlArabiya_Eng,927174,2009-02-28 08:31:32+00:00,After tough talk between Presidents Joe Biden ...,After tough talk between Presidents Joe Biden ...,"{'neg': 0.099, 'neu': 0.776, 'pos': 0.125, 'co...",0.099,0.776,...,3,0,"['Russia', 'Ukraine']",en,,,277,0,2,0
3,1477409748572151809,2022-01-01 22:41:47+00:00,beatravelling,6329,2014-02-28 21:25:33+00:00,The beach can be nice in the fall too üòäüá∫üá¶\n\n#...,The beach can be nice in the fall too :smiling...,"{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'comp...",0.0,0.781,...,0,0,"['lanzheron', 'langeron', 'beach', 'odessa', '...",en,,,122,0,5,0
4,1477409332820119552,2022-01-01 22:40:08+00:00,TornCurtain1991,677,2012-02-08 15:30:41+00:00,"A note: Stepan #Bandera, DOB 01011909, was lea...","A note: Stepan , DOB 01011909, was leader of O...","{'neg': 0.171, 'neu': 0.829, 'pos': 0.0, 'comp...",0.171,0.829,...,0,0,"['Bandera', 'Ukraine']",en,,,278,0,2,0


Sentiment Analysis using TextBlob

In [123]:
df.insert(loc=12,
          column='Polarity Score_textblob',
          value=df['Adjusted Tweet'])

In [124]:
df.insert(loc=13,
          column='Subjectivity Score_textblob',
          value=df['Adjusted Tweet'])

In [125]:
from textblob import TextBlob

#Create a function to get the subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Create a function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

df['Polarity Score_textblob'] = df['Polarity Score_textblob'].apply(getPolarity)
df['Subjectivity Score_textblob'] = df['Subjectivity Score_textblob'].apply(getSubjectivity)


## Further manipulating the tweet

Steps taken:
- Lowercase
- Punctuation
- Tokenization
- Stopword filtering
- Lemmatisation
- Number removal

Changing all text to lowercase

In [126]:
df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(lambda x: x.lower())

Removing all Punctuation

In [127]:
import string

def punctuation_remove(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(punctuation_remove)

Number removal

In [128]:
def remove_numbers(text):
    no_numbers = re.sub(r'\d+', '', text)
    return no_numbers

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_numbers)

Remove other non-ASCII characters

In [129]:
df['Adjusted Tweet'] = df['Adjusted Tweet'].str.replace(r'[^\x00-\x7F]+', '')

  df['Adjusted Tweet'] = df['Adjusted Tweet'].str.replace(r'[^\x00-\x7F]+', '')


Tokenizing

In [130]:
#nltk.download('punkt')

In [131]:
from nltk import word_tokenize

def tokenize(text):
    text = word_tokenize(text)
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(tokenize)

Stopword Filtering

In [132]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [133]:
def remove_stopwords(text):
    text = [word for word in text if word not in stop_words]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_stopwords)

Also remove words of a single letter ('a' and 'I' are already removed)

In [134]:
def remove_singles(text):
    text = [word for word in text if len(word) != 1]
    return text

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(remove_singles)

Lemmatisation

In [135]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def lemmatise(text): 
    lemma = [wnl.lemmatize(word) for word in text]
    return lemma

df['Adjusted Tweet'] = df['Adjusted Tweet'].apply(lemmatise)

## Months since creation of account relative to tweet

In [136]:
#Finding date of account creation in months
df.insert(loc=5,
          column='Date of Creation in months',
          value=df['user_created'])

In [137]:
from datetime import *

#returning the months of account creation
def account_creation(text):
    text = datetime.strptime(text, "%Y-%m-%d %H:%M:%S+00:00")
    year = str(text)[0:4]
    month = str(text)[5:7]
    total_months = (int(year)*12)+(int(month))
    return (total_months)

df['Date of Creation in months'] = df['Date of Creation in months'].apply(account_creation)

In [138]:
#Finding date of tweet in months
df.insert(loc=6,
          column='Date of Tweet in Months',
          value=df['date'])

In [139]:
#return the year of tweet
df['Date of Tweet in Months'] = df['Date of Tweet in Months'].apply(account_creation)

In [140]:
#Calculating Months since creation of account relative to tweet
df.insert(loc=7,
          column='Months Since Creation of Account',
          value= (df['Date of Tweet in Months']-df['Date of Creation in months']))

## Time of Day

In [141]:
#create a column for hours:
df.insert(loc=2,
          column='hour of tweet',
          value=df['date'])

In [142]:
from datetime import datetime

#return the hour of the tweet
def hour(text):
    text = datetime.strptime(text, "%Y-%m-%d %H:%M:%S+00:00")
    hour = str(text.time())[0:2]
    return int(hour)

df['hour of tweet'] = df['hour of tweet'].apply(hour)

In [143]:
#insert column for the time of day
df.insert(loc=3,
          column='time of day',
          value=df['hour of tweet'])

In [144]:
#calculating the time of day
def time_of_day(text):  
    if ((text >= 4) and (text < 8 )):
        return 'Early Morning'
    elif ((text >= 8) and (text < 12 )):
        return 'Morning'
    elif ((text >= 12) and (text < 16 )):
        return 'Noon'
    elif ((text >= 16) and (text < 20 )):
        return 'Eve'
    elif ((text >= 20) and (text < 24 )):
        return 'Night'
    elif ((text >= 0) and (text < 4 )):
        return 'Late Night'
    
df['time of day'] = df['time of day'].apply(time_of_day)


In [145]:
#Creating counts using one hot encoding

#Early Morning Count
df.insert(loc=4,
          column='Early Morning Count',
          value=df['time of day'])

def early_morning_count(text):
    if text == 'Early Morning':
        return 1
    else:
        return 0

df['Early Morning Count'] = df['Early Morning Count'].apply(early_morning_count)

#Morning Count
df.insert(loc=5,
          column='Morning Count',
          value=df['time of day'])

def morning_count(text):
    if text == 'Morning':
        return 1
    else:
        return 0

df['Morning Count'] = df['Morning Count'].apply(morning_count)

#Noon count
df.insert(loc=6,
          column='Noon Count',
          value=df['time of day'])

def noon_count(text):
    if text == 'Noon':
        return 1
    else:
        return 0

df['Noon Count'] = df['Noon Count'].apply(noon_count)

#Eve count
df.insert(loc=7,
          column='Eve Count',
          value=df['time of day'])

def eve_count(text):
    if text == 'Eve':
        return 1
    else:
        return 0

df['Eve Count'] = df['Eve Count'].apply(eve_count)

#Night count
df.insert(loc=8,
          column='Night Count',
          value=df['time of day'])

def night_count(text):
    if text == 'Night':
        return 1
    else:
        return 0

df['Night Count'] = df['Night Count'].apply(night_count)

#Late Night count
df.insert(loc=9,
          column='Late Night Count',
          value=df['time of day'])

def late_night_count(text):
    if text == 'Late Night':
        return 1
    else:
        return 0

df['Late Night Count'] = df['Late Night Count'].apply(late_night_count)

## Video, GIF and Photo Count

Photo Count

In [146]:
#Creating a photo count column...
df.insert(loc=23,
          column='Photo Count',
          value=df['media'])

In [147]:
#Counting number of Photos in media column
#No need to tokenize
def photo_count(text):
    text = str(text)
    text = text.count('Photo')
    return text

df['Photo Count'] = df['Photo Count'].apply(photo_count)


Video Count

In [148]:
#Creating a video count column...
df.insert(loc=24,
          column='Video Count',
          value=df['media'])

In [149]:
#We need to tokenize the media column so that we can count how many videos there are...
from nltk import word_tokenize

def tokenize(text):
    text = str(text)
    text = word_tokenize(text)
    return text

df['Video Count'] = df['Video Count'].apply(tokenize)

In [150]:
#Counting number of Videos in media column
def video_count(text):
    text = text.count('Video')
    return text

df['Video Count'] = df['Video Count'].apply(video_count)

Gif Count

In [151]:
#Creating a GIF count column...
df.insert(loc=25,
          column='GIF Count',
          value=df['media'])

In [152]:
#We need to tokenize the media column so that we can count how many GIFs there are...
from nltk import word_tokenize

def tokenize(text):
    text = str(text)
    text = word_tokenize(text)
    return text

df['GIF Count'] = df['GIF Count'].apply(tokenize)

In [153]:
#Counting number of GIFs in media column
def gif_count(text):
    text = text.count('Gif')
    return text

df['GIF Count'] = df['GIF Count'].apply(gif_count)

## Topic Modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(x):
    return x

vectorizer = CountVectorizer(max_df = 0.9, min_df = 25, lowercase = False, tokenizer = dummy)
tf = vectorizer.fit_transform(df['Adjusted Tweet']).toarray()
tf_features_names = vectorizer.get_feature_names()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 3

model = LatentDirichletAllocation(n_components=number_of_topics)

In [219]:
model.fit(tf)

LatentDirichletAllocation(n_components=3)

In [220]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [221]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights
0,#coinbase,2314.8,grid,8595.3,ours,7839.0
1,east,2071.3,membership,6647.4,membership,6827.7
2,national,1904.2,mi,4894.9,national,4885.2
3,membership,1602.9,agent,3823.2,grenades,4874.6
4,club,1589.0,grenades,2124.0,meet,3025.4
5,discovered,1411.6,crossing,1997.0,fascism,2587.8
6,occupying,1331.8,forbes,1947.9,out,2086.0
7,mi,1171.5,#freedomofrussia,1718.6,outcome,2026.2
8,live,1093.3,credit,1716.9,#coinbase,1979.2
9,announce,1014.8,important,1594.5,grid,1616.5


In [222]:
from sklearn.decomposition import NMF

model_2 = NMF(n_components=4, random_state=0, alpha=.1, l1_ratio=.5)

model_2.fit(tf)



NMF(alpha=0.1, l1_ratio=0.5, n_components=4, random_state=0)

In [223]:
display_topics(model_2, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,membership,19.1,ours,15.0,grid,15.2,national,13.1
1,meet,2.0,out,2.1,mi,6.2,grenades,8.3
2,grenades,1.9,outcome,2.0,agent,3.5,#coinbase,4.7
3,august,1.5,our,1.9,important,1.4,meet,2.3
4,east,1.4,fascism,1.1,credit,1.4,fascism,1.8
5,ended,1.2,#coinbase,0.9,manpads,1.2,occupying,1.5
6,coast,1.1,happening,0.5,crossing,1.2,democracy,1.0
7,african,1.1,east,0.5,forbes,1.1,#uvalde,1.0
8,#eurovision,1.0,#uvalde,0.5,#freedomofrussia,1.0,out,0.9
9,agent,0.8,meet,0.5,#estonia,1.0,outcome,0.9


Seems like these clustering algorithms do not pick up any interesting clusters

## Creating our final Dataframe

Drop unnecessary features

In [154]:
final_df = df.drop(['media','Polarity Score','user_created','Date of Creation in months','Date of Tweet in Months',
         'hashtags', 'lang', 'mentionedUsers'], axis=1)

In [155]:
final_df.head()

Unnamed: 0,id,date,hour of tweet,time of day,Early Morning Count,Morning Count,Noon Count,Eve Count,Night Count,Late Night Count,...,Polarity Score_textblob,Subjectivity Score_textblob,likes,retweets,replies,quoteCount,tweet_length,num_mentions,num_hashtags,num_urls
0,1477420789863436289,2022-01-01 23:25:40+00:00,23,Night,0,0,0,0,1,0,...,0.0,0.0,5,1,0,0,132,0,2,0
1,1477414596424220679,2022-01-01 23:01:03+00:00,23,Night,0,0,0,0,1,0,...,0.0,0.0,0,0,0,0,188,0,4,0
2,1477414332376010752,2022-01-01 23:00:00+00:00,23,Night,0,0,0,0,1,0,...,-0.194444,0.666667,4,0,3,0,277,0,2,0
3,1477409748572151809,2022-01-01 22:41:47+00:00,22,Night,0,0,0,0,1,0,...,0.6,1.0,0,0,0,0,122,0,5,0
4,1477409332820119552,2022-01-01 22:40:08+00:00,22,Night,0,0,0,0,1,0,...,-0.1,0.033333,1,2,0,0,278,0,2,0


In [156]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57862 entries, 0 to 60132
Data columns (total 32 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                57862 non-null  int64  
 1   date                              57862 non-null  object 
 2   hour of tweet                     57862 non-null  int64  
 3   time of day                       57862 non-null  object 
 4   Early Morning Count               57862 non-null  int64  
 5   Morning Count                     57862 non-null  int64  
 6   Noon Count                        57862 non-null  int64  
 7   Eve Count                         57862 non-null  int64  
 8   Night Count                       57862 non-null  int64  
 9   Late Night Count                  57862 non-null  int64  
 10  user                              57862 non-null  object 
 11  user_followers                    57862 non-null  int64  
 12  Mont

Final data cleaning:

In [252]:
final_df.to_csv('../Data/Processed Dataset.csv', index = False)