Reference: blog post on text data cleaning — https://monkeylearn.com/blog/text-cleaning/

In [1]:
# ! pip install flair
# ! pip install emoji
# ! pip install contextualSpellCheck

In [2]:
import pandas as pd

import re

# import string
import emoji

# import nltk
# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import stopwords
# # nltk.download('wordnet')
# # nltk.download('punkt')
# # nltk.download('average_perception_tagger')
# # nltk.download('wordnet')
# # nltk.download('words') # if its needed

# import spacy
# from spacy.tokenizer import Tokenizer
# from spacy.lang.en import English 
# nlp = English()

# from collections import Counter, defaultdict
# import matplotlib.pyplot as plt
# import numpy as np

In [1]:
import spacy

# One-time model download, if the model is not installed yet:
# ! python -m spacy download en_core_web_sm
SPACY_MODEL_NAME = 'en_core_web_sm'

# Shared spaCy pipeline object used by the later cells.
nlp = spacy.load(SPACY_MODEL_NAME)

In [3]:
# Load the tweet dataset from the CSV file in the working directory
# and preview the first rows.
RAW_TWEETS_CSV = 'data.csv'
tweet_data = pd.read_csv(RAW_TWEETS_CSV)
tweet_data.head(n=5)

Unnamed: 0,tweet_id,created_at,tweet,like_count,quote_count,reply_count,retweet_count
0,1551734038204923904,2022-07-26 00:59:59+00:00,$2.7 billion for climate change (slashing carb...,15,1,0,6
1,1551734021591269377,2022-07-26 00:59:55+00:00,@nathaliejacoby1 Climate change. The rise in t...,2,0,0,0
2,1551734013815029761,2022-07-26 00:59:53+00:00,@JacobsVegasLife @LasVegasLocally This is a ch...,8,0,1,0
3,1551733993740980224,2022-07-26 00:59:48+00:00,Climate Change and Energy Minister Chris Bowen...,18,0,8,5
4,1551733979316887554,2022-07-26 00:59:45+00:00,"@Thebs15800518 At 5:30, @SecGranHolm tries to ...",0,0,0,0


In [4]:
# Keep only the first occurrence of every fully identical row
# (all columns are compared), then report the remaining size.
is_repeated_row = tweet_data.duplicated()
tweet_data = tweet_data[~is_repeated_row]
tweet_data.shape

(146069, 7)

In [5]:
# Discard rows whose tweet text is exactly the string 'tweet'
# (presumably stray header rows inside the data -- TODO confirm).
is_real_tweet = tweet_data['tweet'].ne('tweet')
tweet_data = tweet_data.loc[is_real_tweet]
tweet_data.shape

(146069, 7)

### Clean the data

In [6]:
# A hashtag at the start of a whitespace-delimited token ("#climate" in "#climate,").
HASHTAG_RE = re.compile(r'#\w+')
# scheme://... links. Removed first so entities inside a query string go with the URL.
URL_RE = re.compile(r'\w+:\/\/\S+')
# HTML character references such as "&amp;", "&gt;" or "&#39;" (text is lower-cased first).
HTML_ENTITY_RE = re.compile(r'&(?:#x?[0-9a-f]+|[a-z]+);')
# @mentions, #hashtags, every character that is not an ASCII letter, digit, space or tab,
# a leading stand-alone "rt" retweet marker, and leftover "http" fragments.
# The \b keeps words that merely start with "rt" intact.
NOISE_RE = re.compile(r"(@\S+)|(#\S+)|([^0-9A-Za-z \t])|^rt\b|http.+?")


def extract_hashtags(text):
    """Return the hashtags found in ``text`` as a list of strings.

    The text is split on any whitespace (spaces, tabs, newlines), so a hashtag
    that directly follows a line break is still found. A token counts as a
    hashtag when it starts with ``#`` followed by word characters; trailing
    punctuation is dropped (``"#climate,"`` yields ``"#climate"``).
    """
    candidates = (HASHTAG_RE.match(token) for token in text.split())
    return [match.group(0) for match in candidates if match]


def clean_tweet(text):
    """Return ``text`` reduced to ASCII letters and digits separated by single spaces.

    ``text`` is expected to be lower-cased already. Steps, in order:

    1. collapse every run of whitespace (including newlines) to one space;
    2. replace URLs and HTML entities with a space, so an entity such as
       ``&amp;`` separates its neighbours instead of swallowing them;
    3. delete @mentions, #hashtags, a leading ``rt`` marker and every character
       outside ``[0-9A-Za-z \\t]``;
    4. collapse whitespace again.

    Step 3 deletes all non-ASCII characters, so emoji are already gone and no
    separate emoji pass is required.

    NOTE: punctuation is deleted rather than replaced, so "$2.7 billion"
    becomes "27 billion" and "aren't" becomes "arent".
    """
    text = ' '.join(text.split())
    text = URL_RE.sub(' ', text)
    text = HTML_ENTITY_RE.sub(' ', text)
    text = NOISE_RE.sub('', text.strip())
    return ' '.join(text.split())


df_clean = tweet_data.copy()

# Lower-case once; missing tweets are treated as empty text so the helpers
# always receive a str.
lowered = df_clean['tweet'].fillna('').str.lower()

# Cleaned text used by the downstream stop-word / lemmatization cells.
df_clean['clean'] = lowered.map(clean_tweet)

# Hashtags are taken from the lower-cased raw text, before punctuation is removed.
df_clean['hashtags'] = lowered.map(extract_hashtags)

### Check for spelling errors

In [8]:
# import contextualSpellCheck
# # contextualSpellCheck.add_to_pipe(nlp)

# print(df_clean.iloc[2].tweet)
# doc = nlp(df_clean.iloc[2].clean)
# print('original: ', doc)
# print('corrected: ', doc._.outcome_spellCheck)

### Remove stop words and lemmatize

In [7]:
# Preview the first cleaned tweets.
df_clean['clean'].head()

0    27 billion for climate change slashing carbon ...
1    climate change the rise in temperature will be...
2    this is a chilling podcast about what could ha...
3    climate change and energy minister chris bowen...
4    at 530 tries to hide the fact that began signi...
Name: clean, dtype: object

The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form.
Lemmatization is closely related to stemming. The difference is that a stemmer operates on a single word without knowledge of the context, and therefore cannot discriminate between words that have different meanings depending on their part of speech, whereas a lemmatizer uses that context to return the dictionary form (lemma) of each word. However, stemmers are typically easier to implement and run faster, and the reduced accuracy may not matter for some applications.

Remove stopwords

In [8]:
from spacy.lang.en import STOP_WORDS


def strip_stop_words(text):
    """Return ``text`` without the space-separated words found in STOP_WORDS."""
    kept_words = []
    for word in text.split(' '):
        if word not in STOP_WORDS:
            kept_words.append(word)
    return ' '.join(kept_words)


# Show the first cleaned tweet before and after stop-word removal.
print(df_clean['clean'].iloc[0])
df_clean['clean_spacy'] = df_clean['clean'].map(strip_stop_words)
df_clean['clean_spacy'].iloc[0]

27 billion for climate change slashing carbon emissions 37 billion for cops tell me again why republicans arent in love with joe biden i mean at least when it comes to this


'27 billion climate change slashing carbon emissions 37 billion cops tell republicans arent love joe biden mean comes'

Lemmatize

In [13]:
# Scratch check: str.join on a one-element list returns that element unchanged
# (no separator is added).
' '.join(['text_to_join'])

'text_to_join'

In [9]:
# Show the first row before lemmatizing. Positional access (.iloc) is used so
# the preview does not depend on index label 0 still being present.
print(df_clean['clean_spacy'].iloc[0])

# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking the full pipeline once per row. The dependency parser and the named
# entity recogniser are skipped because only token.lemma_ is read here.
docs = nlp.pipe(df_clean['clean_spacy'], disable=['parser', 'ner'])

# Replace each text with its space-joined lemmas, ignoring whitespace tokens.
# The list is in row order, so it lines up with the existing index.
df_clean['clean_spacy'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_space)
    for doc in docs
]
df_clean['clean_spacy'].iloc[0]

27 billion climate change slashing carbon emissions 37 billion cops tell republicans arent love joe biden mean comes


'27 billion climate change slash carbon emission 37 billion cop tell republicans be not love joe biden mean come'

Remove stopwords again (lemmatization can introduce new ones, e.g. "arent" becomes "be not")

In [10]:
# Show the first lemmatized row, then filter stop words out of every row.
print(df_clean['clean_spacy'].iloc[0])

# Rebuild each text from the space-separated words that are not in STOP_WORDS.
df_clean['clean_spacy'] = [
    ' '.join(token for token in text.split(' ') if token not in STOP_WORDS)
    for text in df_clean['clean_spacy']
]
df_clean['clean_spacy'].iloc[0]

27 billion climate change slash carbon emission 37 billion cop tell republicans be not love joe biden mean come


'27 billion climate change slash carbon emission 37 billion cop tell republicans love joe biden mean come'

### Frequently used words

In [19]:
from collections import Counter

# Count how often each word occurs across all processed tweets.
# str.split() with no argument skips empty strings, so rows whose text was
# cleaned down to "" do not add an empty-string "word" to the counts.
word_freq = Counter(
    token
    for text in df_clean['clean_spacy']
    for token in text.split()
)

# Print the 20 most frequent words with their counts, most frequent first.
for word, count in word_freq.most_common(20):
    print(word, count)

climate 138594
change 134530
people 12956
like 9826
s 9427
year 9109
world 8522
need 8429
global 7459
think 7360
cause 6973
know 6743
time 6265
bill 6070
real 5933
fight 5698
help 5467
new 5389
want 5374
right 5361


In [20]:
# Save the processed frame to disk; the row index is not written.
CLEAN_TWEETS_CSV = 'data_lg_clean.csv'
df_clean.to_csv(CLEAN_TWEETS_CSV, index=False)