In [2]:
#Important packages to clean and preprocess texts
import re 
import string
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 


In [None]:
# Strip every run of digits out of a sentence with a regular expression.
# Only the digits are deleted; the spaces that surrounded them stay put.
text_1 = "The weather is exceptionally nice today. Except for the fact that on Thursday, we will have 5 thunderstorms. Oh well, but who cares?"
text_clean = re.compile(r"\d+").sub("", text_1)
print(text_clean)

In [None]:
# Remove white space
# str.strip() on its own only trims the two ends of the string, so the run of
# spaces inside the sentence would survive. Splitting on whitespace and
# re-joining with single spaces trims the ends AND collapses interior runs.
text_1 = " \t I love Corgis. They are the     cutest "
text_clean = " ".join(text_1.split())
print(text_clean)

In [None]:
# Break a text into pieces with str.split(). Called without a separator it cuts
# on any run of whitespace, so the result is a list of words, and punctuation
# stays attached to the word next to it.
sent = "The weather is exceptionally nice today. Except for the fact that on Thursday, we will have 5 thunderstorms. Oh well, but who cares?"
sent_split = str.split(sent)
print(sent_split)

In [4]:
#Tokenizing words in sentences with NLTK
sent = "The weather is exceptionally nice today. Except for the fact that on Thursday, we will have 5 thunderstorms. Oh well, but who cares?"

# Use the word_tokenize name imported in the first cell. The longer spelling
# nltk.tokenize.word_tokenize needs the bare `nltk` module, which is only
# imported in a later cell, so it raises NameError on a fresh kernel.
tokens = word_tokenize(sent)
print(tokens)

['The', 'weather', 'is', 'exceptionally', 'nice', 'today', '.', 'Except', 'for', 'the', 'fact', 'that', 'on', 'Thursday', ',', 'we', 'will', 'have', '5', 'thunderstorms', '.', 'Oh', 'well', ',', 'but', 'who', 'cares', '?']


In [None]:
# Removing stopwords
sent = "The weather is exceptionally nice today. Except for the fact that on Thursday, we will have 5 thunderstorms. Oh well, but who cares? I think the best kind of weather is an unpredictable one like this one we have in Hamburg."
# Delete all punctuation characters first, then lowercase, so the tokens can be
# compared against the stopword list.
sent = sent.translate(str.maketrans('', '', string.punctuation))
sent = sent.lower()

tokens = word_tokenize(sent)
listStopword = set(stopwords.words('english'))

# Keep only the tokens that are not English stopwords, in their original order.
removed = [t for t in tokens if t not in listStopword]

print(removed)

In [None]:
# Porter stemmer demo: print each word next to the stem the stemmer gives it.
ps = PorterStemmer()

word_program = ["program", "programs", "programer", "programing", "programers"]

for k, stemmed in zip(word_program, map(ps.stem, word_program)):
    print(k, " : ", stemmed)

You can find more information on the official website for Porter stemmer algorithm here:
https://tartarus.org/martin/PorterStemmer/index-old.html

# **PRE-PROCESSING ON REAL DATA SET**

We will model the approach on the **Covid-19 Twitter dataset**. Our main task is to clean the tweets and filter the data down to English tweets only.

The original dataset is by Preda, G. (2020, August 30). COVID19 Tweets. Retrieved from https://www.kaggle.com/gpreda/covid19-tweets

There are 3 major components to this approach:

1.  Clean and filter all non-English tweets/texts as we want consistency in the data.
 
2.   Create a simplified version for our complex text data.

3.   Vectorize the text and save their embedding for future analysis.







### **PART 0: Loading packages and the dataset**

In [1]:
#Import necessary libraries for future analysis of the dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
np.random.seed(2020)
import nltk
nltk.download('punkt') # one time execution
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read the tweets CSV (looked up relative to the working directory) into a
# DataFrame, then preview its first three rows.
data = pd.read_csv("covid19_tweets.csv")
data.head(3)

In [None]:

# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:

# dtype of every column.
data.dtypes

In [None]:
# Print a per-column summary (non-null counts and dtypes) of the frame.
data.info()

### **PART 1: Data cleaning**

In [None]:
# Clean and standardize text: lowercase it and keep only pure-ASCII tokens.
def clean_non_english(txt):
    """Lowercase a tweet and keep only its pure-ASCII word tokens.

    Every run of non-word characters (punctuation, symbols, whitespace) is
    collapsed to a single space, the text is lowercased and tokenized, and any
    token containing a character outside the ASCII range is dropped. Digits
    and underscores count as word characters, so they are kept.

    Args:
        txt: The raw text of one tweet.

    Returns:
        The surviving tokens, each followed by one space (so a non-empty
        result ends with a trailing space), or ``np.nan`` when ``txt`` is not
        a string.
    """
    # Non-string cells (e.g. float NaN for a missing tweet) map to NaN. This is
    # checked explicitly instead of with a catch-all ``except``, which also hid
    # genuine failures (such as a missing NLTK tokenizer resource) by silently
    # returning NaN for every row.
    if not isinstance(txt, str):
        return np.nan
    txt = re.sub(r'\W+', ' ', txt).lower()
    # A former step, txt.replace("[^a-zA-Z]", " "), was a literal str.replace
    # rather than a regex substitution. The brackets and caret it searched for
    # are already gone after the \W+ pass, so it could never match and has
    # been dropped without changing the result.
    ascii_tokens = [w for w in word_tokenize(txt) if all(ord(c) < 128 for c in w)]
    return "".join(w + " " for w in ascii_tokens)

# Add an `english_text` column: the lowercased, ASCII-token-only version of
# each tweet's `text` (NaN where the cleaner could not process the value).
data["english_text"] = data.text.apply(clean_non_english)

In [None]:
# Final cleaning pass: drop English stopwords.
def clean_text(english_txt):
    """Remove English stopwords from an already-normalized tweet.

    Args:
        english_txt: Text to filter. Stopword matching is case-sensitive
            against the module-level ``stop_words`` set.

    Returns:
        The tokens that are not in ``stop_words``, each followed by one space
        (so a non-empty result ends with a trailing space), or ``np.nan`` when
        ``english_txt`` is not a string.
    """
    # Explicit type check instead of a bare ``except``: NaN/None rows still
    # become NaN, but real errors on valid strings are no longer swallowed.
    if not isinstance(english_txt, str):
        return np.nan
    kept = [w for w in word_tokenize(english_txt) if w not in stop_words]
    return "".join(w + " " for w in kept)

# Add a `cleaned_text` column: `english_text` with stopwords removed.
data["cleaned_text"] = data.english_text.apply(clean_text)

In [None]:
# Number of missing values in each column.
data.isnull().sum()

We can even do better by removing the stopwords. 

Stopwords are common words that appear in English sentences without contributing much to the meaning. We will use the nltk package to filter the stopwords. 

As our main task is visualizing the common theme of tweets using a word cloud, this step is necessary to avoid common words like “*the*”, “*a*”, “*is*”, etc.

However, if your tasks require full sentence structure, like next word prediction or grammar check, you can skip this step.

In [None]:
import nltk
nltk.download('punkt') # one time execution
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def clean_text(english_txt):
    """Remove English stopwords from an already-normalized tweet.

    NOTE: this cell re-defines ``clean_text``, which an earlier cell in this
    notebook already defines and applies. Running it replaces that earlier
    definition with this one.

    Args:
        english_txt: Text to filter. Stopword matching is case-sensitive
            against the module-level ``stop_words`` set.

    Returns:
        The tokens that are not in ``stop_words``, each followed by one space
        (so a non-empty result ends with a trailing space), or ``np.nan`` when
        ``english_txt`` is not a string.
    """
    # Explicit type check instead of a bare ``except``: NaN/None rows still
    # become NaN, but real errors on valid strings are no longer swallowed.
    if not isinstance(english_txt, str):
        return np.nan
    kept = [w for w in word_tokenize(english_txt) if w not in stop_words]
    return "".join(w + " " for w in kept)

For tweets, there is a special feature we need to consider before cleaning: mentions. Your data might have special features like this (or not). Therefore, this is case-by-case and NOT a universal requirement.


In [None]:
def get_mention(txt):
    """Collect the @-mentions in a tweet as one comma-separated string.

    Args:
        txt: The tweet text.

    Returns:
        Every whitespace-delimited token that starts with "@", in order of
        appearance, joined by ", ". An empty string when there are none.
    """
    # split() with no argument cuts on any run of whitespace, so a mention that
    # follows a newline or tab is found as well. split(" ") left such a mention
    # glued to the preceding word, where it was missed.
    return ", ".join(tok for tok in txt.split() if tok.startswith("@"))

# Add a `mention` column holding the comma-separated @-mentions of each tweet.
data["mention"] = data.text.apply(get_mention)

In [None]:
def remove_link_email(txt):
    """Strip ellipses, links, e-mail addresses / @-handles and punctuation.

    Steps, in order:
      1. delete every literal "...";
      2. delete every link (any non-space run starting at "http");
      3. delete every whitespace-delimited token that contains "@" (e-mail
         addresses and @-handles), plus one following whitespace character;
      4. delete every remaining character that is neither a word character nor
         whitespace.

    Args:
        txt: The tweet text.

    Returns:
        The cleaned text. Whitespace left behind by deleted links is kept.
    """
    txt = txt.replace("...", "")
    txt = re.sub(r"http\S+", "", txt)
    # This has to be re.sub: str.replace() treats its argument as literal text,
    # so the earlier txt.replace('\S*@\S*\s?', "") never removed anything.
    txt = re.sub(r"\S*@\S*\s?", "", txt)
    txt = re.sub(r"[^\w\s]", "", txt)
    return txt

# Overwrite the `text` column with its cleaned version; the raw text is not
# kept, so re-running this cell cleans already-cleaned text.
data.text = data.text.apply(remove_link_email)

We then filter out all rows whose language is not *“en”*.


In [None]:
def clean_tag(txt):
    """Normalize a stringified hashtag list.

    Removes every single quote and square bracket from ``txt`` and lowercases
    the rest, e.g. "['COVID19', 'Corona']" -> "covid19, corona".

    Args:
        txt: The hashtag cell value.

    Returns:
        The normalized string, or ``np.nan`` when ``txt`` is not a string
        (for example a missing value).
    """
    # Explicit type check instead of a bare ``except``, which would also have
    # swallowed unrelated errors such as KeyboardInterrupt.
    if not isinstance(txt, str):
        return np.nan
    return txt.translate(str.maketrans("", "", "'[]")).lower()

# Add a `cleaned_tags` column: `hashtags` without quotes/brackets, lowercased
# (NaN where the hashtag value could not be processed).
data["cleaned_tags"] = data.hashtags.apply(clean_tag)

In [None]:

# Flatten the per-tweet hashtag strings into one list of individual tags.
covid_list = []
for item in data.cleaned_tags:
    # Rows without hashtags hold float NaN. The earlier guard `item != np.nan`
    # was always True (NaN never compares equal to anything), and those rows
    # were only skipped because the resulting AttributeError was swallowed.
    # Test the type directly instead.
    if isinstance(item, str):
        covid_list.extend(item.split(", "))

from collections import Counter
# Ten most frequent hashtags together with their counts.
x = Counter(covid_list)
x.most_common(10)

In [None]:
def get_len_hashtag(txt):
    """Count the comma-separated entries in a hashtag string.

    Args:
        txt: The hashtag cell value.

    Returns:
        The number of pieces produced by splitting ``txt`` on "," (an empty
        string therefore counts as 1), or ``np.nan`` when ``txt`` is not a
        string (for example a missing value).
    """
    # Explicit type check instead of a bare ``except``, which would also have
    # swallowed unrelated errors such as KeyboardInterrupt.
    if not isinstance(txt, str):
        return np.nan
    return len(txt.split(","))

# Add a `len_hashtag` column: the number of comma-separated entries in the raw
# `hashtags` value (NaN where it could not be counted).
data["len_hashtag"] = data.hashtags.apply(get_len_hashtag)

In [None]:

# Drop, in place, every row with a missing value in any of the listed columns.
data.dropna(subset=["user_description", "user_location", "hashtags", "cleaned_text", "text", "english_text", 'cleaned_tags'], inplace=True)

In [None]:
# Preview the first five remaining rows.
data.head(5)

In [None]:

# Continue with a random 10,000-row subset. reset_index() moves the old row
# labels into an "index" column, which is then dropped together with "source".
data = (
    data.sample(n=10000)
    .reset_index()
    .drop(columns=['index', 'source'])
)
print(data.shape)
data.head(3)

In [None]:
# Number of missing values in each column of the sampled frame.
data.isnull().sum()


Earlier, we removed the non-English characters. Now we remove the texts that are not in English (semantically). Langdetect is a Python package for detecting the language of a text. It is a direct port of Google’s language-detection library from Java to Python.

In [None]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect's algorithm is randomized, so short or ambiguous texts can get a
# different label on each run. Fixing the seed before the first detection makes
# repeated runs of the notebook agree.
DetectorFactory.seed = 0

def detect_lang(txt):
    """Detect the language of a text with langdetect.

    Args:
        txt: The text to classify.

    Returns:
        The language code reported by ``langdetect.detect`` (e.g. "en"), or
        ``np.nan`` when ``txt`` is not a string or langdetect cannot classify
        it (for example an empty string or text without usable features).
    """
    if not isinstance(txt, str):
        return np.nan
    try:
        return detect(txt)
    except LangDetectException:
        # Raised by langdetect when no language can be determined. Only this
        # error is caught, rather than every exception as a bare ``except``
        # would, so unrelated failures stay visible.
        return np.nan

In [None]:
# Nothing earlier in the notebook creates a `language` column: detect_lang is
# defined but never applied. Build the column here when it is missing so the
# filter below does not fail with AttributeError.
# NOTE(review): detection runs on `text`; confirm that is the intended column.
if "language" not in data.columns:
    data["language"] = data.text.apply(detect_lang)

# Keep only the tweets detected as English. reset_index() returns a new frame
# (no in-place mutation of a filtered slice) and moves the old row labels into
# an "index" column.
new_data = data[data.language == "en"].reset_index()

In [None]:

# Preview the first three English-only rows.
new_data.head(3)

In [None]:
# Remove the "index" column left behind by reset_index(). Reassigning and
# passing errors='ignore' keeps the cell re-runnable: an in-place drop raises
# KeyError the second time, once the column is already gone.
new_data = new_data.drop(columns=['index'], errors='ignore')

In [None]:
# Write the English-only tweets to a CSV in the working directory. index=False
# is not passed, so the row index is written as an extra, unnamed first column.
new_data.to_csv('english_tweets.csv')