# Data Clean Module for Title & Content

## Imports

In [1]:
import unidecode 
import pandas as pd 
import re 
import time 
import nltk 
from nltk.corpus import stopwords 
nltk.download('stopwords') 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
from autocorrect import Speller 
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords 
from nltk import word_tokenize 
import string 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Clean Functions

In [2]:
def remove_newlines_tabs(text):
    r"""
    Replace newline/tab markers and backslashes in the text with spaces.

    Four patterns are each replaced by one space, in this order: the literal
    two-character sequence backslash + "n", the real newline character, the
    tab character, and any backslash that is still left. Finally the
    sequence ". com" is glued back together as ".com".

    arguments:
        text: "text" of type "String".

    return:
        value: "text" with the replacements described above applied.
        Runs of spaces are NOT collapsed here.
    """
    # Order matters: the two-character form must be handled before the
    # lone backslash, otherwise a stray "n" would be left behind.
    substitutions = (
        ('\\n', ' '),
        ('\n', ' '),
        ('\t', ' '),
        ('\\', ' '),
        ('. com', '.com'),
    )
    cleaned = text
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned

In [3]:
def strip_html_tags(text):
    """
    Return only the textual content of `text`, with HTML tags removed.

    arguments:
        text: "text" of type "String", possibly containing HTML markup.

    return:
        value: the text extracted by BeautifulSoup; the extracted text
        fragments are joined with a single space.

    Example:
    Input : This is a nice place to live. <IMG>
    Output : This is a nice place to live.
    """
    # Parse with the "html.parser" backend and ask the parsed document for
    # its text, using " " as the separator between text fragments.
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [4]:
def remove_links(text):
    """
    Remove web links from the text.

    Two kinds of links are removed:
      * any run of non-whitespace characters starting with "http"
        (covers both http:// and https:// URLs) is deleted;
      * a space followed by zero or more ASCII letters and ".com"
        (e.g. " example.com") is replaced by a single space.

    Domains that are not preceded by a space, that contain digits or
    hyphens, or that use another suffix than ".com" are left untouched.
    Surrounding spaces are not collapsed here.

    arguments:
        text: "text" of type "String".

    return:
        value: "text" after removal of the links described above.

    Example:
    Input : To know more about this website: kajalyadav.com  visit: https://kajalyadav.com//Blogs
    Output : To know more about this website:   visit:
    """
    # Delete everything from "http" up to the next whitespace.
    without_urls = re.sub(r'http\S+', '', text)
    # Replace " <letters>.com" (leading space included) by one space.
    # A former third pass with the pattern "[A-Za-z]*.com" was computed but
    # never returned; its unescaped dot would also have eaten ordinary words
    # containing "com" (e.g. "welcome"), so it has been dropped.
    return re.sub(r"\ [A-Za-z]*\.com", " ", without_urls)

In [5]:
def remove_whitespace(text):
    """
    Collapse whitespace runs and pad '?' and ')' with spaces.

    Every run of whitespace characters is first replaced by one space.
    Then each '?' is replaced by ' ? ' and each ')' by ') ', so that a word
    directly following one of them is not fused into the same token. This
    padding can itself leave two adjacent spaces (e.g. "you ?" becomes
    "you  ? ").

    arguments:
        text: "text" of type "String".

    return:
        value: "text" with whitespace collapsed and '?' / ')' padded.

    Example:
    Input : How   are   you?
    Output : How are you ?
    """
    collapsed = re.sub(r'\s+', ' ', text)
    # Pad the two punctuation marks that often lack a following space.
    question_padded = collapsed.replace('?', ' ? ')
    return question_padded.replace(')', ') ')

In [6]:
# Code for accented characters removal
def accented_characters_removal(text):
    """
    Transliterate the text with unidecode.

    unidecode takes unicode data and tries to represent it with ASCII
    characters, which strips accents from accented letters.

    arguments:
        text: "text" of type "String".

    return:
        value: the result of unidecode.unidecode(text).

    Example:
    Input : Málaga, àéêöhello
    Output : Malaga, aeeohello
    """
    return unidecode.unidecode(text)

In [7]:
# Code for text lowercasing
def lower_casing_text(text):
    """
    Return a lower-cased copy of the text.

    arguments:
         text: "text" of type "String".

    return:
         value: text with every uppercase letter converted to lowercase.

    Example:
    Input : The World is Full of Surprises!
    Output : the world is full of surprises!
    """
    return text.lower()

In [8]:
# Code for removing repeated characters and punctuations

def reducing_incorrect_character_repeatation(text):
    """
    Shorten exaggerated character repetitions.

    Three passes are applied in order:
      1. an ASCII letter repeated two or more times is cut to exactly two;
      2. a punctuation mark from the set . , / # ! $ % ^ & * ? ; : { } = _ ` ~ ( ) + -
         repeated two or more times is cut to one;
      3. two or more consecutive spaces become one space.

    arguments:
         text: "text" of type "String".

    return:
        value: the text after the three passes above.

    Example:
    Input : Realllllllllyyyyy,        Greeeeaaaatttt   !!!!?....;;;;:)
    Output : Reallyy, Greeaatt !?.;:)
    """
    # Pass 1: letters. The back-reference \\1 matches the same letter again,
    # so legitimate doubles such as "oo" in "good" are left as they are.
    letters_capped = re.sub(r"([A-Za-z])\1{1,}", r"\1\1", text, flags=re.DOTALL)

    # Pass 2: punctuation is reduced to a single occurrence.
    punctuation_capped = re.sub(
        r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}', r'\1', letters_capped
    )

    # Pass 3: squeeze runs of spaces.
    return re.sub(' {2,}', ' ', punctuation_capped)

In [9]:
# Lower-case English contractions mapped to their expanded form.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}
# The code for expanding contraction words
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand shortened words to their full form, e.g. "don't" to "do not".

    The text is split on single spaces and every token that is exactly a
    key of `contraction_mapping` is replaced by the mapped value. Matching
    is exact and case-sensitive: with the default map (lower-case keys),
    "Don't" or "don't," (punctuation attached) are left unchanged.

    arguments:
         text: "text" of type "String".
         contraction_mapping: dictionary from contraction to expansion;
             defaults to CONTRACTION_MAP.

    return:
         value: Text with the matching tokens expanded; all other tokens
         and the spacing between them are preserved.

    Example:
    Input : ain't aren't can't can't've
    Output : is not are not cannot cannot have
    """
    # Splitting on ' ' (not on arbitrary whitespace) lets ' '.join rebuild
    # the original spacing exactly.
    tokens = text.split(' ')
    # Whole-token lookup: a short contraction can no longer be substituted
    # inside a longer token (e.g. "can't" inside "can't've").
    return ' '.join(contraction_mapping.get(token, token) for token in tokens)

In [10]:
# The code for removing special characters
def removing_special_characters(text):
    """Replace runs of unwanted characters with a single space.

    Kept characters: ASCII letters, digits, and : % . ? ! plus every
    character in the range from '$' to ',' (that is $ % & ' ( ) * + ,).
    Any run of other characters - including spaces and the hyphen '-' -
    is replaced by one space.

    NOTE(review): inside the character class "$-," is a range, so the
    hyphen is removed while & ' ( ) * + are kept. This may differ from the
    originally intended character set - confirm before changing it, since
    the pipeline output depends on the current behaviour.

    arguments:
         text: "text" of type "String".

    return:
        value: Text with each run of non-kept characters replaced by ' '.

    Example:
    Input : COVID-19 won't stop #now @home
    Output : COVID 19 won't stop now home
    """
    unwanted_run = re.compile(r"[^a-zA-Z0-9:$-,%.?!]+")
    return unwanted_run.sub(' ', text)

In [11]:
# The code for removing stopwords
# Built once at module level; a set gives constant-time membership tests.
stoplist = set(stopwords.words('english'))
def removing_stopwords(text):
    """Remove English stopwords from the text.

    The text is tokenized with NLTK's word_tokenize; every token whose
    lower-cased form is in the NLTK English stopword list is dropped and
    the remaining tokens are joined with single spaces.

    arguments:
         text: "text" of type "String".

    return:
        value: String of the remaining tokens separated by one space.

    Example:
    Input : This is Kajal from delhi who came here to study.
    Output : Kajal delhi came study .
    """
    # The text is tokenized directly. Passing repr(text) instead would wrap
    # it in quote characters (and escape others), which then show up as
    # spurious tokens such as "'This" and "'".
    kept_tokens = [
        word for word in word_tokenize(text) if word.lower() not in stoplist
    ]
    return ' '.join(kept_tokens)

In [12]:
# The code for spelling corrections
def spelling_correction(text):
    '''
    Correct misspelled words in the text with autocorrect's Speller.

    arguments:
         text: "text" of type "String".

    return:
        value: the text as returned by an English Speller instance.

    Example:
    Input : This is Oberois from Dlhi who came heree to studdy.
    Output : This is Oberoi from Delhi who came here to study.
    '''
    # Build an English speller and apply it to the whole text in one go.
    return Speller(lang='en')(text)

In [13]:
# The code for spelling corrections
def spelling_correction(text):
    '''
    Correct misspelled words in the text with autocorrect's Speller.

    NOTE: this definition re-declares a function of the same name that is
    defined earlier in the file; being later, this is the one in effect.

    arguments:
         text: "text" of type "String".

    return:
        value: the text as returned by an English Speller instance.

    Example:
    Input : This is Oberois from Dlhi who came heree to studdy.
    Output : This is Oberoi from Delhi who came here to study.
    '''
    # Build the English speller only on the first call and reuse it
    # afterwards, instead of constructing a new Speller for every text.
    speller = getattr(spelling_correction, "_speller", None)
    if speller is None:
        speller = Speller(lang='en')
        spelling_correction._speller = speller
    return speller(text)

In [14]:
# The code for lemmatization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatization(text):
    """Lemmatize every whitespace-separated token of the text as a verb.

    arguments:
         text: "text" of type "String".

    return:
        value: a list with one lemmatized token per input token.
        NOTE(review): the result is a list, not a joined string - callers
        that need text have to join it themselves.

    Example:
    Input : text reduced
    Output : ['text', 'reduce']
    """
    lemma = []
    for token in w_tokenizer.tokenize(text):
        # 'v' is passed as the part-of-speech argument of lemmatize.
        lemma.append(lemmatizer.lemmatize(token, 'v'))
    return lemma

In [15]:
# Writing main function to merge all the preprocessing steps.
def text_preprocessing(text, accented_chars=True, contractions=False, lemmatization=False,
                       extra_whitespace=True, newlines_tabs=True, repeatition=True,
                       lowercase=False, punctuations=True, mis_spell=False,
                       remove_html=True, links=True, special_chars=True,
                       stop_words=False):
    """
    Preprocess the input text and return the clean text.

    Each flag switches one cleaning step on or off. The enabled steps run
    in this fixed order:

        newlines_tabs    -> remove_newlines_tabs
        remove_html      -> strip_html_tags
        links            -> remove_links
        extra_whitespace -> remove_whitespace
        accented_chars   -> accented_characters_removal
        lowercase        -> lower_casing_text
        repeatition      -> reducing_incorrect_character_repeatation
        contractions     -> expand_contractions
        punctuations     -> removing_special_characters
        stop_words       -> removing_stopwords
        mis_spell        -> spelling_correction
        lemmatization    -> lemmatization (module-level function)

    arguments:
        text: "text" of type "String".
        special_chars: accepted for compatibility but not used by any step;
            special-character removal is controlled by `punctuations`.
        (all other flags: see the table above)

    return:
        value: the text after all enabled steps; the input itself when
        every flag is switched off.
    """
    # Start from the raw input so that every step has something to work on
    # even when the first step (newlines_tabs) is disabled.
    Data = text

    if newlines_tabs:  # remove newlines & tabs
        Data = remove_newlines_tabs(Data)

    if remove_html:  # remove html tags
        Data = strip_html_tags(Data)

    if links:  # remove links
        Data = remove_links(Data)

    if extra_whitespace:  # remove extra whitespaces
        Data = remove_whitespace(Data)

    if accented_chars:  # remove accented characters
        Data = accented_characters_removal(Data)

    if lowercase:  # convert all characters to lowercase
        Data = lower_casing_text(Data)

    if repeatition:  # reduce repetitions
        Data = reducing_incorrect_character_repeatation(Data)

    if contractions:  # expand contractions
        Data = expand_contractions(Data)

    if punctuations:  # remove special characters
        Data = removing_special_characters(Data)

    if stop_words:  # remove stopwords
        Data = removing_stopwords(Data)

    if mis_spell:  # check for mis-spelled words & correct them
        Data = spelling_correction(Data)

    if lemmatization:  # convert words to lemma form
        # The boolean parameter "lemmatization" hides the module-level
        # function of the same name inside this body, so the function is
        # looked up in the module globals instead.
        lemmatize = globals()["lemmatization"]
        lemmas = lemmatize(Data)
        # Join a token list back into text so the result stays a string.
        Data = lemmas if isinstance(lemmas, str) else ' '.join(lemmas)

    return Data

## Read and Clean Content Data

In [16]:
# Read Content Dataset
# sep='delimiter' is presumably a string that never occurs in the file, so that
# each line ends up as one single-column row (header=None -> column name 0).
# A separator longer than one character is only handled by the Python parser
# engine; requesting it explicitly avoids the fallback ParserWarning.
df = pd.read_csv("D://NLP//Frame_NLP//archive//coronavirus_content.txt", index_col=False, sep='delimiter',
                 on_bad_lines='skip', header=None, encoding='utf8', engine='python')
#DF = pd.read_csv("D://NLP//Frame_NLP//archive//covid19_content.txt", index_col = False, sep='delimiter', encoding = 'utf-8')
# Show Dataset
df.shape

  return func(*args, **kwargs)


(16317, 1)

In [17]:
# Drop rows containing missing values and renumber the index 0..n-1.
# The reset_index result is assigned so that df_all itself carries the
# renumbered index, instead of only being displayed and thrown away.
df_all = df.dropna(axis=0, how='any').reset_index(drop=True)
df_all

Unnamed: 0,0
0,Photo illustration by Slate. Photo by Photo il...
1,"In the U.S., COVID-19 is spreading like wildfi..."
2,Researchers and medical practitioners have spe...
3,It won’t be. The updated message that needs to...
4,A big part of the challenges around messaging ...
...,...
16312,CHISINAU (UrduPoint News / Sputnik - 23rd Marc...
16313,The first two deaths caused by the coronavirus...
16314,"""The third Romanian died on the evening of Sun..."
16315,The latest update from the Romanian government...


In [18]:
# Pre-processing for Content
List_Content = df_all[0].to_list()
# Cleaned text of the Content attribute, one entry per article.
Final_Article = [text_preprocessing(article) for article in List_Content]
# Separate list object holding the same cleaned texts.
Complete_Content = list(Final_Article)
# assign() returns a new frame with the extra column, which avoids writing
# into a frame that was derived from another one (chained-assignment warning).
df_all = df_all.assign(Processed_Content=Complete_Content)





In [19]:
# Spot check: display the cleaned text at position 5 as the cell output.
Complete_Content[5]



In [20]:
# Display the first rows to compare raw column 0 with Processed_Content.
df_all.head()

Unnamed: 0,0,Processed_Content
0,Photo illustration by Slate. Photo by Photo il...,Photo illustration by Slate. Photo by Photo il...
1,"In the U.S., COVID-19 is spreading like wildfi...","In the U.S., COVID 19 is spreading like wildfi..."
2,Researchers and medical practitioners have spe...,Researchers and medical practitioners have spe...
3,It won’t be. The updated message that needs to...,It won't be. The updated message that needs to...
4,A big part of the challenges around messaging ...,A big part of the challenges around messaging ...


In [21]:
# Remove the raw text (column 0) so only the cleaned column remains.
df_all = df_all.drop(columns=[0])

In [22]:
# Display (rows, columns) after dropping the raw column.
df_all.shape

(16317, 1)

In [23]:
# Discard duplicated rows.
# NOTE(review): keep=False removes every copy of a duplicated row, not just
# the extra ones - confirm that losing the first occurrence is intended.
df_all = df_all.drop_duplicates(keep=False)
df_all.shape

(12224, 1)

In [None]:
# Characters / sequences to delete from the cleaned text.
spec_chars = ["#","@","…","\xa0","\n\n","\n","\\"]
for char in spec_chars:
    # The .str accessor exists on a Series (a column), not on the DataFrame,
    # so the replacement is applied to the Processed_Content column.
    # regex=False treats each entry as literal text; a lone backslash would
    # not be a valid regular expression.
    df_all['Processed_Content'] = df_all['Processed_Content'].str.replace(char, '', regex=False)

In [24]:
# Write the cleaned content to CSV; index=False leaves the row index out of the file.
df_all.to_csv('D://NLP//Frame_NLP//archive//coronavirus_content.csv', index= False)

## Content Data to Sentences

In [26]:
import spacy
from spacy.matcher import PhraseMatcher


spacy.require_gpu()
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")
# Raise the character limit so the whole CSV can be parsed as one document.
nlp.max_length = 2500000

phrase_matcher = PhraseMatcher(nlp.vocab)
# NOTE(review): the list has no capitalised "Coronavirus"; if matching is
# case-sensitive (the matcher is created without an `attr` argument), such
# sentences are skipped - confirm whether that is intended.
phrases = ['covid19','COVID-19','Covid19','COVID19','coronavirus','covid']
patterns = [nlp(text) for text in phrases]
# NOTE(review): this positional form (key, None, *patterns) is the older
# PhraseMatcher.add call style - verify against the installed spaCy version.
phrase_matcher.add('covid19', None, *patterns)
with open("D://NLP//Frame_NLP//archive//coronavirus_content.csv", encoding='utf8') as f:
    text = f.read().replace("\n\n", " ").replace("\n", " ")
doc = nlp(text)

with open("D://NLP//Frame_NLP//archive//coronavirus_content_sent.txt", 'w', encoding='utf8') as f:
    for sent in doc.sents:
        # Only tokenisation is needed for phrase matching, so the sentence is
        # re-tokenised with make_doc instead of running the full pipeline again.
        # 'covid19' is the only key added to the matcher, so any match qualifies;
        # the sentence is written once, however many phrases it contains.
        if phrase_matcher(nlp.make_doc(sent.text)):
            print(sent.text, file=f)

In [27]:
# Remove duplicate lines from the sentence file, rewriting it in place.
sent_path = 'D://NLP//Frame_NLP//archive//coronavirus_content_sent.txt'
with open(sent_path, 'r', encoding='utf-8') as source:
    content = source.readlines()
# dict.fromkeys drops repeated lines while keeping first-seen order, so the
# rewritten file is the same on every run.
unique_lines = list(dict.fromkeys(content))
# The with-block closes (and therefore flushes) the file before it is read
# again by later cells.
with open(sent_path, 'w', encoding='utf-8') as cleandata:
    cleandata.writelines(unique_lines)

In [28]:
# Load the de-duplicated sentences, one sentence per single-column row.
# A separator longer than one character is only handled by the Python parser
# engine; requesting it explicitly avoids the fallback ParserWarning.
df_sent = pd.read_csv("D://NLP//Frame_NLP//archive//coronavirus_content_sent.txt", index_col=False, sep='delimiter',
                      on_bad_lines='skip', header=None, encoding='utf8', engine='python')
df_sent.shape

  return func(*args, **kwargs)


(4328, 1)

In [29]:
# Pre-processing for Content
List_Content = df_sent[0].to_list()
# Cleaned text of each sentence after pre-processing.
Final_Article = [text_preprocessing(article) for article in List_Content]
# Separate list object holding the same cleaned texts.
Complete_Content = list(Final_Article)
df_sent['Processed_Content'] = Complete_Content

In [30]:
# Keep only the cleaned column.
df_sent = df_sent.drop(columns=[0])
# NOTE(review): keep=False removes every copy of a duplicated row, not just
# the extra ones - confirm that this is intended.
df_sent = df_sent.drop_duplicates(keep=False)
df_sent.shape

(4318, 1)

In [31]:
# Write the cleaned sentences to CSV; index=False leaves the row index out of the file.
df_sent.to_csv('D://NLP//Frame_NLP//archive//coronavirus19_content.csv', index= False)

## Read and Clean Title Data

In [32]:
# Read Title Dataset
# sep='delimiter' is presumably a string that never occurs in the file, so that
# each line ends up as one single-column row (header=None -> column name 0).
# A separator longer than one character is only handled by the Python parser
# engine; requesting it explicitly avoids the fallback ParserWarning.
df = pd.read_csv("D://NLP//Frame_NLP//archive//coronavirus_title.txt", index_col=False, sep='delimiter',
                 on_bad_lines='skip', header=None, encoding='utf8', engine='python')
#DF = pd.read_csv("D://NLP//Frame_NLP//archive//covid19_title.txt", index_col = False, sep='delimiter', encoding = 'utf-8')
# Show Dataset
df.shape

(10000, 1)

In [33]:
# Drop rows containing missing values and renumber the index 0..n-1.
# The reset_index result is assigned so that df_all itself carries the
# renumbered index, instead of only being displayed and thrown away.
df_all = df.dropna(axis=0, how='any').reset_index(drop=True)
df_all

Unnamed: 0,0
0,The Coronavirus Is Airborne. The Coronavirus I...
1,China Coronavirus Outbreak: How Is Coronavirus...
2,Coronavirus In Pennsylvania: Fayette County Re...
3,Coronavirus Outbreak: Busted! Top Myths About ...
4,Coronavirus cases in UK confirmed: What are th...
...,...
9995,Biden says he hasn’t been tested for coronavirus
9996,Germany easing border restrictions as coronavi...
9997,Matt Damon's Oldest Daughter Has Recovered Fro...
9998,Paul Manafort released from prison due to coro...


In [34]:
# Pre-processing for Title
List_Title = df_all[0].to_list()

# Cleaned text of the Title attribute, one entry per title.
Final_Title = [text_preprocessing(title) for title in List_Title]
# Separate list object holding the same cleaned titles.
Complete_Title = list(Final_Title)
# assign() returns a new frame with the extra column, which avoids writing
# into a frame that was derived from another one (chained-assignment warning).
df_all = df_all.assign(Processed_Title=Complete_Title)

In [35]:
# Spot check: display the first ten cleaned titles as the cell output.
Complete_Title[:10]

['The Coronavirus Is Airborne. The Coronavirus Is Airborne. The Coronavirus Is Airborne.',
 'China Coronavirus Outbreak: How Is Coronavirus Treated ? ',
 'Coronavirus In Pennsylvania: Fayette County Reports First Case Of Coronavirus',
 'Coronavirus Outbreak: Busted! Top Myths About Coronavirus',
 'Coronavirus cases in UK confirmed: What are the symptoms of coronavirus ? ',
 'Is it COVID 19 or covid 19 or coronavirus or coronavirus disease ? ',
 'Coronavirus updates: Coronavirus infections in U.S. top 500',
 'Coronavirus outbreak: Map tracks Wuhan coronavirus spread',
 'Coronavirus Russia deaths: How many coronavirus cases in Russia ? ',
 "Coronavirus pandemic China's coronavirus cases drop to one"]

In [36]:
# Display the first rows to compare raw column 0 with Processed_Title.
df_all.head()

Unnamed: 0,0,Processed_Title
0,The Coronavirus Is Airborne. The Coronavirus I...,The Coronavirus Is Airborne. The Coronavirus I...
1,China Coronavirus Outbreak: How Is Coronavirus...,China Coronavirus Outbreak: How Is Coronavirus...
2,Coronavirus In Pennsylvania: Fayette County Re...,Coronavirus In Pennsylvania: Fayette County Re...
3,Coronavirus Outbreak: Busted! Top Myths About ...,Coronavirus Outbreak: Busted! Top Myths About ...
4,Coronavirus cases in UK confirmed: What are th...,Coronavirus cases in UK confirmed: What are th...


In [37]:
# Remove the raw titles (column 0) so only the cleaned column remains.
df_all = df_all.drop(columns=[0])
df_all.shape

(10000, 1)

In [38]:
# Discard duplicated rows.
# NOTE(review): keep=False removes every copy of a duplicated row, not just
# the extra ones - confirm that losing the first occurrence is intended.
df_all = df_all.drop_duplicates(keep=False)
df_all.shape

(8376, 1)

In [39]:
# Write the cleaned titles to CSV; index=False leaves the row index out of the file.
df_all.to_csv('D://NLP//Frame_NLP//archive//coronavirus_title.csv', index= False)