### Loading libraries

In [1]:
%%capture
!pip install num2words
!pip install -U spacy
!python3 -m spacy download en_core_web_md

In [1]:
%%capture
from word2number import w2n

import pandas as pd

# Spacy
import spacy
nlp=spacy.load('en_core_web_sm')

# inflect
import inflect
inflect = inflect.engine()

#w2n.word_to_num('hey was geht')
from num2words import num2words

#for spell correction
from textblob import TextBlob

#for creating vecotorization
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

#for plotting
import matplotlib.pyplot as plt
import plotly.express as px

#for removing stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords

#for drawing graphs
import networkx as nx

#for nlp
from nltk.corpus import wordnet as guru
from nltk.corpus import wordnet
syns = wordnet.synsets("dog")
print(syns)

#for text manipulation
import regex as re

#avoid unneccessary output
import warnings
warnings.filterwarnings('ignore')

## Preprocessing

#### Read in Data

In [2]:
df = pd.read_csv('./data/sample_data.csv')

In [3]:
df.head(2).style

Unnamed: 0,URL,datetime,heading,text
0,https://www.scmp.com/news/hong-kong/education/article/3195500/pity-children-hong-kong-counts-cost-months-virtual-lessons,2022-10-11T00:00:11.000Z,"Pity the children: Hong Kong counts the cost of months of virtual lessons amid the Covid pandemic, with no time to play, make friends","Younger children struggling to adjust to school life, some teens have anxiety, emotional issuesExperts concerned about youngsters’ greater use of electronic devices, rise in child suicides too"
1,https://www.scmp.com/news/hong-kong/politics/article/3195492/hong-kong-protests-young-people-jailed-over-unrest-should,2022-10-10T23:30:10.000Z,"Hong Kong protests: young people jailed over unrest should be allowed to contribute to society on release, rehabilitation group says","Protesters jailed over 2019 anti-government protests should be used to help beat brain drain and be given more support to continue education behind bars, group saysProject Change appeals to professional bodies to relax rules on members and applicants who had fallen foul of the law"


#### correct spelling

to identify words with same meaning

In [4]:
def correct_spelling(text):
    
    #correct any spelling mistakes
    text = TextBlob(text)
    text = str(text.correct())

    #the spell correction works but takes a lot of runtime
    #therefore it is commented out at the momement
    
    # write words out
    # first making sure we get all variation of apostrophes
    text = re.sub(r"’", "'", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'t", " not ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'m", " am ", text)
    
    return text

df['processed_text'] = df['text'].apply(lambda x: correct_spelling(x))

In [5]:
df.head(3).style

Unnamed: 0,URL,datetime,heading,text,processed_text
0,https://www.scmp.com/news/hong-kong/education/article/3195500/pity-children-hong-kong-counts-cost-months-virtual-lessons,2022-10-11T00:00:11.000Z,"Pity the children: Hong Kong counts the cost of months of virtual lessons amid the Covid pandemic, with no time to play, make friends","Younger children struggling to adjust to school life, some teens have anxiety, emotional issuesExperts concerned about youngsters’ greater use of electronic devices, rise in child suicides too","Younger children struggling to adjust to school life, some tens have anxiety, emotional issuesExperts concerned about youngsters' greater use of electronic devices, rise in child suicide too"
1,https://www.scmp.com/news/hong-kong/politics/article/3195492/hong-kong-protests-young-people-jailed-over-unrest-should,2022-10-10T23:30:10.000Z,"Hong Kong protests: young people jailed over unrest should be allowed to contribute to society on release, rehabilitation group says","Protesters jailed over 2019 anti-government protests should be used to help beat brain drain and be given more support to continue education behind bars, group saysProject Change appeals to professional bodies to relax rules on members and applicants who had fallen foul of the law","Protesters jailed over 2019 anti-government protests should be used to help beat brain drain and be given more support to continue education behind bars, group saysProject Change appeals to professional bodies to relax rules on members and applicant who had fallen foul of the law"
2,https://www.scmp.com/news/hong-kong/politics/article/3195498/russian-consulate-hong-kong-given-advanced-notice-arrival,2022-10-10T15:40:48.000Z,Russian consulate in Hong Kong given advanced notice for arrival of superyacht linked to sanctioned billionaire,"Russia’s top envoy in city says he ‘fully agreed’ with decision by Hong Kong Marine Department to refrain from implementing sanctions issued by third countryArrival of the superyacht has led to a diplomatic row between the US and Beijing, with former saying Hong Kong’s reputation is at stake","Russia is top envoy in city says he ‘fully agreed' with decision by Long Long Marine Department to refrain from implementing sanction issued by third countryArrival of the superyacht has led to a diplomatic row between the of and Seizing, with former saying Long Long is reputation is at stake"


#### remove punctuation and other irritating parts


In [6]:
# Pre-processing step `remove special chars`
def remove_special_characters_and_parts(text):

    #replace all words that just contain one letter
    text = re.sub(r'(?:^| )\w(?:$| )', ' ', str(text)).strip()

    #remove most puntuation
    text = re.sub('([()‘’“”…+\-\{\}])+', ' ', text)
    text = re.sub(r"\\'+",' ', text)
    text = re.sub(r'\\"+',' ', text)
    text = re.sub(r'\\^+',' ', text)
    text = re.sub(r'\.+',' ', text)
    text = re.sub(r'\;+',' ', text)
    text = re.sub(r'\!+',' ', text)
    text = re.sub(r'\,+',' ', text)
    text = re.sub(r'\?+',' ', text)
    text = re.sub(r'\'+',' ', text)
    text = re.sub(r'\"+',' ', text)
    text = re.sub(r'\'+',' ', text)
    
    #remove slashes
    text = re.sub('\n', ' ', text)
    text = re.sub(r'[\\(/)]',' ', text)
    text = re.sub(r'[\\(\)]',' ', text)

    #get rid of and as a sign
    text = re.sub(r'\&',' and ', text)

    #remove unnecessary whitespaces
    text = re.sub(r'\s+', ' ', text)

    return text

df['processed_text'] = df['processed_text'].apply(lambda x: remove_special_characters_and_parts(x))

In [7]:
df.head(3).style

Unnamed: 0,URL,datetime,heading,text,processed_text
0,https://www.scmp.com/news/hong-kong/education/article/3195500/pity-children-hong-kong-counts-cost-months-virtual-lessons,2022-10-11T00:00:11.000Z,"Pity the children: Hong Kong counts the cost of months of virtual lessons amid the Covid pandemic, with no time to play, make friends","Younger children struggling to adjust to school life, some teens have anxiety, emotional issuesExperts concerned about youngsters’ greater use of electronic devices, rise in child suicides too",Younger children struggling to adjust to school life some tens have anxiety emotional issuesExperts concerned about youngsters greater use of electronic devices rise in child suicide too
1,https://www.scmp.com/news/hong-kong/politics/article/3195492/hong-kong-protests-young-people-jailed-over-unrest-should,2022-10-10T23:30:10.000Z,"Hong Kong protests: young people jailed over unrest should be allowed to contribute to society on release, rehabilitation group says","Protesters jailed over 2019 anti-government protests should be used to help beat brain drain and be given more support to continue education behind bars, group saysProject Change appeals to professional bodies to relax rules on members and applicants who had fallen foul of the law",Protesters jailed over 2019 anti government protests should be used to help beat brain drain and be given more support to continue education behind bars group saysProject Change appeals to professional bodies to relax rules on members and applicant who had fallen foul of the law
2,https://www.scmp.com/news/hong-kong/politics/article/3195498/russian-consulate-hong-kong-given-advanced-notice-arrival,2022-10-10T15:40:48.000Z,Russian consulate in Hong Kong given advanced notice for arrival of superyacht linked to sanctioned billionaire,"Russia’s top envoy in city says he ‘fully agreed’ with decision by Hong Kong Marine Department to refrain from implementing sanctions issued by third countryArrival of the superyacht has led to a diplomatic row between the US and Beijing, with former saying Hong Kong’s reputation is at stake",Russia is top envoy in city says he fully agreed with decision by Long Long Marine Department to refrain from implementing sanction issued by third countryArrival of the superyacht has led to diplomatic row between the of and Seizing with former saying Long Long is reputation is at stake


#### normalize, text folding & first stopword removal

to identify words with same meaning

In [8]:
from collections import Counter
common_letters = Counter("Autobaby") & Counter("Baby") 
sum(common_letters.values())
len("Baby")

4

In [9]:
def normalize_text(text):
    tokenized_text = nlp(str(text).lstrip().rstrip())
    text = ""

    for token in tokenized_text:
        
          if (token.tag_ == "NN" or token.tag_ == "NE"):
            # Check if the noun is already singular
            if inflect.singular_noun(str(token)) == False:
                text+=" "+str(token).lower()
            else:  

                # Step 2
                # singularize a plural noun
                try:
                    singularized_text = inflect.plural(str(token).lower())
                    text+=" "+singularized_text.lower()

                except IndexError:
                    print(str(token))
                    text+=" "+str(token).lower()
                    pass   
          else:
            
            #filter out stopword based on part of speeach tagging
            # see https://machinelearningknowledge.ai/tutorial-on-spacy-part-of-speech-pos-tagging/
            if (token.pos_ == "DET" or token.pos_ == "ADP" or 
                token.pos_ == "ADP" or token.pos_ == "CONJ" or
                token.pos_ == "PRON"):
                pass
                
            else:
            
                #lemmatization sometimes leads to bad results as result
                #we make a check for common letter which should be at least the length of the original string -1
                #else we are not doing the lemmatization
                common_letters = Counter(str(token).lower()) & Counter(str(token.lemma_).lower())

                if sum(common_letters.values()) >= len(str(token))-1:
                    text+=" "+str(token.lemma_).lower()
                else:
                    text+=" "+str(token).lower()

    return text

df['processed_text'] = df['processed_text'].apply(lambda x: normalize_text(x))

In [10]:
df.head(3).style

Unnamed: 0,URL,datetime,heading,text,processed_text
0,https://www.scmp.com/news/hong-kong/education/article/3195500/pity-children-hong-kong-counts-cost-months-virtual-lessons,2022-10-11T00:00:11.000Z,"Pity the children: Hong Kong counts the cost of months of virtual lessons amid the Covid pandemic, with no time to play, make friends","Younger children struggling to adjust to school life, some teens have anxiety, emotional issuesExperts concerned about youngsters’ greater use of electronic devices, rise in child suicides too",younger children struggling to adjust school life ten have anxiety emotional issuesexpert concerned youngster greater use electronic device rise child suicide too
1,https://www.scmp.com/news/hong-kong/politics/article/3195492/hong-kong-protests-young-people-jailed-over-unrest-should,2022-10-10T23:30:10.000Z,"Hong Kong protests: young people jailed over unrest should be allowed to contribute to society on release, rehabilitation group says","Protesters jailed over 2019 anti-government protests should be used to help beat brain drain and be given more support to continue education behind bars, group saysProject Change appeals to professional bodies to relax rules on members and applicants who had fallen foul of the law",protester jailed 2019 anti government protest should be use to help beat brain drain and be give more support to continue education bar group saysproject change appeal professional bodies to relax rule member and applicant have fallen foul law
2,https://www.scmp.com/news/hong-kong/politics/article/3195498/russian-consulate-hong-kong-given-advanced-notice-arrival,2022-10-10T15:40:48.000Z,Russian consulate in Hong Kong given advanced notice for arrival of superyacht linked to sanctioned billionaire,"Russia’s top envoy in city says he ‘fully agreed’ with decision by Hong Kong Marine Department to refrain from implementing sanctions issued by third countryArrival of the superyacht has led to a diplomatic row between the US and Beijing, with former saying Hong Kong’s reputation is at stake",russia is top envoy city say fully agree decision long long marine department to refrain implementing sanction issue third countryarrival superyacht have lead diplomatic row and seizing former saying long long is reputation is stake


#### remove additional stop words

I decided to go here for a list that does not remove much words the standard NLTK or Spacy stop word list
Spacys or NLTK Stopword list for example would remove words that are actually important - as I found out later.
And therefore would rahter lead to more issues instead of giving meaningful help

In [12]:
# custom stoplist
customized_stopwords = ["and", "or", "either", "to", 
                        "such", "sure", "so", "also", "usually", "just",
                        "but", "however", "additionally", "furthermore", "while", "besides",
                        "are", "is", "be", "will", "make", "am", "has", "may", "might", "would", "can", "should",
                        "allow", "make", "use"]

def remove_stop_words(text):
    tokenized_text = str(text).split(" ")
    new_text = ""
    
    for token in tokenized_text:
        # Check if the word is in NLTKs stopword list
        if token in customized_stopwords:
              pass
        else:             
            new_text += " " + str(token)
    
    if len(new_text) > 5:
        return new_text
    else:
        return text

df['processed_text'] = df['processed_text'].apply(lambda x: remove_stop_words(x))

In [13]:
df.head(3).style

Unnamed: 0,URL,datetime,heading,text,processed_text
0,https://www.scmp.com/news/hong-kong/education/article/3195500/pity-children-hong-kong-counts-cost-months-virtual-lessons,2022-10-11T00:00:11.000Z,"Pity the children: Hong Kong counts the cost of months of virtual lessons amid the Covid pandemic, with no time to play, make friends","Younger children struggling to adjust to school life, some teens have anxiety, emotional issuesExperts concerned about youngsters’ greater use of electronic devices, rise in child suicides too",younger children struggling adjust school life ten have anxiety emotional issuesexpert concerned youngster greater electronic device rise child suicide too
1,https://www.scmp.com/news/hong-kong/politics/article/3195492/hong-kong-protests-young-people-jailed-over-unrest-should,2022-10-10T23:30:10.000Z,"Hong Kong protests: young people jailed over unrest should be allowed to contribute to society on release, rehabilitation group says","Protesters jailed over 2019 anti-government protests should be used to help beat brain drain and be given more support to continue education behind bars, group saysProject Change appeals to professional bodies to relax rules on members and applicants who had fallen foul of the law",protester jailed 2019 anti government protest help beat brain drain give more support continue education bar group saysproject change appeal professional bodies relax rule member applicant have fallen foul law
2,https://www.scmp.com/news/hong-kong/politics/article/3195498/russian-consulate-hong-kong-given-advanced-notice-arrival,2022-10-10T15:40:48.000Z,Russian consulate in Hong Kong given advanced notice for arrival of superyacht linked to sanctioned billionaire,"Russia’s top envoy in city says he ‘fully agreed’ with decision by Hong Kong Marine Department to refrain from implementing sanctions issued by third countryArrival of the superyacht has led to a diplomatic row between the US and Beijing, with former saying Hong Kong’s reputation is at stake",russia top envoy city say fully agree decision long long marine department refrain implementing sanction issue third countryarrival superyacht have lead diplomatic row seizing former saying long long reputation stake


In [20]:
 # Note we can create piplines to thest out out steps in sequences
    #  READ:
    # https://towardsdatascience.com/how-to-use-sklearn-pipelines-for-ridiculously-neat-code-a61ab66ca90d

def apply_normalize_text(df, column='text'):
    df[column] = pd.DataFrame(df[column].apply(normalize_text))
    return df



from sklearn.pipeline import Pipeline
#p = Pipeline([
            #enter different steps
#])

