### Visit the Inshorts website and inspect the HTML of the news pages to find the elements that need to be scraped

In [15]:
# Importing required libraries and modules for scraping and analysis
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [16]:
# Function to crawl news from inshorts and build dataframe

def build_dataset(seed_urls):
    """Scrape Inshorts category pages into a single dataframe.

    Parameters
    ----------
    seed_urls : iterable of str
        Category page URLs. The last path segment of each URL is used as the
        category label of every article found on that page.

    Returns
    -------
    pandas.DataFrame
        One row per scraped article with the columns ``news_headline``,
        ``news_article`` and ``news_category`` (in that order). When nothing
        is scraped the frame is empty but still has those three columns.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # rstrip so that a trailing slash does not produce an empty category
        news_category = url.rstrip('/').split('/')[-1]
        # requests waits forever by default; bound the wait so a stalled
        # connection cannot hang the notebook
        data = requests.get(url, timeout=30)
        soup = BeautifulSoup(data.content, 'html.parser')
        headline_cards = soup.find_all('div', class_=["news-card-title news-right-box"])
        article_cards = soup.find_all('div', class_=["news-card-content news-right-box"])
        for headline_card, article_card in zip(headline_cards, article_cards):
            headline = headline_card.find('span', attrs={"itemprop": "headline"})
            article = article_card.find('div', attrs={"itemprop": "articleBody"})
            # get_text() instead of .string: .string is None as soon as the
            # tag holds more than one child (e.g. nested markup)
            news_data.append({'news_headline': headline.get_text(),
                              'news_article': article.get_text(),
                              'news_category': news_category})

    # Passing `columns` fixes the column order and keeps the schema intact
    # even when no article was scraped (the old df[[...]] raised KeyError).
    return pd.DataFrame(news_data, columns=columns)
    

In [17]:
# Category pages to crawl: technology, sports and world
seed_urls = ['https://inshorts.com/en/read/' + category
             for category in ('technology', 'sports', 'world')]

In [18]:
# Crawl every seed URL into one dataframe and preview the first ten rows
news_df = build_dataset(seed_urls=seed_urls)
news_df.head(n=10)

Unnamed: 0,news_headline,news_article,news_category
0,Twitter down for users worldwide; notification...,Twitter and TweetDeck went down for some users...,technology
1,YouTube allegedly demonetises videos with 'gay...,"Three YouTubers, after testing over 15,000 com...",technology
2,Tesla buys self-driving car AI vision technolo...,Elon Musk-led electric automaker Tesla has rep...,technology
3,TikTok now past Instagram in India in terms of...,Facebook CEO Mark Zuckerberg said that he thin...,technology
4,Facebook fumbles its duty to protect democracy...,US Democratic presidential candidate Elizabeth...,technology
5,Google contractors targeted 'dark skin' users ...,"Some Google contractors, working for face unlo...",technology
6,Apple CEO opposes Trump's immigration policy i...,"Apple and its CEO Tim Cook, along with retail ...",technology
7,"Hacked 6,000 accounts for sexual content, admi...","A former Yahoo engineer, 34-year-old Reyes Dan...",technology
8,Startup sues Apple for trademark abuse over 'M...,US-based app developer startup Social Technolo...,technology
9,Facebook can defeat US attempt to break it up:...,"Facebook CEO Mark Zuckerberg, in leaked audio ...",technology


In [19]:
# Preview the first rows of the sports category
news_df.loc[news_df['news_category'] == 'sports'].head()

Unnamed: 0,news_headline,news_article,news_category
25,"Opening for first time in Test cricket, Rohit ...","Opening for the first time in Test cricket, In...",sports
26,Was busy winning golds for India: Advani on wa...,"Indian cueist Pankaj Advani, who recently took...",sports
27,Federer asks for B'wood film suggestions on Tw...,Swiss tennis star Roger Federer on Wednesday t...,sports
28,15-yr-old Shafali trained as a boy as no Rohta...,"Sanjeev Verma, father of India's youngest T20I...",sports
29,Bookie who asked KPL bowler to give more than ...,Bengaluru Police on Wednesday said it has arre...,sports


In [20]:
# new_df = build_dataset(['https://inshorts.com/en/read'])
# new_df

In [21]:
# Number of scraped articles per category
news_df.loc[:, 'news_category'].value_counts()

technology    25
sports        25
world         25
Name: news_category, dtype: int64

# Text Wrangling & Pre-processing

## Loading spaCy (setup instructions are given in the README file)

In [22]:
#Required libraries and files
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

In [23]:
spacy.load('en_core_web_md')

<spacy.lang.en.English at 0x7f6d4f0da160>

In [24]:
# Shared NLP resources used by the pre-processing helpers below.
# spacy.load() has no parse/tag/entity parameters (they are ignored by
# spaCy 2 and rejected by spaCy 3), so the model is loaded with its defaults.
nlp = spacy.load('en_core_web_md')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# Keep the negations: dropping "no"/"not" would flip the meaning of a sentence
stopword_list.remove('no')
stopword_list.remove('not')

#### Removing HTML tags

In [25]:
# Strip HTML tags if present
def strip_html_tags(text):
    """Return the text content of ``text`` with any HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and only
    the text nodes are kept.
    """
    return BeautifulSoup(text, "html.parser").get_text()

strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

#### Removing accented characters

In [26]:
# Remove accented characters
def remove_accented_chars(text):
    """Return ``text`` reduced to plain ASCII characters.

    The string is NFKD-normalised, encoded to ASCII while ignoring every
    character that cannot be represented, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

#### Expanding Contractions

In [27]:
# Expanding the contractions using CONTRACTION_MAP dictionary
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction to its expansion. Matching is case-insensitive;
        the first character of the matched text is kept so that the original
        capitalisation survives the replacement.

    Returns
    -------
    str
        The expanded text with every remaining apostrophe removed.
    """
    # Regex alternation takes the first branch that matches, so longer keys
    # must come first; otherwise a short contraction would win over a longer
    # one that starts with it. Empty keys are dropped because they would
    # match at every position.
    keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)

    if keys:
        # Escape the keys so they are matched literally, never as regex syntax
        contractions_pattern = re.compile('({})'.format('|'.join(map(re.escape, keys))),
                                          flags=re.IGNORECASE | re.DOTALL)
        # Case-insensitive fallback for matches whose exact casing is not a key
        lowercase_mapping = {key.lower(): value
                             for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or lowercase_mapping.get(match.lower()))
            if expanded_contraction is None:
                # No usable expansion: leave the matched text untouched
                return match
            # Keep the original first character to preserve capitalisation
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)

    # Drop whatever apostrophes are left (e.g. possessives, quotes)
    return re.sub("'", "", text)

expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

#### Removing special characters

In [28]:
#Removing special characters
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default False
        When True, digits are removed as well.

    Returns
    -------
    str
        ``text`` without the removed characters; whitespace is left as is.
    """
    # The range must be A-Z: "A-z" also spans the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `), which would therefore survive the cleaning.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

#### Stemming

In [29]:
# Stemming technique
def simple_stemmer(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    The stems are joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = []
    for word in text.split():
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

#### Lemmatization

In [30]:
# Lemmatization technique
def lemmatize_text(text):
    """Replace each token of ``text`` with its lemma from the module-level ``nlp`` pipeline.

    Tokens whose lemma is the placeholder ``'-PRON-'`` keep their original
    text. The resulting tokens are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crashed yesterday , ours crash daily'

#### Removing Stopwords

In [31]:
#Stopwords removal
def remove_stopwords(text, is_lower_case=False):
    """Drop the tokens of ``text`` that appear in the module-level ``stopword_list``.

    The text is split with the module-level ``tokenizer`` and each token is
    stripped of surrounding whitespace. When ``is_lower_case`` is False the
    tokens are lower-cased for the comparison only; the kept tokens retain
    their original casing. The kept tokens are joined with single spaces.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

## Putting it all together

In [32]:
# Function to pre-process the text in one go
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Run the text pre-processing steps over every document of ``corpus``.

    The enabled steps are applied to each document in this order: HTML
    stripping, accented character removal, contraction expansion,
    lower-casing, newline replacement (always), lemmatization, special
    character removal, whitespace collapsing (always), stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalise.
    html_stripping, contraction_expansion, accented_char_removal,
    text_lower_case, text_lemmatization, special_char_removal,
    stopword_removal : bool
        Switch the corresponding step on or off.
    remove_digits : bool
        Forwarded to ``remove_special_characters``; only has an effect when
        ``special_char_removal`` is True.

    Returns
    -------
    list of str
        The normalised documents, in input order.
    """
    # Compiled once instead of once per document.
    # Only CR/LF: the former class [\r|\n|\r\n] also swallowed literal '|'.
    newline_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped: an unescaped "(-)" inside a character class is
    # the range '('..')' and never matched '-' itself, so hyphenated words
    # were glued together once the special characters were removed.
    special_char_pattern = re.compile(r'([{.(\-)!}])')
    whitespace_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # pad selected punctuation with spaces first so that the words
            # around it stay separated after the punctuation is removed
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # collapse runs of spaces
        doc = whitespace_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [33]:
# Build one text field per story: "<headline>. <article body>"
headline_text = news_df["news_headline"].map(str)
news_df['full_text'] = headline_text + '. ' + news_df["news_article"]

# Normalise the combined text and keep a plain-list copy of the result
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = news_df['clean_text'].tolist()

# Show the raw and the cleaned text of one sample story
news_df.iloc[1][['full_text', 'clean_text']].to_dict()


{'full_text': 'YouTube allegedly demonetises videos with \'gay\', \'lesbian\' in title. Three YouTubers, after testing over 15,000 common words and phrases on YouTube\'s automated systems, claimed that the platform demonetises videos with words like "gay". "Gay" and "lesbian" when replaced with "happy" and "friend" in video titles changed the videos\' status to advertiser-friendly, the YouTubers claimed. YouTube said that there is no list of LGBTQ words that trigger demonetisation.',
 'clean_text': 'youtube allegedly demonetises video gay lesbian title three youtuber test common word phrase youtubes automate system claim platform demonetises video word like gay gay lesbian replace happy friend video title change video status advertiser friendly youtubers claim youtube say no list lgbtq word trigger demonetisation'}