In [1]:
# START ORIENTING MYSELF WITH NLP 

In [None]:
# TO COVER
# PROCESSING AND UNDERSTANDING TEXT
    #web scraping, data wrangling pre-processing
# FEATURE ENGINEERING AND TEXT REPRESENTATION 
    # speech tagging, parsing, constituency and depency parsing
    # named entity recognition
# ANALYSIS--EMOTIONAL AND SENTIMENT ANALYSIS
 #supervised and unsupervised learning

In [None]:
# DATA--inshort.com[tech,sports,world news]
# CRISP-DM--Cross Industry Standard Process for Data Mining
# Text Docs
# Text pre-processing
# Text Parsing & Exploratory Data Analysis
# Text representation and Feature Engineering
# Modeling and pattern mining
# Evaluation Deployment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
#nltk.download() # downloaded first time 
#import urllib.request
#response=urllib.request.urlopen('https://en.wikipedia.org/wiki/SpaceX')
#html=response.read()
#print(html)


In [2]:
# create a function that generates datasets from urls
def build_data(seed_urls): 
    '''
    This functions takes url or urls as input and generates
    a dataframe of the headline, headline article
    categorized by news category
    '''
    import requests
    from bs4 import BeautifulSoup
    news_data=[]
    news_columns = ['news_headline','news_article','news_category']
    for url in seed_urls:
        news_cat=url.split('/')[-1] # define category from url
        data=requests.get(url) # get url 
        soup=BeautifulSoup(data.content,'html.parser')# parse html
        # generate a dictionary comprehension 
        news_articles = [{'news_headline': headline.find('span', 
                    attrs={"itemprop": "headline"}).string,
                    'news_article': article.find('div', 
                    attrs={"itemprop": "articleBody"}).string,
                    'news_category': news_cat}
                    for headline, article in zip(soup.find_all('div', 
                                               class_=["news-card-title news-right-box"]),
                                 soup.find_all('div', class_=["news-card-content news-right-box"]))
                        ]
        news_data.extend(news_articles)
    df = pd.DataFrame(news_data)[news_columns]
    return df
    

In [3]:
# RUN function to get todays news by category 
seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world'] # 
today=build_data(seed_urls) # todays news 

In [4]:
# exploratory data analysis
today.news_category.value_counts()

technology    25
sports        25
world         24
Name: news_category, dtype: int64

In [5]:
# TEXT WRANGLING AND PRE-PROCESSING
    # removing HTML tags
    # expanding contractions
    # removing accented and special characters  
# need installation of spacy
 # install with pip3 -U spacy
 # python -m spacy download en # download language mode
   # english,german,spanish
 # restart the kennel if after above it does not work


In [120]:
def normalizer(corpus,strip_html=True,accent=True,special_char_removal=True,
    remove_digits=True,expansion=True,lower_case=True,
    stemmer=False,lemmatize=True,stopword=True):
    '''
    Functions removes html tags, accented words,
    and expands contractions(requires contraction dictionay,
    also removes_special_characters. default setting false
    '''
    import nltk
    from nltk.tokenize.toktok import ToktokTokenizer
    import re
    import spacy 
    from bs4 import BeautifulSoup
    import unicodedata
    from contractions import CONTRACTION_MAP 
    # this file is in usr/local/bin
    # downloaded from https://github.com/dipanjanS\
    # /practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/contractions.py
    import unicodedata
    # DEFINE executables and list from modules 
    nlp = spacy.load('en', parse=True, 
                     tag=True, entity=True)
    tokenizer = ToktokTokenizer()
    stopword_list = nltk.corpus.stopwords.words('english')
    stopword_list.remove('no')
    stopword_list.remove('not')
    expansion_map=CONTRACTION_MAP
    def expand_match(contraction):
        #print('print this',contraction)
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = expansion_map.get(match) if expansion_map.get(match)\
        else expansion_map.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    
    try: 
        normalized_corpus=[]
        for doc in corpus[:5]:
           # print(doc)
            if strip_html:# strip HTML
                soup = BeautifulSoup(doc, 'html.parser')
                doc = soup.get_text()
                
            if accent:# Removing accented characters 
                doc=unicodedata.normalize('NFKD',doc).\
                    encode('ascii', 'ignore').\
                    decode('utf-8', 'ignore')
            if special_char_removal:# remove special xter and numbers
                special_char_pattern = re.compile(r'([{.(-)!}])')
                doc = special_char_pattern.sub(" \\1 ", doc)
                if remove_digits:
                    pattern = r'[^a-zA-z\s]'
                    doc = re.sub(pattern, '',doc)       
                else:
                    pattern = r'[^a-zA-z0-9\s]'
                    doc = re.sub(pattern, '',doc)
            if expansion and expansion_map:# expand contractions
                contraction_pattern = re.compile('({})'.\
                format('|'.join(expansion_map.keys())), 
                flags=re.IGNORECASE|re.DOTALL)
                expanded_text = contraction_pattern.sub(expand_match,doc)
                doc = re.sub("'", "" , expanded_text)
            if stemmer:# get word base form
                ps=nltk.porter.PorterStemmer()
                doc=' '.join([ps.stem(word) for word in doc.split()])
            if lemmatize:# remove word affixes get word base form
                text=nlp(doc)
                doc=' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
                #print(doc)
            if stopword:# remove words with little or no significance
                tokens=tokenizer.tokenize(doc)
                tokens=[token.strip() for token in tokens]
                if lower_case:
                    filtered_tokens=[token for token in tokens\
                                     if token not in stopword_list]
                else:
                    filtered_tokens = [token for token in tokens
                                       if token.lower() not in stopword_list]
                doc = ' '.join(filtered_tokens)  
                # remove extra white space
                doc = re.sub(' +', ' ',doc)
                normalized_corpus.append(doc)
        return normalized_corpus               
    except Exception as e:
        print(e)

In [121]:
today=build_data(seed_urls)# scrape data 
#combine news headline with article
today['full_text'] = today.news_headline.map(str) +\
  '. ' + today.news_article 


In [122]:
p

['oppo launch new f pro mpmp dual rear camera allnew oppo f pro enable user capture low light moment ai driven mpmp rear camera It feature panoramic screen display vooc take flash charge technology step ahead smartphone also power p chipset faster smart robust performance',
 'oneplus march madness campaign offer various deal oneplus oneplus announce march madness campaign part oneplus available several offer across amazon oneplus reliance digital outlet croma outlet oneplus offline store offer include nocost emi month discount exchange among various additionally offer special giveaway form accessory bundle',
 'google find pay man less woman job google internal audit find company pay man work level software engineers less discretionary fund woman role google end pay million pay adjustment employee year include new employee however google not specify many employee man',
 'indian govt website hack post jk attack report indian government website critical system hack within hour pulwama ter

In [None]:

            if stopword:# remove words with little or no significance
                tokens=tokenizer.tokenize(doc)
                tokens=[token.strip() for token in tokens]
                if lower_case:
                    filtered_tokens=[token for token in tokens\
                                     if token not in stopword_list]
                else:
                    filtered_tokens = [token for token in tokens
                                       if token.lower() not in stopword_list]
                doc = ' '.join(filtered_tokens)  
            # remove extra white space
                doc = re.sub(' +', ' ',doc)
            
                normalized_corpus.append(doc)
            
            return normalized_corpus