#### Importing the required Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from nltk.tokenize import sent_tokenize
import spacy
import textacy
import datetime
nlp = spacy.load('en_core_web_sm')

#### This Function specifically scrapes out the link and get the data from the news link displayed on Google

In [22]:
def content(link):
    
    """
        This function goes inside the link of every text and scraps data from the each news headline
        in parallel
        
    """
    try:
        # Requesting the Scraped Link
        time.sleep(2)     # Reducing number of pings to avoid ip blocks though not neccessary at this stage
        source = requests.get(link[1:], verify = False)
        
        # Check if the request is successful
        if source.status_code == 200:
            
            # Parsing the link text
            source = BeautifulSoup(source.text, 'lxml')
            context = ' '
            
            # Finding the Paragraph text using 'p' tag
            for data in source.find_all('p'):
                
                # Implementing a bit of cleaning text at initial stage
                if len(data) < 2:
                    data = data.text.replace('  ', '')
                    context = context + ' ' + data.rstrip().lstrip().rstrip('\n').lstrip('\n')                   
        else:
            
            print('Requested site for content did not respond. Status Code is ', source.status_code)
    except Exception as e:
        print(str(e))
            
    return context

#### This function helps in retrieving the time and news source

In [27]:
def retrieve_time(source):
    
    for element in source.find_all('div', class_ = 'slp'):
        #for elem in element.find_all('span'):
        news_source, time = element.text.split('-')[0].rstrip().lstrip(), element.text.split('-')[1].rstrip().lstrip()
        print(news_source, time, type(news_source))
        #print('{}-{}'.format(news_source, time))
        if 'hours' in time or 'minutes' in time or 'seconds' in time or 'hour' in time or 'minute' in time or 'second' in time:
            return news_source, str(datetime.datetime.utcnow().date())
        elif 'days' in time or 'day' in time:
            return news_source, str((datetime.datetime.utcnow() - datetime.timedelta(days = int(time.split()[0]) + 1)).date())
        else:
            return news_source, time

#### This function requests the original google page for the search results

In [28]:
def scrap_headline_window(keys, number_of_news, days):
    
    """
        This function hits the google page and scraps the 
    """

    # Defining a dataframe to store the data and later saving it to excel
    test_dataframe = pd.DataFrame(columns = ['News', 'Content', 'Link', 'News_Source', 'Time'])
    
    # Putting it into try-catch to handle any error    
    try:
        
        # Looping through the pages
        for page in range(0, number_of_news, 10):
            print('Scraping_headline_window')
        
        # Defining a baseline URL 
            url= 'https://www.google.com/search?q=insurance+%22{}%22&rlz=1C1GGRV_enIN790IN790&tbs=qdr:d{}&tbm=nws&ei=FlJhXLiKNNDikgXCkZTwDA&start={}&sa=N&ved=0ahUKEwi47dT2v7PgAhVQsaQKHcIIBc4Q8tMDCFE&biw=1440&bih=758&dpr=1'.format(keys, days, page)

        # Hitting the URL and Parsing it
            source = requests.get(url)
            time.sleep(5)      # Reducing overload condition on the server else the IP might get blocked

            # Checking the Status code for successful request
            if source.status_code == 200:
                print('Request Successful')

                source = BeautifulSoup(source.text, 'lxml')
                # Scraping the news and the respective links
                for elements in source.find_all('div', class_ = 'g'):
                    data = []
                    heading = elements.find('h3').text
                    link = '.' + elements.find('a')['href'].split('=')[1][:-3]
                    context = cleaning_text(content(link))
                    news_source, event_time = retrieve_time(elements)
                    data.append([heading, context, link, news_source, event_time])
                    sample_dataframe = pd.DataFrame(data = data, columns = ['News', 'Content', 'Link', 'News_Source', 'Time'])
                    test_dataframe = pd.concat([test_dataframe, sample_dataframe], axis = 0)
                    print(data)
                    print()


            else:

                print('Request failed with status code', source.status_code())

    except Exception as e:
        
        print(str(e))
        pass

        # Writing to an Excel File 

#    test_dataframe.to_excel('D:\\Users\\kekishor\\Desktop\\project\\filtered_extract\\' + name + '.xlsx')    
    return test_dataframe

#### Implementing the Cleaning Process

In [29]:
def cleaning_text(content):
    
    """
    
        Cleaning the text process involves the following steps
            
            -- Implementation of striping spaces and new line
                    This step has been implemented in the function 'Content' while scraping the data
            -- Implementation of sentence tokenizer 
                    To process each sentence individually
            -- Implementation of stopwords
                    These keywords are manually prepared for the obvious keys which shows waste data
            -- Sentence Less than 4 words should be removed. High chances of being an ad
            
    """
    filtered_message = ''  # Resetting the filtered message for every token
    
    # Implementing the doc for using it to split using spacy
    
    try:          # Wrapping this because some of the content field is empty
        
        doc = nlp(content)
        for word in doc.sents:
            
            # Wrapping in try-catch to ensure exception handling
            try:
                
                # Since the word is of span class, thus type casting it to be a string
                word = str(word)
                spam = 0      # Setting the spam flag, whenever it is set to 1, the sentence will be discarded
                    
                for ad_word in stopword:     # Checking for stopword
                    if (ad_word in word.lower()):
                        spam = 1
                        break
                        
                if len(word.split())< 5:  # Checking for length of the sentence
                    spam = 1
                
                if spam == 0:     # If flag does not changes, then adding it to the filtered message
                    filtered_message = filtered_message + word + ' '
                    
            except Exception as e:
                
                #print(str(e))
                pass
            
        #print(filtered_message)
        filtered_message = filtered_message.encode('ascii', 'ignore').decode('unicode_escape')
        filtered_message = textacy.preprocess.remove_punct(filtered_message, marks = '!,:;-()[]*{}\/')
        filtered_message =  textacy.preprocess_text(filtered_message, fix_unicode=True, lowercase=True, transliterate=True, no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True, no_punct=True, no_contractions=True, no_accents=True)
        return filtered_message
    
    except Exception as e:          # Returning space when the content is empty 
        
        print('Exception Occured')
        print(str(e))
        #print(content)
        return ' '

#### Amount of window for which we need the data (No of Days)

In [6]:
days = int(input('Enter the Window for which we need to scrap the news '))

#### Specify the Number of News Headline which needs to be scraped

In [7]:
number_of_news = int(input('Enter the number of news headlines to be stored '))

#### Keys for which we need the data

In [8]:
keywords = [input('Enter the keyword that you would like to search')]

#### Execute this block only if you haven't executed the above block else the value will be overriden

In [12]:
keywords = ['ACCIDENT', 'PREMIUM' , 'SERVICES', 'PRODUCT', 'RISK', 'RATE', 'BROKER', 
            'BUSINESS', 'CANCELLATION', 'CLAIM', 'CLAUSE', 'COVERAGE', 'INSURANCE', 'FORGERY', 
            'GRACE PERIOD', 'INDEMNITY', 'INSURABILITY', 'LAPSE', 'LESSOR', 'LIABILITY', 'LOSS', 
            'MORTGAGE', 'POLICY', 'REIMBURSEMENT', 'REINSTATEMENT', 'RENEWAL', 'SETTLEMENT', 'VALUATION', 
            'WAIVER', 'ACQUISITION']

#### This stopwords file can be enhanced more to enrich the data

In [9]:
with open('stopwords.txt') as ads_word:
    
    """    
        The sentences containing these words will removed from the content we got        
    """   
    
    stopword = []
    for i in ads_word.readlines():
        if len(i) > 1:
            stopword.append(i)

#### Loop all the company names with each of these keywords (Might take a bit)

In [30]:
final_data = pd.DataFrame(columns = ['News', 'Content', 'Link', 'News_Source', 'Time'])
for keys  in keywords:
    final_data = pd.concat([final_data, scrap_headline_window(keys, number_of_news, days)], axis = 0)
# final_data.set_index('News', inplace = True)


# # Process to write the data into the excel file as a backup for future purposes
# writer = pd.ExcelWriter(r'D:\\Users\\kekishor\\Desktop\\project\\filtered_extract\\' + company + '.xlsx', engine='xlsxwriter')
# final_data.to_excel(writer)
# workbook = writer.book
# worksheet = writer.sheets['Sheet1']
# wrap_format = workbook.add_format({'text_wrap' : True})

# worksheet.set_column('A:A', 70, wrap_format)
# worksheet.set_column('B:B', 100, wrap_format)
# worksheet.set_column('C:C', 80, wrap_format)
# worksheet.set_column('D:D', 20, wrap_format)

# writer.save()
# writer.close()

Scraping_headline_window
Request Successful




Moneycontrol.com 4 days ago <class 'str'>
[['A more holistic approach needed to to simplify health insurance claims', 'last month i attended a conference organised by a health insurer during a panel discussion the founder of a large broker shared his personal experience of making a health insurance claim in the us he said that when the doctor gave the green signal for discharge his family was free to go i did not have to wait for a tpa desk to send a discharge summary insurer to ask questions the hospital to answer and finally haggle over deductions i could just pick up my bag and leave he said in contrast the discharge in indian hospitals is difficult patients wait between 3 to 6 hours for approvals to come in sometimes patients give up pay the hospital and leave only to go through a long claim reimbursement process later the panel deliberated a few interesting and some contentious issues but all had a common view we are meant to serve as a bridge between the policyholder care receive



ABC News 1 day ago <class 'str'>
[["North Queensland flood victims need banks and insurers to be 'good ...", 'banks and insurers should lift their socks and show compassion to north queenslanders affected by the floods says deputy premier jackie trad insurers had received 13 560 claims as of 10 00am on sunday with losses estimated at usd165 million the insurance council of australia ica said the council said insurers had already paid more than usd16 million in support and emergency accommodation to policyholders the people of townsville the people in the north west are traumatised and they are doing it tough right now ms trad said what they want from their insurance companies and what they want from their banks is sympathy and they want compassion and they want a responsive organisation that understands that they are going through a traumatised event how many more natural disasters how many more queenslanders are going to be left without home and contents having to rebuild their lives 



RACQ Live 1 day ago <class 'str'>
[['Influx of insurance claims for flood damage', 'follow us as locals continued to count the cost of the north queensland floods racq urged affected members who had not already started their insurance claim to kick off the process as of sunday racq had received more than 2100 insurance claims for damage to property and motor vehicles townsville suburbs including idalia rosllea and hermit park were among those hardest hit making up almost 850 of the total claims alone racq spokesperson paul turner said the club expected further claims from these areas as more people returned home to assess the damage the flood clean up what to do when you return home as more people assess property damage were expecting to hear from more of our members in these locations mr turner said our claims assessors and builders have begun to assess flood damage in townsville and were working as fast and as hard as possible to get to all our members', '.https://live.racq.com.au/20



Connected to India 1 hour ago <class 'str'>
[['Pravasi insurance settles 812 claims to the tune of INR 633 million', 'as many as 812 insurance claims of deaths and disabilities among the indian professionals emigrating to various countries have been settled in past four years under pravasi bharatiya bima yojana scheme responding to a query in rajya sabha indian minister of state for external affairs gen retired v k singh informed the house that an amount to the tune of inr 633 54 million has been disbursed by the insurance companies against these claims in the given period on that other hand the companies also collected a premium of inr 802 55 million during the same period from 1 april 2014 till 31 december 2018 under pravasi bharatiye bima yojana a mandatory insurance scheme for all emigration check required ecr category professionals and workers going to ecr countries the question was asked by anto antony punnathaniyil a congress mp from kerala who wanted to know whether the governm



The Hindu 3 days ago <class 'str'>





Insurance Business New Zealand 4 days ago <class 'str'>
[['Revealed: Weirdest NZ pet insurance claims of 2018', 'southern cross pet insurance has released its 2018 most unusual pet insurance claims report and it features claims ranging from a french bulldog eating a roll of dental floss to a veterinarian treating their first case of an intersex cat the list shows how even everyday accidents can end up costing a lot of money at the veterinary clinic southern cross pet insurance general manager anthony mcphail said most of the claims we process each year are for fairly typical illnesses and routine checks but we also receive some that surprise us particularly the range of ways pets find to test the limits of their digestive system he noted mcphail explained while claims such as these are sometimes amusing they can require a serious procedure such as surgery which is stressful for both the pet and their owner the insurers 10 most unusual petcare claims include firm hit by catastrophes but



Fin24 27 Jan 2019 <class 'str'>
[['Insurance claims 101: Your questions answered', 'links to the websites of major municipalities which have published schedules how jeff bezos could take down the national enquirer if he wanted to sign up to receive fin24 s top news in your inbox every morning filing an insurance claim may not have been on your to do list but if\nyou do end up having to do it ideally you want as few hiccups as possible nthabiseng moloi miway insurance head of marketing and brand gave fin24\ninsight into some of the more commonly asked questions about insurance claims i\ninstalled a vehicle tracker so why was my claim rejected on the basis that my\ncar did not meet security requirements not all\ntracking devices are created equal says moloi certain insurers may insist on\nspecific models given the variance in functionality whereas some track speed others are optimised to detect impact and others can locate the vehicle ensure that\nyour tracking device is always in workin



Insurance Business UK 4 hours ago <class 'str'>
[['Total Claims Solutions boosts board', 'total claims solutions which launched a rebranded website in late january in time for the firms scottish expansion has added a new face to its roster of directors nigel allen former chief executive of slater gordon solutions motor has been tapped to become part of the total claims solutions board the move comes ahead of april 01 the day claims management companies cmcs including those in scotland start being regulated by the financial conduct authority fca oversight of cmcs will be transferred from the claims management regulator which is currently responsible for the regulatory regime in england and wales the regulation will be extended to scotland as per the financial guidance and claims act 2018 i am thrilled to have joined the board at total claims solutions commented allen who brings more than two decades of claims management experience to the role this is such a pivotal moment for claimants 



Canstar 7 hours ago <class 'str'>
[["What the banking royal commission's report could mean for insurance", 'in the final report commissioner kenneth hayne outlined a total of 15 recommendations relating to the insurance sector including an outright ban of the use of cold calling or hawking to sell insurance products during the commissions public hearings stories surfaced demonstrating aggressive sales tactics used by some insurance agents to make a sale including a case where a man with down syndrome was signed up for more than usd100 000 worth of life insurance policies he did not understand hayne said these cases showed why unsolicited sales calls should be prohibited as they could allow the unscrupulous to prey upon the unsuspecting canstar research and ratings manager mitchell watson said if this ban was to come into effect then insurers would only be allowed to discuss a product directly related to a consumers enquiry that will mean you the consumer are at a point where you may ha



The Tribune 6 days ago <class 'str'>
[['Hisar farmers on dharna over insurance claims', 'infighting in the city unit of the bjp turned murkier after a woman councillor of the party farmila reportedly tried to assault mayor rajesh kalia with a mike at a public function at shahpur colony near dadu majra today became the first hip hop track to win a song of the year grammy on sunday while lady gaga dominated the early awards and former us first lady michelle obama made a surprise appearance at the event deepender deswal tribune news service hisar february 4 the farmers said they were given an assurance by the district authorities on october 1 last year that they would be provided claims for their damaged crops but nothing had been done after untimely rain and hail damaged cotton and paddy crops in their fields farmers applied for insurance claims which were turned down on the plea that they had not deposited premium and their crops were not insured enraged farmers locked the bank branch i

In [31]:
final_data

Unnamed: 0,News,Content,Link,News_Source,Time
0,A more holistic approach needed to to simplify...,last month i attended a conference organised b...,.https://www.moneycontrol.com/news/business/pe...,Moneycontrol.com,2019-02-06
0,North Queensland flood victims need banks and ...,banks and insurers should lift their socks and...,.https://www.abc.net.au/news/2019-02-10/queens...,ABC News,2019-02-09
0,Influx of insurance claims for flood damage,follow us as locals continued to count the cos...,.https://live.racq.com.au/2019/02/influx-of-in...,RACQ Live,2019-02-09
0,Pravasi insurance settles 812 claims to the tu...,as many as 812 insurance claims of deaths and ...,.https://www.connectedtoindia.com/pravasi-insu...,Connected to India,2019-02-11
0,Weather queries to back insurance claims on th...,the regional meteorological centre in the last...,.https://www.thehindu.com/news/cities/chennai/...,The Hindu,2019-02-07
0,Revealed: Weirdest NZ pet insurance claims of ...,southern cross pet insurance has released its ...,.https://www.insurancebusinessmag.com/nz/news/...,Insurance Business New Zealand,2019-02-06
0,Insurance claims 101: Your questions answered,links to the websites of major municipalities ...,.https://www.fin24.com/Money/insurance-claims-...,Fin24,27 Jan 2019
0,Total Claims Solutions boosts board,total claims solutions which launched a rebran...,.https://www.insurancebusinessmag.com/uk/news/...,Insurance Business UK,2019-02-11
0,What the banking royal commission's report cou...,in the final report commissioner kenneth hayne...,.https://www.canstar.com.au/car-insurance/bank...,Canstar,2019-02-11
0,Hisar farmers on dharna over insurance claims,infighting in the city unit of the bjp turned ...,.https://www.tribuneindia.com/news/haryana/his...,The Tribune,2019-02-04


# ---------------------------------------------------------------------------------------

['claims']