<a href="https://colab.research.google.com/github/hrushikeshsahu19/Bbc-news-article-extraction_AND_Implement-N-gram-next-phrase-prediction-model/blob/main/bbc_com_news__article_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

**STEP 1**- Getting Started

In [None]:
# Getting started:
#   1. Download the dataset (Essay_samples.csv) from your email.
#   2. Copy this notebook into your own Google Colab account.
#   3. Run this cell and upload the dataset when prompted.

import io

import pandas as pd
from google.colab import files

uploaded = files.upload()

# The upload result is indexed by the uploaded file's name; wrap the raw
# bytes in a file-like object so pandas can parse them as CSV.
essay_csv_bytes = uploaded['Essay_samples - Sheet1.csv']
df = pd.read_csv(io.BytesIO(essay_csv_bytes))

Saving Essay_samples - Sheet1.csv to Essay_samples - Sheet1.csv


**STEP 2** - Web Scraping

In [None]:
# Use the BeautifulSoup library to scrape up to 50 articles from www.bbc.com.
# The scraped articles are then combined with the essay dataset to create a hybrid database.

#! pip install beautifulsoup4
#! pip install requests

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

MAX_ARTICLES = 50
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs the cell forever

url = 'https://www.bbc.com/'
response = requests.get(url, timeout=REQUEST_TIMEOUT)
# Fail loudly if the home page could not be fetched instead of silently scraping an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Collect article links: site-relative "/news..." paths that end in a digit.
links = soup.find_all("a")
news_urls = []
for link in links:
    href = link.get("href")
    # <a> tags without an href attribute yield None, which has no .startswith
    if not href:
        continue
    if href.startswith("/news") and href[-1].isdigit():
        # urljoin avoids the double slash produced by 'https://www.bbc.com/' + '/news/...'
        news_url = urljoin(url, href)
        # the same link can appear several times on the page; keep the first occurrence only
        if news_url not in news_urls:
            news_urls.append(news_url)

news_urls = news_urls[:MAX_ARTICLES]

# Download every article and keep the text of its body paragraphs.
news_articles_list = []
for article_url in news_urls:
    res = requests.get(article_url, timeout=REQUEST_TIMEOUT)
    soup1 = BeautifulSoup(res.content, 'html.parser')
    # NOTE(review): this class name looks auto-generated and may change when the site is
    # redeployed — confirm it still matches if articles come back empty.
    paragraphs = soup1.find_all('p', {"class": "ssrcss-1q0x1qg-Paragraph eq5iqo00"})
    # Join with newlines so words at paragraph boundaries are not fused together and the
    # later paragraph split on '\n' sees one paragraph per <p> element.
    news_articles_list.append("\n".join(p.text for p in paragraphs))

data = pd.DataFrame(news_articles_list, columns=['Essay Text'])

# Create a new pandas table holding both the essay database and the scraped articles.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index renumbers the
# rows 0..n-1, which is what the reset_index/drop pair used to do.
df_new = pd.concat([df, data], ignore_index=True)
df_new.shape

(149, 1)

**STEP 3** - Spacy dependency parsing

In [None]:
#import Spacy library
import string
from spacy.symbols import *
import spacy
# Load the English pipeline once; the parsing helpers in this notebook call it as the global `nlp`.
# NOTE(review): the 'en' shortcut name looks like the spaCy 2.x convention — confirm it resolves
# with the installed spaCy version (newer releases expect a full package name such as 'en_core_web_sm').
nlp = spacy.load('en')

#In case you are unfamiliar with dependency parsing, review the document attached with the mail.
def dep_parsing(t):
  """Extract the noun phrases of a text, in order of first appearance.

  The text is lower-cased and split into paragraphs on newlines. Each
  paragraph is parsed with the global `nlp` pipeline, and every sentence is
  parsed again on its own. For each noun chunk of a sentence two strings are
  collected: the chunk text itself, and the text spanning from the left edge
  to the right edge of the chunk's root token.

  Args:
    t: the raw text (str).

  Returns:
    A list of unique phrase strings, ordered by first occurrence.
  """
  # A dict is used as an ordered set: duplicates collapse, insertion order is kept.
  # (A plain set would return the phrases in arbitrary order, which destroys the
  # sequence information a next-phrase model is trained on.)
  phrases = {}
  for para in t.lower().split('\n'):
    doc = nlp(para)
    for sentence in doc.sents:
      s = nlp(sentence.text)
      for nc in s.noun_chunks:
        phrases[nc.text] = None
        root = nc.root
        # Token indices (.i) are relative to the sentence-level parse `s`, so the
        # slice must be taken from `s`; slicing the paragraph-level `doc` returns
        # unrelated text for every sentence after the first.
        phrases[s[root.left_edge.i:root.right_edge.i + 1].text] = None

  return list(phrases)



# Create a sequential list of phrases as they appear in the text:
# first one phrase list per row, then all rows flattened into a single list.
df_new['words'] = df_new['Essay Text'].apply(dep_parsing)
words = [phrase for row_phrases in df_new['words'] for phrase in row_phrases]

**STEP 4** - N-gram next phrase prediction model using either ML/DL

In [None]:
#Implement N-gram next phrase prediction model.
import numpy as np
from keras.optimizers import Adam

# Vocabulary: every distinct phrase, mapped to its position in `unique_words`.
unique_words = np.unique(words)
unique_word_index = {phrase: idx for idx, phrase in enumerate(unique_words)}

# Sliding windows over the phrase sequence: WORD_LENGTH consecutive phrases
# form the context, the phrase right after them is the prediction target.
WORD_LENGTH = 5
window_starts = range(len(words) - WORD_LENGTH)
prev_words = [words[start:start + WORD_LENGTH] for start in window_starts]
next_words = [words[start + WORD_LENGTH] for start in window_starts]
print(prev_words[0])
print(next_words[0])

# One-hot encode contexts (X) and targets (Y).
vocab_size = len(unique_words)
X = np.zeros((len(prev_words), WORD_LENGTH, vocab_size), dtype=bool)
Y = np.zeros((len(next_words), vocab_size), dtype=bool)
for row, (context, target) in enumerate(zip(prev_words, next_words)):
    for position, phrase in enumerate(context):
        X[row, position, unique_word_index[phrase]] = 1
    Y[row, unique_word_index[target]] = 1

# Single LSTM layer followed by a softmax over the whole vocabulary.
model = Sequential([
    LSTM(128, input_shape=(WORD_LENGTH, vocab_size)),
    Dense(vocab_size),
    Activation('softmax'),
])

optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
training_run = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=10, shuffle=True)
# Only the per-epoch metrics are kept.
history = training_run.history


['the opportunity', 'fresh, new, crisp learning', 'still pitch black outside', 'to', 'the “bacon mentality']
oh, bacon:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Persist the trained network and its training history, then read both back.
model.save('keras_next_word_model.h5')
# `with` closes the file handle; a bare open() inside the call leaves it to the garbage collector.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

model = load_model('keras_next_word_model.h5')
# NOTE: unpickling can execute arbitrary code — only load a history file written by this notebook.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

In [None]:
def prepare_input(text):
  """One-hot encode `text` as a single input sample for the prediction model.

  The text is split on whitespace. Only the last WORD_LENGTH tokens are used,
  because the array has exactly WORD_LENGTH time steps. Tokens that are not in
  `unique_word_index` are left as all-zero rows instead of raising KeyError.

  Args:
    text: the input sequence (str).

  Returns:
    A float array of shape (1, WORD_LENGTH, len(unique_words)).
  """
  x = np.zeros((1, WORD_LENGTH, len(unique_words)))
  tokens = text.split()
  # Keep the most recent tokens; indexing past WORD_LENGTH would raise IndexError.
  tokens = tokens[max(len(tokens) - WORD_LENGTH, 0):]
  for t, word in enumerate(tokens):
    index = unique_word_index.get(word)
    if index is not None:
      x[0, t, index] = 1
  return x
#prepare_input("It is not a lack".lower())

In [None]:
def sample(preds, top_n=3):
    """Return the indices of the `top_n` largest entries of `preds`, best first.

    Args:
        preds: a sequence of scores (e.g. the softmax output of the model).
        top_n: how many indices to return; if it exceeds len(preds), all
            indices are returned.

    Returns:
        A list of integer positions into `preds`, ordered from highest to
        lowest score.
    """
    preds = np.asarray(preds).astype('float64')
    # Rank the scores directly. Taking log, then exp, then dividing by the sum only
    # rescales every entry by the same positive constant, so it cannot change the
    # ranking — but log(0) emits a divide-by-zero RuntimeWarning for zero scores.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [None]:
def predict_completions(text, n=3):
    """Suggest the `n` most likely next phrases for `text`.

    The text is encoded with `prepare_input`, scored by the global `model`,
    and the `n` best vocabulary indices chosen by `sample` are mapped back to
    their entries in `unique_words`.

    Returns:
        A list of `n` vocabulary entries, or the string "0" when `text` is empty.
    """
    if text != "":
        encoded = prepare_input(text)
        # predict() returns one row per input sample; there is exactly one sample here
        scores = model.predict(encoded, verbose=0)[0]
        return [unique_words[position] for position in sample(scores, n)]
    return("0")

In [None]:

# Demo: ask the trained model for the 4 most likely phrases to follow `seq`.
seq='my house'
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 4))

Sequence:  my house
my
house
next possible words:  ['and', 'the end', 'will', 'have']


**STEP 5**- Fine tuning word embedding (GloVe or Word2Vec)

In [None]:


def get_noun(t):
  """Collect every noun of a text, in order of appearance.

  The text is lower-cased and split into paragraphs on newlines. Each
  paragraph is parsed with the global `nlp` pipeline, every sentence is parsed
  again on its own, and each token whose part-of-speech tag is 'NOUN' is kept.

  Args:
    t: the raw text (str).

  Returns:
    A list of noun strings (duplicates are kept).
  """
  noun = []
  for para in t.lower().split('\n'):
    doc = nlp(para)
    for sentence in doc.sents:
      s = nlp(sentence.text)
      noun.extend(str(token) for token in s if token.pos_ == 'NOUN')
  # Return only after ALL paragraphs were processed; returning from inside the
  # paragraph loop discarded everything after the first paragraph.
  return noun

from gensim.models import Word2Vec

# One list of nouns per row of the combined essay/article table.
# Iterating over the column directly does not depend on the index labels being 0..n-1.
noun_list = [get_noun(text) for text in df_new['Essay Text']]

# The embedding gets its own name: rebinding the global `model` here would replace the
# trained Keras network that predict_completions() reads, so re-running a prediction
# cell after this one would fail.
# NOTE(review): `size` looks like the gensim 3.x keyword (later releases call it
# `vector_size`) — confirm against the installed gensim version.
noun_w2v_model = Word2Vec(noun_list, size=300, window=5, min_count=1, workers=4)

**BONUS STEP**
Use the word-embedding model to promote/demote suggestions from the N-gram model.

In [None]:
from gensim.models import Phrases

# Detect frequent noun pairs and train the embedding on the phrase-merged noun lists.
bigram_transformer = Phrases(noun_list)
# Kept under its own name so the Keras `model` used by predict_completions() stays
# available — the bonus step needs the N-gram model and the embedding side by side.
phrase_w2v_model = Word2Vec(bigram_transformer[noun_list], min_count=1)

def find_sim(word):
  """Return the 5 entries of the embedding vocabulary most similar to `word`.

  Args:
    word: a token that occurs in the embedding vocabulary.

  Returns:
    A list of (token, similarity) tuples, most similar first.
  """
  # Ask for exactly 5 neighbours instead of fetching the default number and slicing.
  return phrase_w2v_model.wv.most_similar(word, topn=5)

find_sim('right')

#This is a bonus step
#Using the word embedding model, filter out the distant/irrelevant suggestions from the N-gram next phrase prediction model.

#HINT-> extract the noun using the POS tag from each of the next phrase predictions (in the ngram model)
#Then find the distance of the NOUNS from the current NOUN in the word embedding model.
#Rank the nouns in terms of distance from each other.
#Retain only the top 5 results.



[('inquiries', 0.34626853466033936),
 ('legislation', 0.31402862071990967),
 ('soul', 0.30660194158554077),
 ('activities', 0.2766473591327667),
 ('pops', 0.2740941643714905)]