# Scraping Data from Inshorts
## Importing libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Defining the seed URLs

In [2]:
# Inshorts category pages to scrape; the last path segment of each URL
# is used downstream as the category label.
seed_urls = [
    "https://inshorts.com/en/read/technology",
    "https://inshorts.com/en/read/sports",
    "https://inshorts.com/en/read/world",
]

## Scrape using Beautiful Soup

In [3]:
def build_dataset(seed_urls, timeout=10):
    """Scrape headline/article pairs from a list of category pages.

    Each URL is fetched and parsed, and every headline card is paired
    positionally with the corresponding content card on the same page.

    Parameters
    ----------
    seed_urls : iterable of str
        Page URLs to scrape. The last non-empty path segment of each URL
        is used as the ``news_category`` value for its rows.
    timeout : float or tuple, optional
        Timeout passed to ``requests.get`` so a stalled connection cannot
        hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article``, ``news_category`` (in
        that order). The frame is empty, but still has these columns, when
        nothing was scraped.

    Raises
    ------
    requests.HTTPError
        If a page responds with a 4xx/5xx status.
    requests.RequestException
        For connection failures and timeouts raised by ``requests.get``.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # rstrip so a trailing slash does not yield an empty category.
        news_category = url.rstrip('/').split('/')[-1]
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of parsing an error page as if it were news.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        headline_cards = soup.find_all('div',
                                       class_=["news-card-title news-right-box"])
        article_cards = soup.find_all('div',
                                      class_=["news-card-content news-right-box"])

        for headline_card, article_card in zip(headline_cards, article_cards):
            headline = headline_card.find('span', attrs={"itemprop": "headline"})
            article = article_card.find('div', attrs={"itemprop": "articleBody"})
            if headline is None or article is None:
                # Card without the expected markup: skip the pair rather
                # than crash on attribute access on None.
                continue
            # get_text() returns a plain str even when the tag contains
            # nested markup, where .string would return None.
            news_data.append({'news_headline': headline.get_text(),
                              'news_article': article.get_text(),
                              'news_category': news_category})

    # Passing columns here keeps the schema stable for an empty result;
    # selecting columns from a column-less empty frame raises KeyError.
    return pd.DataFrame(news_data, columns=columns)

## Building the DataFrame

In [4]:
# Scrape every seed URL into one frame, then preview the first ten rows.
news_df = build_dataset(seed_urls=seed_urls)
news_df.head(n=10)

Unnamed: 0,news_headline,news_article,news_category
0,OPPO Watch series with AMOLED dual-curved disp...,"OPPO Watch series, equipped with OPPO's own Du...",technology
1,OnePlus 8 Series to go on sale during Amazon F...,OnePlus 8 Series will be available during the ...,technology
2,COVID-19 pandemic should end for the rich worl...,Microsoft Co-founder Bill Gates said in an int...,technology
3,Most worthless results of any country in world...,"Microsoft Co-founder Bill Gates claimed it's ""...",technology
4,PM inaugurates submarine Optical Fibre Cable c...,Prime Minister Narendra Modi on Monday inaugur...,technology
5,US tech giants to oppose India's plan to regul...,"A group representing US tech giants Amazon, Fa...",technology
6,Toshiba officially quits the laptop business a...,Tokyo-based Toshiba has quietly exited the lap...,technology
7,Alphabet trains staff to avoid words like 'dom...,Google-parent Alphabet trains employees to avo...,technology
8,Reddit probes attack after forums 'vandalised'...,Reddit is investigating an attack which saw ma...,technology
9,EA shareholders vote to reject the video game ...,American video game company Electronic Arts' s...,technology


## Quick check: articles per category

In [5]:
# Number of scraped articles in each category.
news_df['news_category'].value_counts()

technology    25
world         25
sports        25
Name: news_category, dtype: int64