# Feature extraction

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

Font:

https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/NLP%20Strategy%20I%20-%20Processing%20and%20Understanding%20Text.ipynb

https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/

In [2]:

seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

def build_dataset(seed_urls):
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        
        news_articles = [{'news_headline': headline.find('span', 
                                                         attrs={"itemprop": "headline"}).string,
                          'news_article': article.find('div', 
                                                       attrs={"itemprop": "articleBody"}).string,
                          'news_category': news_category}
                         
                            for headline, article in 
                             zip(soup.find_all('div', 
                                               class_=["news-card-title news-right-box"]),
                                 soup.find_all('div', 
                                               class_=["news-card-content news-right-box"]))
                        ]
        news_data.extend(news_articles)
        
    df =  pd.DataFrame(news_data)
    df = df[['news_headline', 'news_article', 'news_category']]
    return df

In [3]:
news_df = build_dataset(seed_urls)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,Google takes blame for Aadhaar helpline number...,After Aadhaar governing body UIDAI denied aski...,technology
1,Defence Minister gives 11 open challenges to s...,Defence Minister Nirmala Sitharaman launched t...,technology
2,Microsoft once invested $150 million in near-b...,"In 1997, ousted CEO Steve Jobs returned to App...",technology
3,Helpline number in a mobile's contact list can...,"Aadhaar's governing body UIDAI on Sunday said,...",technology
4,Google Maps location sharing to show battery p...,Tech giant Google has started showing users' b...,technology
5,Cybercriminals use fake Income Tax refund SMS ...,A cybercrime racket is under investigation for...,technology
6,Government panel proposes localisation of clou...,A panel working on Indian government's cloud c...,technology
7,Facebook begins testing its dating feature,Two months after announcing its dating feature...,technology
8,Apple's value equals GDP of 16th largest econo...,US consumer technology giant Apple's $1 trilli...,technology
9,Google limits employees’ access to China searc...,Tech giant Google has limited its employees' a...,technology


In [4]:
news_df.news_category.value_counts()

world         25
sports        25
technology    23
Name: news_category, dtype: int64

# Basic feature extraction using text data

## Number of words

In [5]:
def word_count(label, column):
    news_df[label] = news_df[column].apply(lambda x: len(str(x).split(" ")))
    return news_df[[column, label]]

In [6]:
word_count('news_headline_word_count', 'news_headline').head()

Unnamed: 0,news_headline,news_headline_word_count
0,Google takes blame for Aadhaar helpline number...,10
1,Defence Minister gives 11 open challenges to s...,8
2,Microsoft once invested $150 million in near-b...,8
3,Helpline number in a mobile's contact list can...,11
4,Google Maps location sharing to show battery p...,8


In [7]:
word_count('news_article_word_count', 'news_article').head()

Unnamed: 0,news_article,news_article_word_count
0,After Aadhaar governing body UIDAI denied aski...,60
1,Defence Minister Nirmala Sitharaman launched t...,54
2,"In 1997, ousted CEO Steve Jobs returned to App...",60
3,"Aadhaar's governing body UIDAI on Sunday said,...",60
4,Tech giant Google has started showing users' b...,59


## Number of characters

In [8]:
def char_count(label, column):
    news_df[label] = news_df[column].str.len()
    return news_df[[column, label]]

In [9]:
char_count('news_article_char_count', 'news_article').head()

Unnamed: 0,news_article,news_article_char_count
0,After Aadhaar governing body UIDAI denied aski...,397
1,Defence Minister Nirmala Sitharaman launched t...,397
2,"In 1997, ousted CEO Steve Jobs returned to App...",359
3,"Aadhaar's governing body UIDAI on Sunday said,...",357
4,Tech giant Google has started showing users' b...,350


In [10]:
char_count('news_headline_char_count', 'news_headline').head()

Unnamed: 0,news_headline,news_headline_char_count
0,Google takes blame for Aadhaar helpline number...,64
1,Defence Minister gives 11 open challenges to s...,53
2,Microsoft once invested $150 million in near-b...,59
3,Helpline number in a mobile's contact list can...,66
4,Google Maps location sharing to show battery p...,55


## Average word length

In [11]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words )/len(words))

In [12]:
news_df['avg_word'] = news_df['news_article'].apply(lambda x: avg_word(x))
news_df[['news_article', 'avg_word']].head()

Unnamed: 0,news_article,avg_word
0,After Aadhaar governing body UIDAI denied aski...,5.633333
1,Defence Minister Nirmala Sitharaman launched t...,6.37037
2,"In 1997, ousted CEO Steve Jobs returned to App...",5.0
3,"Aadhaar's governing body UIDAI on Sunday said,...",4.966667
4,Tech giant Google has started showing users' b...,5.034483
