## Web Scraping to build a Football News Aggregator

### 1. Import the necessary libraries
### 2. Provide the URL of each site's home page for Spanish soccer news
### 3. Request the URL and fetch the response
### 4. Extract the HTML and identify the tags that hold the links to La Liga news articles
### 5. Collect all the links through BeautifulSoup
### 6. Crawl each of the collected links
### 7. Extract each article's title and publication date through BeautifulSoup after inspecting its HTML tags
### 8. Note: the news aggregator keeps working only as long as the HTML designs of these websites don’t change

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Result table: one row (title, date, link) per scraped article, filled in
# by the per-site sections below.
df = pd.DataFrame(columns=('title', 'date', 'link'))

############################################Givemesport.com#######################################################################
# Fetch the Givemesport Spanish-football index page and collect the unique
# article links found under its feature headings, in page order.
gms_url = "http://www.givemesport.com/spain-football"
# A timeout keeps a stalled connection from hanging the whole run.
gms_response = requests.get(gms_url, timeout=30)
gms_html = gms_response.text
gms_soup = BeautifulSoup(gms_html, "lxml")
headlines = []
for headline_tag in gms_soup.find_all('h2', class_='gms-feature-title'):
    anchor = headline_tag.find('a', href=True)
    if anchor is None:
        # Heading without a link: nothing to crawl, so skip it instead of
        # failing on an empty result list.
        continue
    news_link = 'http://www.givemesport.com/' + anchor['href']
    if news_link not in headlines:
        headlines.append(news_link)
#print headlines

# Visit every collected Givemesport link and store its title, publication
# date and URL as a new row of df.
# i is the label of the next free row in df; the later site sections keep
# counting from wherever this loop leaves it.
i = 0
for headline in headlines:
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(headline, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    # The date lives in a <time datetime="..."> inside the article-data paragraph.
    article_data = soup.find('p', class_='gms-article-data')
    time_tag = None
    if article_data is not None:
        time_tag = article_data.find('time', datetime=True)

    if soup.title is None or time_tag is None:
        # Page without a title or a publication date: skip this article
        # rather than aborting the whole aggregation.
        continue

    # Only the first 10 characters of the datetime attribute are kept
    # (the date part, e.g. 2017-01-16 in the sample output).
    df.loc[i] = [soup.title.text,
                 time_tag['datetime'][0:10],
                 headline]
    i += 1
############################################Goal.com############################################################################

# Fetch the Goal.com Spain news archive page and collect the unique article
# links found under its headline tags, in page order.
goal_url = "http://www.goal.com/en-us/news/88/spain/archive/1?ICID=SP_TN_7_4_1"
# A timeout keeps a stalled connection from hanging the whole run.
goal_response = requests.get(goal_url, timeout=30)
goal_html = goal_response.text
goal_soup = BeautifulSoup(goal_html, "lxml")
headlines = []
for headline_tag in goal_soup.find_all('h4', class_='headline'):
    anchor = headline_tag.find('a', href=True)
    if anchor is None:
        # Headline without a link: nothing to crawl, so skip it instead of
        # failing on an empty result list.
        continue
    news_link = 'http://www.goal.com' + anchor['href']
    if news_link not in headlines:
        headlines.append(news_link)
#print headlines

# Visit every collected Goal.com link and append its title, publication date
# and URL to df. This section is best-effort: an article that cannot be
# fetched or parsed is skipped on its own, so one bad page no longer discards
# all the articles that come after it.
for headline in headlines:
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(headline, timeout=30)
    except requests.RequestException:
        # Network failure for this article only; move on to the next link.
        continue
    soup = BeautifulSoup(response.text, "lxml")

    # The date lives in a <time datetime="..."> inside the article body module.
    article_body = soup.find('div', class_='module module-article-body clearfix')
    time_tag = None
    if article_body is not None:
        time_tag = article_body.find('time', datetime=True)

    if soup.title is None or time_tag is None:
        # Page without a title or a publication date: skip it.
        continue

    # Only the first 10 characters of the datetime attribute are kept
    # (the date part, e.g. 2017-01-16 in the sample output).
    df.loc[i] = [soup.title.text,
                 time_tag['datetime'][0:10],
                 headline]
    i += 1

############################################Sportskeeda.com############################################################################
# Fetch the Sportskeeda La Liga page and collect the unique article links
# found in its story wrappers, in page order.
sportskeeda_url = "http://www.sportskeeda.com/go/la-liga"
# A timeout keeps a stalled connection from hanging the whole run.
sportskeeda_response = requests.get(sportskeeda_url, timeout=30)
sportskeeda_html = sportskeeda_response.text
sportskeeda_soup = BeautifulSoup(sportskeeda_html, "lxml")
headlines = []
for headline_tag in sportskeeda_soup.find_all('div', class_='story-wrapper'):
    anchor = headline_tag.find('a', href=True)
    if anchor is None:
        # Story wrapper without a link: nothing to crawl, so skip it instead
        # of failing on an empty result list.
        continue
    news_link = 'http://www.sportskeeda.com' + anchor['href']
    if news_link not in headlines:
        headlines.append(news_link)

# Visit every collected Sportskeeda link and append its title, publication
# date and URL to df. Articles without an article:published_time meta tag
# are skipped.
for headline in headlines:
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(headline, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    published = soup.find("meta", {"property": "article:published_time"}, content=True)
    if published is None or soup.title is None:
        # No publication date (or no title) on this page: leave it out.
        continue

    # Read the date from the tag's content attribute instead of slicing the
    # tag's string form, which only works when content happens to be
    # serialised as the first attribute. Only the first 10 characters are
    # kept (the date part, e.g. 2017-01-16 in the sample output).
    df.loc[i] = [soup.title.text + ' | Sportskeeda.com',
                 published['content'][0:10],
                 headline]
    i += 1
    
#print df.sort_values(by='date', ascending=False).reset_index().drop('index', axis=1)

In [10]:
# Show the aggregated articles, newest date first, as a plain-text table.
# Long cell values are truncated to 50 characters for the preview.
pd.set_option('display.max_colwidth', 50)
# Alternative kept from the original notebook (no truncation):
#pd.set_option('display.max_colwidth', -1)
from IPython.display import display, HTML
# print(...) with a single argument behaves the same as the print statement
# on Python 2 and is also valid on Python 3.
print(df.sort_values(by='date', ascending=False).reset_index(drop=True).to_string(index=False))

title        date                                               link
Transfer news and rumours of the day: 16th Jan...  2017-01-16  http://www.sportskeeda.com/football/transfer-n...
Real Madrid transfer rumour: Los Blancos plot ...  2017-01-16  http://www.sportskeeda.com/football/real-madri...
Pogba: I rejected Barcelona and Real Madrid fo...  2017-01-16  http://www.goal.com/en-us/news/88/spain/2017/0...
Messi and Barcelona need each other - Maschera...  2017-01-16  http://www.goal.com/en-us/news/88/spain/2017/0...
Top 5 players who have scored against most clu...  2017-01-16  http://www.sportskeeda.com/football/top-5-play...
10 greatest Barcelona players of all time | Sp...  2017-01-16  http://www.sportskeeda.com/football/lionel-mes...
10 most expensive players in the transfer mark...  2017-01-16  http://www.sportskeeda.com/football/10-most-ex...
10 great players who were relegated  | Sportsk...  2017-01-16  http://www.sportskeeda.com/football/10-great-p...
4 facts you need to know ab

In [5]:
# Articles ordered newest date first, with rows renumbered 0..n-1 so the
# cells below can address them by position.
sortedDF = (
    df.sort_values(by='date', ascending=False)
      .reset_index(drop=True)
)

In [6]:
#print sortedDF

In [8]:
# Append every article of sortedDF to NewsAggregator.txt as a four-line
# entry (header, title, date, link) followed by a blank line.
import io

length = len(sortedDF)
# Open the file once for the whole batch instead of once per article.
# io.open with an explicit encoding writes UTF-8 text on both Python 2 and
# Python 3, so no manual .encode() / bytes concatenation is needed.
# Append mode is kept: repeated runs add to the existing file.
with io.open("NewsAggregator.txt", "a", encoding="utf-8") as myfile:
    for i, row in enumerate(sortedDF.itertuples(index=False)):
        myfile.write(u'##Article {0}##\n{1}\n{2}\n{3}\n\n'.format(
            i + 1, row.title, row.date, row.link))

In [9]:
# Print every article of sortedDF as a four-line entry (header, title, date,
# link) followed by a blank line.
import sys

length = len(sortedDF)
for i, row in enumerate(sortedDF.itertuples(index=False)):
    entry = u'##Article {0}##\n{1}\n{2}\n{3}\n'.format(
        i + 1, row.title, row.date, row.link)
    if sys.version_info[0] == 2:
        # Keep the original Python 2 behaviour of printing UTF-8 encoded bytes.
        print(entry.encode('utf8'))
    else:
        # On Python 3 encoded bytes would print as b'...' reprs and cannot be
        # concatenated with str, so print the text itself.
        print(entry)

##Article 1##
Transfer news and rumours of the day: 16th January, 2017 | Sportskeeda.com
2017-01-16
http://www.sportskeeda.com/football/transfer-news-and-rumours-of-the-day-16th-january-2017/

##Article 2##
Real Madrid transfer rumour: Los Blancos plot Isco-Dele Alli swap deal | Sportskeeda.com
2017-01-16
http://www.sportskeeda.com/football/real-madrid-transfer-rumour-los-blancos-isco-dele-alli/

##Article 3##
Pogba: I rejected Barcelona and Real Madrid for Man Utd - Goal.com
2017-01-16
http://www.goal.com/en-us/news/88/spain/2017/01/16/31601452/pogba-i-rejected-barcelona-and-real-madrid

##Article 4##
Messi and Barcelona need each other - Mascherano - Goal.com
2017-01-16
http://www.goal.com/en-us/news/88/spain/2017/01/16/31590172/messi-and-barcelona-need-each-other-mascherano

##Article 5##
Top 5 players who have scored against most clubs in La Liga | Sportskeeda.com
2017-01-16
http://www.sportskeeda.com/football/top-5-players-who-have-scored-against-most-clubs-la-liga/

##Article 6##