In [1]:
import math
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import psycopg2 as pc
import requests
from bs4 import BeautifulSoup
from IPython import display
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import porter

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielstephensen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielstephensen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Connection settings for the notebook.
# Both values can be overridden through environment variables, so credentials and
# machine-specific paths do not have to be edited into (and saved with) the notebook.
# The fallbacks are the values that used to be hard-coded here.
SQL_database_login = os.environ.get(
    "SQL_DATABASE_LOGIN",
    "dbname=datascience user=postgres password=****",
)
SQLtables_path = os.environ.get(
    "SQLTABLES_PATH",
    "/Users/krist/Desktop/Uni/milestone/DataScienceRep01/SQLtables/",
)

## Function Definitions

In [2]:
def cleantext(text):
    """Normalise raw text into a lowercase string with placeholder tags.

    The input is cast to ``str`` and processed in this order:

    1. lowercase everything;
    2. delete literal ``<`` and ``>`` so that only the placeholders inserted
       below contain angle brackets;
    3. replace e-mail addresses with ``<EMAIL>``;
    4. replace URLs with ``<URL>``;
    5. replace dates (month or weekday name followed by a day number, with an
       optional "th", comma and four-digit year) with ``<DATE>``;
    6. replace every remaining run of digits with ``<NUM>``;
    7. replace escaped newlines/tabs (a backslash followed by ``n`` or ``t``)
       and runs of two or more whitespace characters with one space;
    8. delete punctuation and symbol characters.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text).lower()
    text = re.sub(r'<|>', "", text)
    # E-mails are handled before URLs: the URL pattern also matches the
    # "domain.tld" part of an address, which would leave "name@<URL>" behind
    # and make the e-mail pattern unable to match.
    text = re.sub(r'\w+@\w+\.[a-zA-Z]{2,3}', "<EMAIL>", text)
    text = re.sub(r'(https?:\/\/)?w{0,3}\.?[a-z]+\.[a-z]\w*[\w\/-]*', "<URL>", text)
    # The dot after "jun"/"jul" is optional, like for every other month, so that
    # "june 5" and "jul 4" are recognised as well as "jun. 5".
    text = re.sub(r'(jan\.?(uary)?|feb\.?(uary)?|mar\.?(ch)?|apr\.?(il)?|may|jun\.?(e)?|jul\.?(y)?|aug\.?(ust)?|sep\.?(tember)?|oct\.?(ober)?|nov\.?(ember)?|dec\.?(ember)?|monday|tuesday|wednesday|thursday|friday|saturday|sunday) (the )?\d{1,2}((th)?,?( \d{4})?)?', "<DATE>", text)
    # Dates were tagged first, so only the digits outside of dates are left here.
    text = re.sub(r'[0-9]+', "<NUM>", text)
    text = re.sub(r'(\\n)+|\s{2,}|(\\t+)', " ", text)
    text = re.sub(r'\.|,|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=', "", text)
    return text

def cleanMetaKeywords(text):
    """Lowercase a meta-keywords value and strip symbols from it.

    The value is cast to ``str`` and lowercased. Escaped newlines/tabs (a
    backslash followed by ``n`` or ``t``) and runs of two or more whitespace
    characters are replaced by one space. The punctuation and symbol characters
    listed in the second pattern (including the apostrophe and angle brackets)
    are then deleted. Commas are not in that list and are kept.
    """
    lowered = str(text).lower()
    single_spaced = re.sub(r'(\\n)+|\s{2,}|(\\t+)', " ", lowered)
    return re.sub(r'\.|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|\'|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=|<|>', "", single_spaced)

def tokenize(text):
    """Return the tokens that NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens

def stopword(word_list):
    """Return the entries of ``word_list`` that are not English stop words.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter; the comparison is exact (case-sensitive).

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    # A set gives constant-time membership tests; scanning the stop-word list
    # once per token made the filter needlessly quadratic.
    stop_words = set(stopwords.words('english'))

    return [word for word in word_list if word not in stop_words]

def stemming(word_list):
    """Return a list with the Porter stem of every entry in ``word_list``."""
    stem = porter.PorterStemmer().stem
    return list(map(stem, word_list))

def getSoup(url):
    """Download ``url`` and return its content parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``html.parser`` backend.
    """
    # Fetch the page that was asked for. The previous version read the global
    # ``next_page`` here, so every call returned that page regardless of ``url``.
    response = requests.get(url)
    return BeautifulSoup(response.content, 'html.parser')

def executeSQL(filename, cur):
    """Execute every statement of a SQL script on a database cursor.

    The file is read completely and split on ``;``. Each non-empty fragment is
    stripped of surrounding whitespace and passed to ``cur.execute``.

    Note: the split is purely textual, so a ``;`` inside a string literal or a
    comment of the script would also be treated as a statement separator.

    Parameters
    ----------
    filename : str
        Path of the SQL script.
    cur : cursor-like
        Object with an ``execute(sql)`` method.
    """
    # The context manager closes the file even when reading fails.
    with open(filename, 'r') as sql_file:
        script = sql_file.read()

    for command in script.split(';'):
        command = command.strip()
        # The text after the last ';' (usually just a newline) is not a
        # statement; sending an empty query to the driver would fail.
        if command:
            cur.execute(command)

## Scraping articles from the Wikinews category "Politics and conflicts"

In [3]:
# The group number selects a window of 10 consecutive letters from the 23-letter
# alphabet below; the alphabet is written twice so the window can wrap around.
group_nr = 1
window_start = group_nr % 23
article_start_letters = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[window_start:window_start + 10]
print(article_start_letters)

BCDEFGHIJK


In [None]:
# Domain of Wikinews; it is prefixed to the hrefs collected from the category pages.
root_link = 'https://en.wikinews.org'

# Page that the crawl loop fetches in its next iteration (starts at the category itself).
next_page = root_link + '/w/index.php?title=Category:Politics_and_conflicts'

# Collected article URLs whose title starts with one of 'article_start_letters'.
article_links = []

# Loop control: the crawl ends as soon as this is set to True.
stop_searching = False

# The next-page link sits at a different position on the first page than on the
# following ones, so the loop needs to know whether it is in its first iteration.
first_iteration = True

# Pattern for a next-page link whose 'pagefrom' value starts with a letter from A to K.
continue_iterations = re.compile(r"pagefrom=[A-K]")

# Matches of 'continue_iterations' in the current next-page link;
# an empty list means the crawl has passed the letter K.
first_letter_between_B_K = []

In [None]:
# Walk through the category pages, collecting article links, until the next page
# no longer starts with a letter matched by 'continue_iterations'.
while not stop_searching:
    soup = getSoup(next_page)
    articles = soup.find(id="mw-pages")

    links = [anchor.get("href") for anchor in articles.find_all('a')]

    # The next-page link is links[0] on the first page and links[1] afterwards;
    # everything after it is treated as a candidate article link.
    nav_index = 0 if first_iteration else 1

    first_letter_between_B_K = continue_iterations.findall(links[nav_index])
    first_iteration = False
    next_page = root_link + links[nav_index]

    # Character 6 of the href is used as the article's first letter
    # (presumably the hrefs have the form "/wiki/<Title>" - confirm).
    article_links.extend([
        root_link + group_link
        for group_link in links[nav_index + 1:]
        if group_link[6] in article_start_letters
    ])

    # No match in the next-page link: this was the last page to scan.
    if not first_letter_between_B_K:
        stop_searching = True

    first_letter_between_B_K = []

In [None]:
# Download and parse every collected article page (one HTTP request per link).
article_source_code = list(map(getSoup, article_links))

In [None]:
# Extract one value per article for every column of the final table.

# Ids run from 1 to the number of articles inclusive; the upper bound needs the
# "+ 1", otherwise this column is one element shorter than all the others.
article_id = range(1, len(article_links) + 1)

# The page's first <h1> holds the article title.
article_titles = [article.find('h1').get_text() for article in article_source_code]

# Characters 50-59 of the serialised "publishDate" element are used as the date.
# NOTE(review): presumably that is where the YYYY-MM-DD value sits in the tag's
# markup - confirm against a live page. A missing element yields an empty string.
article_release_date = [str(article.find(id="publishDate"))[50:60] for article in article_source_code]

article_urls = article_links

# Article text: all <p> elements inside the parser output, joined with spaces.
# 'class' is a Python keyword, so Beautiful Soup expects 'class_'; the search
# method is 'find_all' (there is no 'findall').
article_content = [
    " ".join(
        p.get_text()
        for p in article.find(id="mw-content-text").find(class_="mw-parser-output").find_all('p')
    )
    for article in article_source_code
]

# Sources: hrefs of the external "nofollow" links inside the page's first <ul>.
article_sources = [
    ", ".join(
        element.get('href')
        for element in article.find('ul').find_all('a', rel='nofollow', class_='external text')
    )
    for article in article_source_code
]

In [None]:
# Assemble the per-article lists into one table; the dict order fixes the column order.
scraped_articles = pd.DataFrame({
    'id': article_id,
    'content': article_content,
    'title': article_titles,
    'release_date': article_release_date,
    'url': article_urls,
    'sources': article_sources,
})

scraped_articles

In [None]:
# Save the table as CSV without a header row and without the index column.
scraped_articles.to_csv(
    "SQLtables/scraped_articles.csv",
    header=False,
    index=False,
)

In [None]:
# Open a connection to the PostgreSQL server using the connection string in
# SQL_database_login (put your own dbname, user and password there), then
# create a cursor on which the SQL statements are executed.
conn = pc.connect(SQL_database_login)
cur = conn.cursor()

In [None]:
# Execute the statements stored in SQLfiles/createTableScraped.sql on the cursor.
# NOTE(review): judging by its name the script creates the table for the scraped
# articles - the file itself is not shown here, confirm.
executeSQL('SQLfiles/createTableScraped.sql', cur)