In [232]:
import pandas as pd
import numpy as np
import re
import math
import requests
from bs4 import BeautifulSoup

In [233]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import porter
nltk.download('punkt')
nltk.download('stopwords')
import matplotlib.pyplot as plt
import psycopg2 as pc


[nltk_data] Downloading package punkt to /home/lk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Task 1

Group members: Angelina Näsström (nzv947), Daniel Stephensen (fbp131), Kristina Wilke (mlt790), Lauritz Koch (hdg618)

## Task 2

We have used the following procedures: cleaning, tokenizing, removing stopwords and stemming the data. When cleaning the data we made sure of the following: 
1. all letters are in lowercase
2. all urls are written as < URL >
3. all dates are written as < DATE >
4. all emails are written as < EMAIL >
5. all numbers are written as < NUM >
6. all unimportant symbols are removed

Converting all letters to lowercase makes it easier to compare different words. Points 2-5 are useful because they make it possible to count the number of urls, dates, emails and numbers. Replacing them with placeholders also makes sure that they are not treated as ordinary words. Removing unimportant symbols makes sure that these are not treated as words either. 

Tokenization makes processing of the data easier, as it eliminates blank spaces and punctuations etc, making the text more homogeneous. In the tokenization process, we, for example, made all the data lower-case, thus not having two different results when processing 'Hello' and 'hello'.

Removing stopwords is useful because these words do not help give meaning to the documents; in other words, they are noise.

Stemming the data is useful because it makes sure that different variants of the same word are converted into the root of the word. This way two different tokens (the same word with different endings) are treated as the same term, because they carry essentially the same meaning.

To implement task 2 we have used the Pandas library, the nltk library and the re library. The Pandas library has only been used to read the data from the 'news_sample.csv' file. word_tokenize is a function from nltk that tokenizes text. stopwords is a corpus in nltk.corpus that provides lists of stopwords to remove. porter is a module of nltk.stem that provides a stemmer. These three components are useful because you do not need to create your own complex functions to tokenize, remove stopwords and stem the data. We have not used the clean_text library because it did not have all the functionality needed for the task. 

In [307]:
# Load the small labelled sample and the full raw corpus, in that order.
sample_csv_path = 'news_sample.csv'
raw_csv_path = '1mio-raw.csv/1mio-raw.csv'
datasample, data = (pd.read_csv(csv_path) for csv_path in (sample_csv_path, raw_csv_path))

  interactivity=interactivity, compiler=compiler, result=result)


In [308]:
def cleantext(text):
    """Normalise raw article text before tokenisation.

    The input is converted to ``str`` and lower-cased, then e-mail addresses,
    URLs, dates and numbers are replaced by the placeholders ``<EMAIL>``,
    ``<URL>``, ``<DATE>`` and ``<NUM>``. Escaped newlines/tabs and runs of
    whitespace are collapsed to a single space and a fixed set of punctuation
    symbols is removed.

    Parameters
    ----------
    text : any
        Raw text; non-string values (e.g. NaN) are converted with ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text)
    text = text.lower()
    # Strip literal angle brackets first so that the only '<' / '>' left in the
    # result are the ones belonging to the placeholders inserted below.
    text = re.sub(r'<|>', "", text)
    # E-mails must be replaced BEFORE URLs: the URL pattern also matches the
    # "example.com" part of an address and would leave "john@<URL>", which the
    # e-mail pattern can then no longer recognise. The pattern accepts dotted
    # local parts and TLDs of any length.
    text = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', "<EMAIL>", text)
    text = re.sub(r'(https?:\/\/)?w{0,3}\.?[a-z]+\.[a-z]\w*[\w\/-]*', "<URL>", text)
    # \b keeps month names from matching inside longer words ("dismay 5").
    # The dot after "jun"/"jul" is optional, like for every other month.
    text = re.sub(r'\b(jan\.?(uary)?|feb\.?(uary)?|mar\.?(ch)?|apr\.?(il)?|may|jun\.?(e)?|jul\.?(y)?|aug\.?(ust)?|sep\.?(tember)?|oct\.?(ober)?|nov\.?(ember)?|dec\.?(ember)?|monday|tuesday|wednesday|thursday|friday|saturday|sunday) (the )?\d{1,2}((th)?,?( \d{4})?)?', "<DATE>", text)
    text = re.sub(r'[0-9]+', "<NUM>", text)
    # Literal "\n" / "\t" escape sequences and whitespace runs become one space.
    text = re.sub(r'(\\n)+|\s{2,}|(\\t+)', " ", text)
    text = re.sub(r'\.|,|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=', "", text)
    return text

def cleanMetaKeywords(text):
    """Normalise a meta-keywords cell.

    The value is converted to ``str`` and lower-cased; angle brackets are
    removed, escaped newlines/tabs and whitespace runs become a single space,
    and a fixed set of punctuation symbols (including apostrophes and square
    brackets, but not commas) is stripped. Returns the cleaned string.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'<|>', ""),
        (r'(\\n)+|\s{2,}|(\\t+)', " "),
        (r'\.|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|\'|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=', ""),
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


Below is a three-part job consisting of:
1) Cleaning data
2) tokenization 
3) Stemming

The processed tokens are saved in the 'keywords' column. Note that the 'content' column is overwritten with the cleaned (but not tokenized or stemmed) text.


michael hayden, sthole countries, daca, haiti, el salvador, africa
Homeland Security, Trump Administration, Immigration, Media Bias, ISIS/Islamic State, Gun Rights, Donald Trump, Russia


In [309]:
#Setup
stop_words = stopwords.words('english')
stemmer = porter.PorterStemmer()
# Set copy of the stopword list: membership tests run once per token.
stop_word_lookup = set(stop_words)

# Clean: the 'content' column is replaced by its cleaned text.
data["content"] = [cleantext(i) for i in data["content"]]
# Tokenize: one list of tokens per article.
clean_data_tokens = [word_tokenize(i) for i in data["content"]]
# Stopword removal must filter the tokens INSIDE each article. Testing a whole
# token list against the stopword list is never true and removes nothing.
data["keywords"] = [
    [token for token in tokens if token not in stop_word_lookup]
    for tokens in clean_data_tokens
]
# Stem: each article keeps the set of its distinct stems.
data["keywords"] = data["keywords"].apply(lambda tokens: set(stemmer.stem(token) for token in tokens))

# Meta keywords are only cleaned, not tokenized or stemmed.
data["meta_keywords"] = [cleanMetaKeywords(i) for i in data["meta_keywords"]]

In [296]:
# Turn the comma-separated 'tags', 'authors' and 'meta_keywords' cells into
# Python lists so that the columns can be exploded into one row per entry.
def _split_list_field(value):
    """Split a comma-separated cell into a list of stripped, non-empty entries.

    Missing values (NaN / None) give an empty list instead of the fake entry
    "nan". A value that is already a list, tuple or set is only re-cleaned,
    so re-running this cell does not corrupt the columns.
    """
    if isinstance(value, (list, tuple, set)):
        parts = value
    elif pd.isna(value):
        parts = []
    else:
        parts = str(value).split(",")
    # Strip surrounding whitespace so " daca" and "daca" are the same entry,
    # and drop entries that are empty after stripping.
    stripped = (str(part).strip() for part in parts)
    return [part for part in stripped if part]

data["tags"] = [_split_list_field(i) for i in data["tags"]]
data["authors"] = [_split_list_field(i) for i in data["authors"]]
data["meta_keywords"] = [_split_list_field(i) for i in data["meta_keywords"]]


0                                                     []
1                                                     []
2                                                     []
3                                                     []
4                                                     []
                             ...                        
245                                                   []
246                                                   []
247    [michael hayden,  sthole countries,  daca,  ha...
248    [antonio sabato jr,  oprah winfrey,  president...
249    [bill clinton,  myanmar,  calls,  release,  re...
Name: meta_keywords, Length: 250, dtype: object

In [305]:
#Relation tables
def _relation(column):
    """Return (id, column) pairs with one row per element of the list-like column."""
    return data[['id', column]].explode(column)

# 'domain' holds one value per article, so it needs no exploding.
owns_relation_table = data[['id', 'domain']]
article_tags_relation_table = _relation('tags')
authors_of_article_table = _relation('authors')
article_keywords_relation_table = _relation('keywords')
meta_article_keywords_relation_table = _relation('meta_keywords')


In [304]:
#Entity tables
def _unique_values(column):
    """Return the distinct, non-missing values of ``data[column]`` as a one-column DataFrame.

    List-like cells are exploded to one value per row first. Values keep their
    order of first appearance, so the exported table is the same on every run.
    """
    # Series.explode takes no column name (its only parameter is ignore_index),
    # and one call is enough to flatten a column of lists or sets.
    values = data[column].explode()
    # drop_duplicates replaces the former set() round trip, which pandas
    # rejects as DataFrame input because a set is unordered.
    return values.dropna().drop_duplicates().reset_index(drop=True).to_frame()

articles_table = data[['id','content','type','url','scraped_at','inserted_at','updated_at','meta_description','title']]
keywords_table = _unique_values('keywords')
meta_keywords_table = _unique_values('meta_keywords')
tags_table = _unique_values('tags')
domain_table = _unique_values('domain')
authors_table = _unique_values('authors')



In [306]:
# Every table is written without index and without header row.
# Entities come first, then relations.
csv_exports = [
    #entities to CSV
    (articles_table, "SQLtables/articles_table.csv"),
    (keywords_table, "SQLtables/keywords_table.csv"),
    (meta_keywords_table, "SQLtables/meta_keywords_table.csv"),
    (tags_table, "SQLtables/tags_table.csv"),
    (authors_table, "SQLtables/authors_table.csv"),
    (domain_table, "SQLtables/domain_table.csv"),
    #relations to CSV
    (owns_relation_table, "SQLtables/owns_table.csv"),
    (authors_of_article_table, "SQLtables/authors_of_article.csv"),
    (article_tags_relation_table, "SQLtables/article_tags_relation_table.csv"),
    (article_keywords_relation_table, "SQLtables/article_keywords_relation_table.csv"),
    (meta_article_keywords_relation_table, "SQLtables/meta_article_keywords_relation_table.csv"),
]
for table, csv_path in csv_exports:
    table.to_csv(csv_path, index=False, header=False)


In [66]:
# The database password is deliberately not stored in the notebook. libpq (used
# by psycopg2) picks it up from the PGPASSWORD environment variable or from a
# ~/.pgpass entry, so set one of those before running this cell.
conn = pc.connect(dbname="datascience", user="lk")
cur = conn.cursor()

In [67]:
##
#cur.execute("delete from article")
#cur.execute("delete from authors")
#cur.execute("delete from authors_of")
#cur.execute("delete from domain")
#cur.execute("delete from owns")

In [163]:
# Stream the exported CSV through the client connection (COPY ... FROM STDIN)
# instead of asking the server to open a machine-specific absolute path. The
# relative path is the file the export cell writes for the authors relation.
with open("SQLtables/authors_of_article.csv", encoding="utf-8") as csv_file:
    cur.copy_expert("COPY authors_of FROM STDIN WITH (FORMAT csv)", csv_file)
# psycopg2 runs statements inside a transaction; without a commit the copied
# rows are discarded when the connection closes.
conn.commit()
cur.execute("Select * from keywords")


In [None]:

#create Stemmed StopWorded vocab
#stsw_vocab = set(stemmed_tokens)
#stsw_vocab.remove('<')
#stsw_vocab.remove('>')
#print(stsw_vocab)

## Task 3

non-trivial observation 1: What percentage of the articles containing the word "trump" are fake news?

In [None]:
# Reuse the tokens produced by the preprocessing cell. The name `clean_data`
# that this cell used to tokenize is not defined anywhere in the notebook.
tokenized_articles = clean_data_tokens
articles_vocabulary = [set(i) for i in tokenized_articles]
# Row positions of the articles whose vocabulary contains "trump".
trump_included = [i for i, vocabulary in enumerate(articles_vocabulary) if "trump" in vocabulary]

# Look up the type of each trump article by ITS OWN row position. Indexing with
# a plain counter 0..len(trump_included)-1 would inspect the first articles of
# the corpus instead of the ones that mention "trump".
article_types = data['type']
trump_fake_news = sum(1 for i in trump_included if article_types.iloc[i] == "fake")

if trump_included:
    print(int(trump_fake_news*100/len(trump_included)),"% of articles where the name 'trump' is present, is a fake news article")
else:
    # Avoid a division by zero when no article mentions the word.
    print("No article contains the word 'trump'")

non-trivial observation 2: Are the articles spread reasonably evenly across the domains?

non-trivial observation 3: Is there a link between which domain an article comes from and if it is fake news?

In [None]:
# Per-domain article counts and per-domain fake-news counts.
domainList = data['domain']
TypeList = data['type']
domains = set(domainList)
# Missing domains are left out; sorting gives the arrays a stable order.
domain_names = sorted((name for name in domains if pd.notna(name)), key=str)

# value_counts replaces the nested per-article / per-domain Python loops.
articles_per_domain = domainList.value_counts()
fake_per_domain = domainList[TypeList == 'fake'].value_counts()
# Both arrays are aligned with domain_names; domains without fake news get 0.
totalDomainScore = articles_per_domain.reindex(domain_names, fill_value=0).to_numpy(dtype=float)
fakeDomainScore = fake_per_domain.reindex(domain_names, fill_value=0).to_numpy(dtype=float)

print("Each of the", len(domain_names), "domains present in the corpus has the following amount of articles in the corpus:\n", totalDomainScore)
print("\nEach of the", len(domain_names), "domains present in the corpus has the following amount of fake news articles:\n", fakeDomainScore)

if fakeDomainScore.sum() == 0:
    # Nothing to single out, and the percentages below would divide by zero.
    print("\nNo fake news articles were found in the corpus")
else:
    # Pick the domain with the most fake articles from the data itself instead
    # of searching for a hard-coded count that only fits one dataset.
    top_index = int(np.argmax(fakeDomainScore))
    top_domain = domain_names[top_index]
    top_total = totalDomainScore[top_index]
    top_fake = fakeDomainScore[top_index]
    corpus_total = totalDomainScore.sum()
    fake_elsewhere = int(fakeDomainScore.sum() - top_fake)

    print("\nThis means that", top_domain, "has", int(top_total), "of the articles in the corpus and", int(top_total*100/corpus_total), "% of all articles. An even spread would give each domain about", int(100/len(domain_names)), "% of the articles")
    if fake_elsewhere == 0:
        print("\nAlso,", int(top_fake*100/top_total), "% of", top_domain + "'s articles are fake news and no other domain has fake news in its articles. Thus, there is a link between which domain an article comes from and if it is fake news (The link is probably a little to big)")
    else:
        print("\nAlso,", int(top_fake*100/top_total), "% of", top_domain + "'s articles are fake news, while", fake_elsewhere, "fake news articles come from other domains")

non-trivial observation 4: How many articles have a missing author value? 

non-trivial observation 5: How much does a missing author increase the likelihood that an article is fake news? 

In [None]:
def _has_author(value):
    """Return True when ``value`` names at least one author.

    Accepts both shapes the 'authors' column takes in this notebook: the raw
    cell (a string, or NaN when missing) and the list produced by splitting
    the cell on commas (where a missing value shows up as "nan" or as no
    entries at all).
    """
    if isinstance(value, str):
        return value.strip().lower() not in ("", "nan")
    if isinstance(value, (list, tuple, set)):
        return any(_has_author(entry) for entry in value)
    return False


def _percent(part, whole):
    """Integer percentage of ``part`` in ``whole``; 0 when ``whole`` is 0."""
    return int(part*100/whole) if whole else 0


authors = [i for i in data["authors"]]
no_author_counter = 0
no_author_fake_news = 0
author_fake_news = 0

for author, article_type in zip(authors, data["type"]):
    is_fake = article_type == "fake"
    if not _has_author(author):
        no_author_counter += 1
        no_author_fake_news += is_fake
    else:
        author_fake_news += is_fake

no_author_total = no_author_counter
author_total = len(authors) - no_author_total

no_author_fake_share = _percent(no_author_fake_news, no_author_total)
author_fake_share = _percent(author_fake_news, author_total)

print(_percent(no_author_counter, len(authors)), "% of the articles does not have an author")
print(no_author_fake_share,'% of the no-author articles are fake news')
# This figure covers only the articles that DO have an author.
print(author_fake_share,'% of the articles with an author are fake news')
# Report the measured difference rather than a number typed in by hand.
print('Thus we see, that having no author on an article changes the likelihood of it being fake by', no_author_fake_share - author_fake_share, 'percent points')

## Task 4

We have the following 'article start letters'

In [None]:
# The ten start letters assigned to this group: a window of the 23-letter
# alphabet (no Q, X, Y) that wraps around, starting at the group number.
group_nr = 1
letter_ring = "ABCDEFGHIJKLMNOPRSTUVWZ" * 2
window_start = group_nr % 23
article_start_letters = letter_ring[window_start:window_start + 10]
print(article_start_letters)

Global variables:

In [None]:
# State shared with the category crawl loop.
# URL of the category page to fetch next; starts at the first page.
next_page = 'https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts'

# A "next page" link is only followed while its pagefrom= title starts with A-K.
continue_iterations = re.compile(r"pagefrom=[A-K]")

# Two independent lists: collected article URLs, and the latest regex matches.
article_links, first_letter_between_B_K = [], []

# Loop flag (0 = keep crawling) and a counter, both starting at zero.
stop_searching = count = 0

In [None]:
# Walk the category listing page by page, collecting links to the articles
# whose title starts with one of this group's letters.
first_iteration = True

while stop_searching == 0:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(next_page, timeout=30)
    response.raise_for_status()
    contents = response.content
    soup = BeautifulSoup(contents, 'html.parser')
    articles = soup.find(id="mw-pages")
    if articles is None:
        raise RuntimeError("No element with id 'mw-pages' found on " + next_page)

    # Anchors without an href cannot be followed or inspected, so skip them.
    links = [href for href in (link.get("href") for link in articles.find_all('a')) if href]

    # The code relies on link order: the first page starts with its
    # "next page" link, while later pages have one extra link before it.
    next_link_position = 0 if first_iteration else 1
    first_iteration = False

    if len(links) <= next_link_position:
        # No navigation link left to follow.
        stop_searching = 1
        break

    next_link = links[next_link_position]
    next_page = 'https://en.wikinews.org' + next_link
    # Character 6 of an href is tested against the start letters (the first
    # character after a 6-character prefix such as "/wiki/"). Shorter hrefs
    # cannot be article links and are skipped.
    article_links += [
        'https://en.wikinews.org' + group_link
        for group_link in links[next_link_position + 1:]
        if len(group_link) > 6 and group_link[6] in article_start_letters
    ]

    # Stop once the next page would start after the last letter of interest.
    first_letter_between_B_K = continue_iterations.findall(next_link)
    if len(first_letter_between_B_K) == 0:
        stop_searching = 1

    first_letter_between_B_K = []

The following box takes about 20 minutes to run...

In [None]:
# Download and parse every collected article. The list is filled one page at
# a time, so pages already fetched survive if a later request fails.
article_source_code = []
# One Session reuses the HTTP connection across all requests to the same host.
with requests.Session() as session:
    for article in article_links:
        # The timeout keeps one stalled request from blocking the whole run.
        response = session.get(article, timeout=30)
        article_source_code.append(BeautifulSoup(response.content, 'html.parser'))

In [None]:
def _article_title(article):
    """Text of the first <h1> element, or "" when the page has none."""
    heading = article.find('h1')
    return heading.get_text() if heading is not None else ""


def _article_release_date(article):
    """Ten-character release date taken from the element with id "publishDate".

    Returns "" when the page has no such element.
    """
    date_tag = article.find(id="publishDate")
    if date_tag is None:
        return ""
    # NOTE(review): presumably the date is stored in the tag's `title`
    # attribute (the fixed [50:60] slice lines up with it) - confirm against a
    # live page. The slice is kept as a fallback for tags without a title.
    title = date_tag.get("title")
    return title[:10] if title else str(date_tag)[50:60]


def _article_sources(article):
    """Comma-separated hrefs of the external links in the page's first <ul>."""
    first_list = article.find('ul')
    if first_list is None:
        return ""
    anchors = first_list.find_all('a', rel = 'nofollow', class_ ='external text')
    return ", ".join(anchor.get('href') for anchor in anchors if anchor.get('href'))


article_titles = [_article_title(article) for article in article_source_code]
article_release_date = [_article_release_date(article) for article in article_source_code]
article_sources = [_article_sources(article) for article in article_source_code]

In [None]:
# Collect the scraped fields into one table, one row per article.
pd_articles = pd.DataFrame({
    'Titles': article_titles,
    'Release_Date': article_release_date,
    'Sources': article_sources,
})

pd_articles