In [5]:
import pandas as pd
import numpy as np
import re
import math
import requests
from bs4 import BeautifulSoup

In [6]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import porter
nltk.download('punkt')
nltk.download('stopwords')
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /home/lk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Task 1

Group members: Angelina Näsström (nzv947), Daniel Stephensen (fbp131), Kristina Wilke (mlt790), Lauritz Koch (hdg618)

## Task 2

We have used the following procedures: cleaning, tokenizing, removing stopwords and stemming the data. When cleaning the data we made sure of the following: 
1. all letters are in lowercase
2. all urls are written as < URL >
3. all dates are written as < DATE >
4. all emails are written as < EMAIL >
5. all numbers are written as < NUM >
6. all unimportant symbols are removed

Converting all letters to lowercase makes it easier to compare different words. Point 2-5 are useful because it makes it possible to count the number of urls, dates, emails and numbers. Also, removing these makes sure that they are not treated as words. Removing unimportant symbols makes sure that these are not treated as words. 

Tokenization makes processing of the data easier, as it eliminates blank spaces and punctuations etc, making the text more homogeneous. In the tokenization process, we, for example, made all the data lower-case, thus not having two different results when processing 'Hello' and 'hello'.

Removing stopwords is useful because these words do not help giving meaning to the documents, in other words they are noise.

Stemming the data is useful because it makes sure that different variants of the same word is converted into the rood of the word. This way it is possible to make sure that two different words (same word with different endings) are understood the same way, because they actually have the exact same meaning.

Implementing task 2 we have used the Pandas library, nltk library and re library. The Pandas library has just been used to read the data from the 'news_sample.csv' file. word_tokenize is a sublibrary of nltk that has some useful functions for tokenizing. stopwords is a sublibrary of nltk.corpus that has some useful functions for removing stopwords. porter is a sublibrary of nltk.stem that has some useful functions for stemming data. These three sublibraries are useful because you do not need to create your own complex functions to tokenize, remove stopwords and stem the data. We have not used the clean_text library because we it did not have all the functionality needed for the task. 

In [7]:
data = pd.read_csv('news_sample.csv')

In [8]:
def cleantext(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'<|>', "", text)
    text = re.sub(r'(https?:\/\/)?w{0,3}\.?[a-z]+\.[a-z]\w*[\w\/-]*', "<URL>", text)
    text = re.sub(r'(jan\.?(uary)?|feb\.?(uary)?|mar\.?(ch)?|apr\.?(il)?|may|jun\.(e)?|jul\.(y)?|aug\.?(ust)?|sep\.?(tember)?|oct\.?(ober)?|nov\.?(ember)?|dec\.?(ember)?|monday|tuesday|wednesday|thursday|friday|saturday|sunday) (the )?\d{1,2}((th)?,?( \d{4})?)?', "<DATE>", text)
    text = re.sub(r'\w+@\w+\.[a-zA-Z]{2,3}', "<EMAIL>", text)
    text = re.sub(r'[0-9]+', "<NUM>", text)
    text = re.sub(r'(\\n)+|\s{2,}|(\\t+)', " ", text)
    text = re.sub(r'\.|,|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=', "", text)
    return text

clean_data = [cleantext(i) for i in data["content"]]
string_of_contents = " ".join(clean_data)
#print(string_of_contents)

In [9]:
#Below should be replaced with actual data from after Cleaning
cleaned_data_SAMPLE_FOR_CODING_PURPOSES = string_of_contents

#Create tokens based on clean_data. cl = clean, da = data
clda_tokens = word_tokenize(cleaned_data_SAMPLE_FOR_CODING_PURPOSES)

#print((clda_tokens))

In [10]:
#StopWordsCLeanDAta_tokens
stop_words = stopwords.words('english')
swclda_tokens = [word for word in clda_tokens if not word in stop_words]

In [11]:
stemmer = porter.PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in swclda_tokens]

#create Stemmed StopWorded vocab
stsw_vocab = set(stemmed_tokens)
stsw_vocab.remove('<')
stsw_vocab.remove('>')
#print(stsw_vocab)

## Task 3

non-trivial observation 1: How many percent of articles with the word "trump" in it is fake news?

In [12]:
tokenized_articles = [word_tokenize(i) for i in clean_data]
articles_vocabulary = [set(i) for i in tokenized_articles]
trump_included = [i for i in range(len(articles_vocabulary)) if "trump" in articles_vocabulary[i]]
trump_fake_news = 0

for i in range(len(trump_included)):
    if data['type'][i] == "fake":
        trump_fake_news += 1

print(int(trump_fake_news*100/len(trump_included)),"% of articles where the name 'trump' is present, is a fake news article")

61 % of articles where the name 'trump' is present, is a fake news article


non-trivial observation 2: Is the number of articles spread out tolerably evenly between the domains?

non-trivial observation 3: Is there a link between which domain an article comes from and if it is fake news?

In [13]:
tokenized_articles = [word_tokenize(i) for i in clean_data]
articles_vocabulary = [set(i) for i in tokenized_articles]
#Missing author corellation

domainList = data['domain']
TypeList = data['type']
domains = set(domainList)
fakeDomainScore = np.zeros(len(domains))
totalDomainScore = np.zeros(len(domains)) 
for i in range (len(domainList)):
    if (data['type'][i] == 'fake'):
        index = 0 
        for domain in domains:
            if  data['domain'][i] == domain:
                fakeDomainScore[index] += 1
            index+=1
    index = 0 
    for domain in domains:
        if  data['domain'][i] == domain:
            totalDomainScore[index] += 1
        index+=1
print("Each of the 29 domains present in the corpus has the following amount of articles in the corpus:\n", totalDomainScore)
print("\nEach of the 29 domains present in the corpus has the following amount of fake news articles:\n", fakeDomainScore)
print("\nThis means that Beforeitsnews.com has", int(totalDomainScore[np.where(fakeDomainScore == 155)]),"of the articles in the corpus and", int(fakeDomainScore[np.where(fakeDomainScore == 155)]*100/sum(totalDomainScore)), "% of all articles. Thus, the number of articles in the corpus are very unevenly spreed between the domains")
print("\nAlso,", int(fakeDomainScore[np.where(fakeDomainScore == 155)]*100/totalDomainScore[np.where(fakeDomainScore == 155)]), "% of Beforeitsnews.com's articles are fake news and no other domain has fake news in its articles. Thus, there is a link between which domain an article comes from and if it is fake news (The link is probably a little to big)")

Each of the 29 domains present in the corpus has the following amount of articles in the corpus:
 [  1.   1.   2.   2.   2.   6.   1.   1.   1.   1.   2.   1.   1.   1.
   4.   1.   4.   7.  17.   3.   5.  24.   2. 155.   1.   1.   1.   1.
   1.]

Each of the 29 domains present in the corpus has the following amount of fake news articles:
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0. 155.   0.   0.   0.   0.
   0.]

This means that Beforeitsnews.com has 155 of the articles in the corpus and 62 % of all articles. Thus, the number of articles in the corpus are very unevenly spreed between the domains

Also, 100 % of Beforeitsnews.com's articles are fake news and no other domain has fake news in its articles. Thus, there is a link between which domain an article comes from and if it is fake news (The link is probably a little to big)


non-trivial observation 4: How many articles have missing author value? 

non-trivial observation 5: How much does missing author increase the likelihood that an article is fake news? 

In [14]:
authors = [i for i in data["authors"]]
no_author_counter = 0
no_author_fake_news = 0
no_author_total = 0
author_fake_news = 0
author_total = 0

for i in range(len(authors)):
    if not type(authors[i]) == str:
        no_author_counter += 1
        if data["type"][i] == "fake":
            no_author_fake_news += 1
        no_author_total += 1
    elif data["type"][i] == "fake":
        author_fake_news += 1
        author_total += 1
    else: 
        author_total += 1

print(int(no_author_counter*100/len(authors)), "% of the articles does not have an author")
print(int(no_author_fake_news*100/no_author_total),'% of the no-author articles are fake news')
print(int(author_fake_news*100/author_total),'% of the articles are fake news')
print('Thus we see, that having no author on an article only adds two percent points to the likelihood of it being fake')

32 % of the articles does not have an author
63 % of the no-author articles are fake news
61 % of the articles are fake news
Thus we see, that having no author on an article only adds two percent points to the likelihood of it being fake


In [15]:
response = requests.get('https://en.wikinews.org/wiki/Category:Politics_and_conflicts')
contents = response.text
print(contents)

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Category:Politics and conflicts - Wikinews, the free news source</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YEDiqxShFTi77Cg@rSFDqAAAAEg","wgCSPNonce":!1,"wgCanonicalNamespace":"Category","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":14,"wgPageName":"Category:Politics_and_conflicts","wgTitle":"Politics and conflicts","wgCurRevisionId":4225725,"wgRevisionId":4225725,"wgArticleId":1847,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Topic cats with suppressed portal link","Topic cats with existing portal","Topic cats with offset parameter","

In [16]:
soup = BeautifulSoup(contents, 'html.parser')
top_stories = soup.find(id="mw-pages")
top_stories_articles = top_stories.find_all(class_ = 'mw-category')
print (top_stories_articles)

[<div class="mw-category"><div class="mw-category-group"><h3>"</h3>
<ul><li><a href="/wiki/%22Avast_ye_scurvy_file_sharers!%22:_Interview_with_Swedish_Pirate_Party_leader_Rickard_Falkvinge" title='"Avast ye scurvy file sharers!": Interview with Swedish Pirate Party leader Rickard Falkvinge'>"Avast ye scurvy file sharers!": Interview with Swedish Pirate Party leader Rickard Falkvinge</a></li>
<li><a href="/wiki/%22Bigoted_woman%22:_controversial_Gordon_Brown_remarks_caught_on_air" title='"Bigoted woman": controversial Gordon Brown remarks caught on air'>"Bigoted woman": controversial Gordon Brown remarks caught on air</a></li>
<li><a href="/wiki/%22Civil_defence%22_thwarts_Israeli_air_strike_on_Gaza_refugee_camp" title='"Civil defence" thwarts Israeli air strike on Gaza refugee camp'>"Civil defence" thwarts Israeli air strike on Gaza refugee camp</a></li>
<li><a href="/wiki/%22Corrupt%22_New_Zealand_government_to_repay_$768,000" title='"Corrupt" New Zealand government to repay $768,000'

In [17]:
group_nr = 1
article_start_letters = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[group_nr%23:group_nr%23+10]
print(article_start_letters)

BCDEFGHIJK


In [18]:
stop_searching = 0
next_page = 'https://en.wikinews.org/wiki/Category:Politics_and_conflicts'
article_links = []
while stop_searching == 0:
    response = requests.get(next_page)
    contents = response.text
    
    soup = BeautifulSoup(contents, 'html.parser')
    categories = soup.find(id="mw-pages")
    all_categories = categories.find_all(class_ = 'mw-category-group')
    
    title_regex = re.compile('<a.*?title=[BCDEFGHIJK].*?<\/a>')
    group_1_categories = title_regex.findall(str(all_categories))

    link_regex = re.compile('\/wiki[^"]*')
    group1_category_links = ["".join(["https://en.wikinews.org/",link_regex.findall(i)[0]]) for i in group_1_categories]
    
    next_page_regex = re.compile('/w/index.php?title=Category:Politics_and_conflicts&(pagefrom|pageuntil)=[A-K].*?#mw-pages')
    link_ending = next_page_regex.findall(str(categories))
    
    if len(link_ending) == 0:
        stop_searching = 1
    else:
        next_page = "".join("https://en.wikinews.org/", link_ending)
print (article_links)

[]


In [90]:
#regexes to find 'next page' link
link_regex_raw = re.compile(r"<a href=\"/w/index.*?next page?</a>")#Locate the link to the next page
link_regex_clean= re.compile(r"\/w\/index.*?#mw-pages")#From above location, clean link such that we can use it in request.get
actual_link_head = "https://en.wikinews.org" 

#while(True):
#    link = 
#

response = requests.get('https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts&amp%3Bpagefrom=Afghan+protestors+shot+after+mosque+raid+sparks+anger&pagefrom=British+newspaper+makes+House+of+Lords+bribery+claims#mw-pages')
contents = response.text
soup = BeautifulSoup(contents, 'html.parser')
stories = soup.find(id="mw-pages")
dynamic_link_raw = link_regex_raw.findall(str(stories))
dynamic_link_clean = link_regex_clean.findall(str(dynamic_link_raw[0]))
str_link_clean = dynamic_link_clean[0]
url = actual_link_head + str_link_clean
print (url)

for i in range(3):
    print(url)
    response = requests.get(url)#First time we open the next page
    contents = response.text
   # print(response.text)
    soup = BeautifulSoup(contents,'html.parser') 
    stories = soup.find(id="mw-pages") 
   # print("\n\n"+str(stories))
    dynamic_link_raw = link_regex_raw.findall(str(stories))
   # print(dynamic_link_raw)
    dynamic_link_clean = link_regex_clean.findall(str(dynamic_link_raw[0]))
    str_link_clean = dynamic_link_clean[0]
    url = actual_link_head + str_link_clean
    #print (url)
#stories = 
#print(dynamic_link_raw[0])
#print(dynamic_link_clean[0][0:len(dynamic_link_clean)-len("#mw-pages")-1])#len(dynamic_link_clean)-8])
#print(str_link_clean)
#print(actual_link)
#print(stories)





https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts&amp;amp%3Bpagefrom=Afghan+protestors+shot+after+mosque+raid+sparks+anger&amp;pageuntil=British+newspaper+makes+House+of+Lords+bribery+claims#mw-pages
https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts&amp;amp%3Bpagefrom=Afghan+protestors+shot+after+mosque+raid+sparks+anger&amp;pageuntil=British+newspaper+makes+House+of+Lords+bribery+claims#mw-pages
https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts&amp;amp%3Bamp%3Bpagefrom=Afghan+protestors+shot+after+mosque+raid+sparks+anger&amp;amp%3Bpageuntil=British+newspaper+makes+House+of+Lords+bribery+claims&amp;pagefrom=Afghan+protestors+shot+after+mosque+raid+sparks+anger#mw-pages
https://en.wikinews.org/w/index.php?title=Category:Politics_and_conflicts&amp;amp%3Bamp%3Bamp%3Bpagefrom=Afghan+protestors+shot+after+mosque+raid+sparks+anger&amp;amp%3Bamp%3Bpageuntil=British+newspaper+makes+House+of+Lords+bribery+claims&amp;amp%3B