In [33]:
import pandas as pd
import numpy as np
import re
import math
import requests
from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import porter
nltk.download('punkt')
nltk.download('stopwords')
import matplotlib.pyplot as plt
import psycopg2 as pc
from IPython import display
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import csv
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielstephensen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielstephensen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importing articles

In [30]:
#Reading the data from 1mio-raw.csv
dataTotal = pd.read_csv('250t-raw.csv')

#We will only analyse a smaller part of the data set
data = dataTotal[0:1000]

In [2]:
SQL_database_login = "dbname=datascience user=postgres password=****"
SQLtables_path = "/Users/krist/Desktop/Uni/milestone/DataScienceRep01/SQLtables/"

## Function Definitions

In [26]:
#cleantext cleans the input string with the following functions: Characters are set to lowercase, 
#urls are substituted with <URL>, dates are substitured with <DATE>, emails are substitured with <EMAIL>
#numbers are substitured with <NUM>, newlines and non-letter characters are removed.
def cleantext(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'<|>', "", text)
    text = re.sub(r'(https?:\/\/)?w{0,3}\.?[a-z]+\.[a-z]\w*[\w\/-]*', "<URL>", text)
    text = re.sub(r'(jan\.?(uary)?|feb\.?(uary)?|mar\.?(ch)?|apr\.?(il)?|may|jun\.(e)?|jul\.(y)?|aug\.?(ust)?|sep\.?(tember)?|oct\.?(ober)?|nov\.?(ember)?|dec\.?(ember)?|monday|tuesday|wednesday|thursday|friday|saturday|sunday) (the )?\d{1,2}((th)?,?( \d{4})?)?', "<DATE>", text)
    text = re.sub(r'\w+@\w+\.[a-zA-Z]{2,3}', "<EMAIL>", text)
    text = re.sub(r'[0-9]+', "<NUM>", text)
    text = re.sub(r'(\\n)+|\s{2,}|(\\t+)', " ", text)
    text = re.sub(r'\.|,|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=', "", text)
    return text

#cleanMetaKeywords cleans the input string with the following functions: 
#Characters are set to lowercase, newlines and non-letter characters are removed.
def cleanMetaKeywords(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'(\\n)+|\s{2,}|(\\t+)', " ", text)
    text = re.sub(r'\.|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|\'|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=|<|>', "", text)
    return text

def tokenize(text):
    return word_tokenize(text)

def stopword(word_list):
    stop_words = stopwords.words('english')
    
    return [word for word in word_list if word not in stop_words]

def stemming(word_list):
    stemmer = porter.PorterStemmer()

    return [stemmer.stem(word) for word in word_list]

def getSoup(url):
    response = requests.get(url)
    contents = response.content
    return BeautifulSoup(contents, 'html.parser')

def executeSQL(filename, cur):
    fd = open(filename, 'r')
    sqlFile = fd.read()
    fd.close()
    sqlCommands = sqlFile.split(';')
    for command in sqlCommands:
            cur.execute(command)



## Scraping data from Politics and Conflict

In [4]:
group_nr = 1
article_start_letters = "ABCDEFGHIJKLMNOPRSTUVWZABCDEFGHIJKLMNOPRSTUVWZ"[group_nr%23:group_nr%23+10]
print(article_start_letters)

BCDEFGHIJK


In [5]:
#The algorithm stops finding new articles when 'stop_searching' is set to True
stop_searching = False

#Finding the nextpage link in the first iteration is a little different, and therefore this value is needed
first_iteration = True

#The root url is the domain of wikinews
root_link = 'https://en.wikinews.org'

#next_page is the webpage that the algorithm searches for articles in next iteration of the while-loop
next_page = root_link + '/w/index.php?title=Category:Politics_and_conflicts'

#The links to the articles starting with the 'article_start_letters' are appended to 'article links'
article_links = []

#For each iteration this list gets some values if the first letter 
#of the first article in the next webpage is between A and K
first_letter_between_B_K = []

#A regex used for 'first_letter_between_B_K'
continue_iterations = re.compile(r"pagefrom=[A-K]")

In [6]:
while not(stop_searching):
    soup = getSoup(next_page)
    articles = soup.find(id="mw-pages")
    
    links = [link.get("href") for link in articles.find_all('a')]
    
    if first_iteration:
        first_letter_between_B_K = continue_iterations.findall(links[0])
        first_iteration = False
        next_page = root_link + links[0]
        article_links += [root_link + group_link for group_link in links[1:] if group_link[6] in article_start_letters]
    else:
        first_letter_between_B_K = continue_iterations.findall(links[1])
        next_page = root_link + links[1]
        article_links += [root_link + group_link for group_link in links[2:] if group_link[6] in article_start_letters]
    
    if len(first_letter_between_B_K) == 0:
        stop_searching = True
    
    first_letter_between_B_K = []

In [16]:
article_source_code = [getSoup(article) for article in article_links]

In [22]:
#These id's has to be different from the other articles
article_id = range(len(data),len(data)+len(article_links))
article_titles = [article.find('h1').get_text() for article in article_source_code]
article_release_date = [str(article.find(id="publishDate"))[50:60] for article in article_source_code]
article_urls = article_links
article_content = [" ".join([p.get_text() for p in (article.find(id="mw-content-text")).find_all('p')]) for article in article_source_code]

In [26]:
scraped_articles = pd.DataFrame()

scraped_articles['id'] = article_id
scraped_articles['content'] = [cleantext(content) for content in article_content]
scraped_articles['title'] = article_titles
scraped_articles['release_date'] = article_release_date
scraped_articles['url'] = article_urls

scraped_articles

Unnamed: 0,id,content,title,release_date,url
0,0,i agree with brs that categorypolitical activi...,Category talk:Activists,,https://en.wikinews.org/wiki/Category_talk:Act...
1,1,<DATE> in british columbia canada leadership d...,B.C. elections debate fiery but not conclusive,,https://en.wikinews.org/wiki/B.C._elections_de...
2,2,wednesday <DATE> a suicide car bomb exploded y...,"Baghdad bombing kills several people, scores i...",2010-01-27,https://en.wikinews.org/wiki/Baghdad_bombing_k...
3,3,sunday <DATE> a judge in baghdad iraq has clea...,Baghdad judge clears pair of murdering six for...,2010-10-10,https://en.wikinews.org/wiki/Baghdad_judge_cle...
4,4,thursday <DATE> the bodies of over <NUM><NUM> ...,"Baghdad morgue received over 1,000 bodies in July",2005-08-18,https://en.wikinews.org/wiki/Baghdad_morgue_re...
...,...,...,...,...,...
2855,2855,wednesday <DATE> kyrgyz authorities declared <...,"Kyrgyz government declares elections valid, re...",2005-03-23,https://en.wikinews.org/wiki/Kyrgyz_government...
2856,2856,tuesday <DATE> thousands of protesters seized ...,Kyrgyz president orders election probe as prot...,2005-03-22,https://en.wikinews.org/wiki/Kyrgyz_president_...
2857,2857,sunday june <NUM> <NUM> kyrgyzstani citizens o...,Kyrgyzstan votes on referendum for new constit...,2010-06-27,https://en.wikinews.org/wiki/Kyrgyzstan_votes_...
2858,2858,saturday june <NUM> <NUM> a second day of ethn...,"Kyrgyzstan: Ethnic unrest continues, governmen...",2010-06-12,https://en.wikinews.org/wiki/Kyrgyzstan:_Ethni...


In [None]:
scraped_articles.to_csv("SQLtables/scraped_articles.csv",index=False,header=False)

In [None]:
#Setting up a connection with the SQL server. Make sure that you write your own dbname, user and password as input
conn = pc.connect(SQL_database_login)
cur = conn.cursor()

In [None]:
executeSQL('SQLfiles/createTableScraped.sql', cur)

## Establish a baseline

In [40]:
print(type(data["title"][1]))

<class 'str'>


In [31]:
#Remove duplicate articles
data = data.drop_duplicates(subset="content")


In [34]:
#Cleaning the content
cleaned_content = [cleantext(article_content) for article_content in data['content']]

#Tokenizing the cleaned data
tokens = [tokenize(clean_text) for clean_text in cleaned_content]

#Removing stopwords
stopwords = [stopword(token_list) for token_list in tokens]

#Stemming the data (this is used for the 'keywords' attribute)
stemmed_data = [stemming(stopword_list) for stopword_list in stopwords]

#Cleaning meta keywords
clean_meta_keywords = [cleanMetaKeywords(metakeyword) for metakeyword in data["meta_keywords"]]

In [35]:
#Making sure that each element of 'tags', 'authors' and 'meta_keywords' are stripped stings and converting them to arrays
data["tags"] = [[tag.strip() for tag in (str(i)).split(",")] for i in data["tags"]]
data["authors"] = [[author.strip() for author in (str(i)).split(",")] for i in data["authors"]]
data["meta_keywords"] = [[meta_keyword.strip() for meta_keyword in (str(i)).split(",")] for i in clean_meta_keywords]
data["content"] = cleaned_content
data["id"] = range(0,len(data))

In [112]:
#Entity tables
articles = data[['id','content','url','meta_description','title']]

meta_keywords = pd.DataFrame(set(data[['meta_keywords']].explode('meta_keywords')))
meta_keywords = meta_keywords.rename(columns={0: 'meta_keywords'})
meta_keywords["ide"] = range(0,len(meta_keywords))
article_meta_keywords = pd.merge(meta_keywords, data[['id','meta_keywords']].explode('meta_keywords'), on = "meta_keywords")[['id','ide']]
article_meta_keywords = owns.rename(columns={'id': 'article_id', 'ide': 'meta_keyword_id'})


domains = pd.DataFrame(set(data['domain']))
domains = domains.rename(columns={0: 'domain'})
domains["ide"] = range(0,len(domains))
owns = pd.merge(domains, data, on = "domain")[['id','ide']]
owns = owns.rename(columns={'id': 'article_id', 'ide': 'domain_id'})

authors = pd.DataFrame(set(data[['authors']].explode('authors')))
authors = authors.rename(columns={0: 'authors'})
authors["ide"] = range(0,len(authors))
article_authors = pd.merge(authors, data[['id','authors']].explode('authors'), on = "authors")[['id','ide']]
article_authors = owns.rename(columns={'id': 'article_id', 'ide': 'author_id'})

types = pd.DataFrame(set(data['type']))
types = types.rename(columns={0: 'type'})
types["ide"] = range(0,len(types))
article_types = pd.merge(types, data, on = "type")[['id','ide']]
article_types = owns.rename(columns={'id': 'article_id', 'ide': 'type_id'})

In [113]:
#Entities to CSV
articles.to_csv("SQLtables/articles.csv",index=False,header=False)
meta_keywords.to_csv("SQLtables/meta_keywords.csv",index=False,header=False)
authors.to_csv("SQLtables/authors.csv",index=False,header=False)
domains.to_csv("SQLtables/domains.csv",index=False,header=False)
types.to_csv("SQLtables/types.csv",index=False,header=False)

#Relations to CSV
owns.to_csv("SQLtables/owns.csv",index=False,header=False)
article_authors.to_csv("SQLtables/article_authors.csv",index=False,header=False)
article_meta_keywords.to_csv("SQLtables/article_meta_keywords.csv",index=False,header=False)
article_types.to_csv("SQLtables/article_types.csv",index=False,header=False)

FileNotFoundError: [Errno 2] No such file or directory: 'SQLtables/articles.csv'

In [None]:
executeSQL('SQLfiles/createTables.sql', cur)
executeSQL('SQLfiles/setUpTables.sql', cur)

In [47]:
data_1 = data[~data['type'].isin(['unreliable','junksci','rumor', 'hate'])] #removes these types
data_1 = data_1.dropna(subset = ['type']) #drops where type is nan
data_1.loc[data_1['type'].isin(['fake','satire','bias', 'conspiracy']), 'label'] = 'Fake' #labels types 'fake'
data_1.loc[data_1['type'].isin(['reliable','political','clickbait']), 'label'] = 'True' #labels types 'true'
data_1

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source,label
5,11,5,blackgenocide.org,conspiracy,http://blackgenocide.org/speaking_request.html,speaking engagement request contact person na...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Request Speaking Engagement,[nan],,[],,[nan],,,Fake
6,13,6,blackgenocide.org,conspiracy,http://blackgenocide.org/archived_articles/opp...,why we oppose planned parent hood the followi...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Why We Oppose Planned Parenthood,[nan],,"[planned parenthood, minorities, black culture...",A rationale for opposing the work of Planned P...,[nan],,,Fake
7,14,7,bipartisanreport.com,clickbait,http://bipartisanreport.com/cdn-cgi/l/email-pr...,the website from which you got to this page is...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Email Protection,[nan],,[],,[nan],,,True
9,23,9,blacklistednews.com,clickbait,https://www.blacklistednews.com/Egypt%E2%80%99...,egypt’s presidential campaign has kicked into ...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Egypt’s race for president kicks off with arre...,[nan],,[],"The Best in uncensored news, information, and ...",[nan],,,True
10,26,10,clickhole.com,satire,http://www.clickhole.com/article/serendipity-m...,if you don’t believe in fate here’s a story th...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Serendipity: This Man Made Up An Entire Person...,[nan],,"[wow, love, relationships, coffee, beautiful, ...","If you don’t believe in fate, here’s a story t...",[nan],,,Fake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3883,931,awarenessact.com,conspiracy,http://awarenessact.com/remembering-apollo-1-n...,space exploration is an intriguing and necessa...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Remembering Apollo 1: NASA’s Careless Mistake ...,[Gerald Sinclair],,[],,"[Nasa, apollo one, people, honest, lethal, sad...",,,Fake
996,3887,932,beforeitsnews.com,fake,http://beforeitsnews.com/blogging-citizen-jour...,fisa documents released of readers think this...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,FISA Documents Released,[nan],,[],,[nan],,,Fake
997,3891,933,canadafreepress.com,conspiracy,http://canadafreepress.com/members/1/EliasBejj...,elias bejjani elias bejjani chairman for the ...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Elias Bejjani,"[Because Without America, There Is No Free Wor...",,[],,[nan],,,Fake
998,3898,934,beforeitsnews.com,fake,http://beforeitsnews.com/blogging-citizen-jour...,note i do not necessarily endorse any products...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Fake “Memos” Being Released Into The Info Stre...,[nan],,[],,[nan],,,Fake


In [29]:
vect = TfidfVectorizer()
content_tfidf = vect.fit_transform(scraped_articles['content'])
content_tfidf_df = pd.DataFrame(content_tfidf.todense(),columns = vect.get_feature_names())


In [30]:
X_train, X_test, y_train, y_test = train_test_split(content_tfidf, articles['label'], test_size=0.40, random_state=42)

NameError: name 'train_test_split' is not defined

In [None]:
k_nearest = KNeighborsClassifier(n_neighbors=15, weights='distance')

# Fit the model
k_nearest.fit(X_train,y_train)

# Predict on the test set
k_nearest_pred = k_nearest.predict(X_test)

# Evaluate performance
print("k_nearest accuracy:" + str(accuracy_score(y_test,k_nearest_pred)))
