In [None]:
import pandas as pd
import numpy as np
import re
import math

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import porter
# Fetch the NLTK resources the notebook relies on: the Punkt sentence-tokenizer data
# used by word_tokenize and the stop-word lists.
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models, so both
# are requested to keep word_tokenize working across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)
import matplotlib.pyplot as plt

## Task 1

Group members: Angelina Näsström (nzv947), Daniel Stephensen (fbp131), Kristina Wilke (mlt790), Lauritz Koch (hdg618)

## Task 2

We have used the following procedures: cleaning, tokenizing, removing stopwords and stemming the data. When cleaning the data we made sure of the following: 
1. all letters are in lowercase
2. all urls are written as < URL >
3. all dates are written as < DATE >
4. all emails are written as < EMAIL >
5. all numbers are written as < NUM >
6. all unimportant symbols are removed

Converting all letters to lowercase makes it easier to compare different words. Points 2-5 are useful because they make it possible to count the number of urls, dates, emails and numbers. Also, replacing these with placeholders makes sure that they are not treated as words. Removing unimportant symbols makes sure that these are not treated as words either.

Tokenization makes processing of the data easier, as it splits the text into individual word tokens instead of one long string with blank spaces, punctuation etc., making the text more homogeneous. Because all the data was made lower-case during cleaning, we do not, for example, get two different tokens when processing 'Hello' and 'hello'.

Removing stopwords is useful because these words do not help give meaning to the documents; in other words, they are noise.

Stemming the data is useful because it makes sure that different variants of the same word are converted into the root of the word. This way, two different word forms (the same word with different endings) are understood the same way, because they have essentially the same meaning.

Implementing task 2 we have used the Pandas library, the nltk library and the re library. The Pandas library has only been used to read the data from the 'news_sample.csv' file, and the re library has been used for the regular expressions in the cleaning step. word_tokenize is a function in nltk for tokenizing text. stopwords is a corpus in nltk.corpus that provides lists of stopwords to remove. porter is a module of nltk.stem that provides the Porter stemmer for stemming data. These three parts of nltk are useful because you do not need to create your own complex functions to tokenize, remove stopwords and stem the data. We have not used the clean_text library because it did not have all the functionality needed for the task.

In [None]:
# Load the news sample into a DataFrame; the cells below read its
# 'content', 'type', 'domain' and 'authors' columns.
data = pd.read_csv(filepath_or_buffer='news_sample.csv')

In [None]:
def cleantext(text):
    """Normalise one raw article into lower-case text with placeholder tokens.

    The steps run in this order, and the order matters:

    1. convert to ``str`` and lower-case;
    2. strip literal ``<`` and ``>`` so that the only angle brackets left in the
       result belong to placeholders;
    3. replace e-mail addresses with ``<EMAIL>``;
    4. replace URLs with ``<URL>``;
    5. replace dates (month or weekday name followed by a day number) with ``<DATE>``;
    6. replace remaining digit runs with ``<NUM>``;
    7. collapse literal ``\\n`` / ``\\t`` escape sequences and runs of whitespace
       into a single space;
    8. delete punctuation and other symbols.

    Args:
        text: Raw article content. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The cleaned text.
    """
    text = str(text).lower()
    text = re.sub(r'<|>', "", text)
    # E-mails are handled before URLs: the URL pattern would otherwise swallow the
    # domain part ("example.com") and leave "user@<URL>" behind, which the e-mail
    # pattern can no longer match. Dotted local parts and multi-level domains
    # ("a.b@mail.example.org") are covered as well.
    text = re.sub(r'[\w.+-]+@[\w-]+(\.[\w-]+)+', "<EMAIL>", text)
    text = re.sub(r'(https?:\/\/)?w{0,3}\.?[a-z]+\.[a-z]\w*[\w\/-]*', "<URL>", text)
    # \b keeps month names from matching inside longer words ("dismay 3"), and the
    # dot after "jun"/"jul" is optional so that "june 5" and "july 4th" are recognised.
    text = re.sub(r'\b(jan\.?(uary)?|feb\.?(uary)?|mar\.?(ch)?|apr\.?(il)?|may|jun\.?(e)?|jul\.?(y)?|aug\.?(ust)?|sep\.?(tember)?|oct\.?(ober)?|nov\.?(ember)?|dec\.?(ember)?|monday|tuesday|wednesday|thursday|friday|saturday|sunday) (the )?\d{1,2}((th)?,?( \d{4})?)?', "<DATE>", text)
    # Dates must already be replaced here, otherwise their digits would become <NUM>.
    text = re.sub(r'[0-9]+', "<NUM>", text)
    # The \\n and \\t alternatives match a literal backslash followed by n / t.
    text = re.sub(r'(\\n)+|\s{2,}|(\\t+)', " ", text)
    # '<' and '>' are deliberately not in this list, so the placeholders survive.
    text = re.sub(r'\.|,|\\|-|\?|\(|\)|\||&|"|”|“|:|!|\+|-|–|—|\/|\$|%|€|#|;|\[|\]|©|®|…|=', "", text)
    return text

# Clean every article body, then concatenate the cleaned articles into one
# space-separated corpus string and display it.
clean_data = list(map(cleantext, data["content"]))
string_of_contents = " ".join(clean_data)
print(string_of_contents)

In [None]:
# The tokenizer input is the cleaned corpus string; the alias below only gives it
# a second name.
cleaned_data_SAMPLE_FOR_CODING_PURPOSES = string_of_contents

# clda_tokens = "clean data tokens": the whole cleaned corpus split into tokens.
clda_tokens = word_tokenize(string_of_contents)

In [None]:
# swclda_tokens = "stop-worded clean data tokens": the corpus tokens with the
# English stop words filtered out.
stop_words = stopwords.words('english')
# Testing membership against a list rescans it for every token; a set gives
# constant-time lookups. stop_words itself is left as the original list.
stop_word_lookup = set(stop_words)
swclda_tokens = [word for word in clda_tokens if word not in stop_word_lookup]

In [None]:
# Reduce every remaining token to its Porter stem.
stemmer = porter.PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in swclda_tokens]

# stsw_vocab = "stemmed, stop-worded vocabulary": the distinct stems in the corpus.
# Bare '<' and '>' tokens are excluded (presumably left over from the <URL>/<NUM>/...
# placeholders being split by the tokenizer). Set difference is used instead of
# set.remove so the cell does not raise KeyError when one of them is absent.
stsw_vocab = set(stemmed_tokens) - {'<', '>'}
print(stsw_vocab)

## Task 3

non-trivial observation 1: What percentage of the articles containing the word "trump" are fake news?

In [None]:
# Tokenize each cleaned article separately and keep the set of distinct tokens per article.
tokenized_articles = [word_tokenize(i) for i in clean_data]
articles_vocabulary = [set(i) for i in tokenized_articles]

# Positions (in clean_data / data) of the articles whose vocabulary contains "trump".
trump_included = [i for i, vocabulary in enumerate(articles_vocabulary) if "trump" in vocabulary]
total_fake_news = 0  # not used in this cell; kept so the name stays defined

# Look up the label of each *matching* article. Indexing with the loop counter
# 0..len(trump_included)-1 would inspect the first N articles of the corpus instead
# of the ones that mention "trump". clean_data was built by iterating the
# 'content' column in order, so positional access (.iloc) is the right lookup.
trump_fake_news = sum(
    1 for article_pos in trump_included
    if data['type'].iloc[article_pos] == "fake"
)

if trump_included:
    print(int(trump_fake_news*100/len(trump_included)),"% of articles where the name 'trump' is present, is a fake news article")
else:
    # Avoid a ZeroDivisionError when no article mentions the word at all.
    print("No article in the corpus contains the word 'trump'")

non-trivial observation 2: Are the articles spread reasonably evenly between the domains?

non-trivial observation 3: Is there a link between which domain an article comes from and whether it is fake news?

In [None]:
# Per-domain article counts and per-domain fake-news counts.
domainList = data['domain']
TypeList = data['type']
domains = set(domainList)

# Vectorised counting replaces the nested loop over every (article, domain) pair.
total_counts = domainList.value_counts()
fake_counts = domainList[TypeList == 'fake'].value_counts()

# Both arrays follow the iteration order of `domains`, one float entry per domain;
# a domain without (fake) articles gets 0.
totalDomainScore = np.array([total_counts.get(domain, 0) for domain in domains], dtype=float)
fakeDomainScore = np.array([fake_counts.get(domain, 0) for domain in domains], dtype=float)

# The number of domains is derived from the data rather than hard-coded.
print("Each of the", len(domains), "domains present in the corpus has the following amount of articles in the corpus:\n", totalDomainScore)
print("\nEach of the", len(domains), "domains present in the corpus has the following amount of fake news articles:\n", fakeDomainScore)

if fake_counts.empty:
    # Nothing to single out; also avoids calling idxmax on an empty Series.
    print("\nNo article in the corpus is labelled as fake news, so no domain can be linked to fake news")
else:
    # Single out the domain with the most fake articles instead of searching for a
    # magic count (155), which breaks as soon as the data changes.
    top_domain = fake_counts.idxmax()
    top_total = int(total_counts[top_domain])
    top_fake = int(fake_counts[top_domain])
    corpus_size = int(total_counts.sum())
    fake_elsewhere = int(fake_counts.sum()) - top_fake

    # The share of the corpus is based on the domain's *total* article count,
    # not on its fake-article count.
    print("\nThis means that", str(top_domain), "has", top_total, "of the articles in the corpus and", int(top_total*100/corpus_size), "% of all articles. Thus, the number of articles in the corpus are very unevenly spreed between the domains")

    if fake_elsewhere == 0:
        other_domains_note = "no other domain has fake news in its articles"
    else:
        other_domains_note = "the other domains together have " + str(fake_elsewhere) + " fake news articles"
    print("\nAlso,", int(top_fake*100/top_total), "% of " + str(top_domain) + "'s articles are fake news and " + other_domains_note + ". Thus, there is a link between which domain an article comes from and if it is fake news (The link is probably a little to big)")

non-trivial observation 4: Is there a link between an article having no listed author and it being fake news?

In [None]:
def _percent(part, whole):
    """Return part/whole as a truncated whole-number percentage; 0 when whole is 0."""
    return int(part * 100 / whole) if whole else 0


# An article counts as having an author only when the 'authors' cell is a string;
# anything else (e.g. the float NaN pandas uses for an empty cell) counts as "no author".
authors = list(data["authors"])
has_author = [isinstance(author, str) for author in authors]
article_is_fake = [label == "fake" for label in data["type"]]

no_author_total = has_author.count(False)
no_author_counter = no_author_total  # same quantity; both names are kept defined
author_total = has_author.count(True)

no_author_fake_news = sum(1 for named, fake in zip(has_author, article_is_fake) if fake and not named)
author_fake_news = sum(1 for named, fake in zip(has_author, article_is_fake) if fake and named)

# _percent guards against ZeroDivisionError when one of the groups is empty.
print(_percent(no_author_counter, len(authors)), "% of the articles does not have an author")
print(_percent(no_author_fake_news, no_author_total), "% of the articles without an author are fake news, compared to", _percent(author_fake_news, author_total), "% of the articles with an author")