# Preprocessing for the GermanFakeNC dataset

# Import necessary libraries  

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk 
import re 
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Preprocessing

## Loading GermanFakeNC.json

In [2]:
# Load the raw GermanFakeNC annotations; the bare name below renders the frame as the cell output.
df = pd.read_json(path_or_buf='../../Datasets/GermanFakeNC/GermanFakeNC.json')
df

Unnamed: 0,Date,URL,False_Statement_1_Location,False_Statement_1_Index,False_Statement_2_Location,False_Statement_2_Index,False_Statement_3_Location,False_Statement_3_Index,Ratio_of_Fake_Statements,Overall_Rating
0,2017-08-30,https://schluesselkindblog.com/2017/08/30/proz...,Text,213-237,,,,,1,0.7
1,2017-12-18,http://blauerbote.com/2017/12/18/bild-journali...,Text,13-36,Text,52-81,,,3,0.8
2,2017-06-02,http://blauerbote.com/2017/06/02/angela-merkel...,Title,1-7,Text,70-94,Text,121-153,3,0.7
3,2017-09-25,http://smopo.ch/deutschlands-neonazis-waehlen-...,Title,1-5,Text,28-53,Text,127-141,3,0.8
4,2018-02-17,http://www.truth24.net/gruppenvergewaltigung-s...,Title,1-1,Title,3-4,Title,6-7,1,0.4
...,...,...,...,...,...,...,...,...,...,...
485,2017-11-01,http://www.anonymousnews.ru/2017/11/01/sex-dsc...,Image,,Image,,Image,,3,0.8
486,2017-11-17,https://blog.halle-leaks.de/messer-jihad-in-ko...,Title,1-10,Image,,,,1,0.7
487,2017-10-29,https://blog.halle-leaks.de/erzieherin-will-vo...,Title,1-12,Text,1-32,Text,1-32,2,0.8
488,2017-12-30,https://www.compact-online.de/armer-martin-sch...,Text,132-142,Text,143-169,,,1,0.4


## Stemming

We use CISTEM, a German stemmer published on GitHub: https://github.com/LeonieWeissweiler/CISTEM  
We also define a function named `stemmer` that removes German stop words from a dataset entry and stems the remaining words.

In [6]:
# Patterns used by stem().
stripge = re.compile(r"^ge(.{4,})")    # leading "ge" followed by at least four characters
replxx = re.compile(r"(.)\1")          # any character immediately repeated
replxxback = re.compile(r"(.)\*")      # a character followed by the "*" placeholder
stripemr = re.compile(r"e[mr]$")       # trailing "em" / "er"
stripnd = re.compile(r"nd$")           # trailing "nd"
stript = re.compile(r"t$")             # trailing "t"
stripesn = re.compile(r"[esn]$")       # trailing "e", "s" or "n"


def stem(word, case_insensitive = False):
    """Return the stem of a single German word.

    The word is lower-cased, umlauts and "ß" are transliterated, a leading
    "ge" is removed when at least four characters follow it, and suffixes
    are then stripped one at a time while more than three characters remain.

    Parameters
    ----------
    word : str
        The token to stem. An empty string is returned unchanged.
    case_insensitive : bool, optional
        When false (the default) a trailing "t" is only stripped from words
        whose first character is not upper case. When true it is stripped
        regardless of capitalisation.

    Returns
    -------
    str
        The lower-case stem.
    """
    if len(word) == 0:
        return word

    starts_upper = word[0].isupper()
    token = word.lower()

    for special, plain in (("ü", "u"), ("ö", "o"), ("ä", "a"), ("ß", "ss")):
        token = token.replace(special, plain)

    token = stripge.sub(r"\1", token)

    # Collapse multi-letter clusters and doubled letters into one-character
    # placeholders so that the length checks below count each of them once.
    for cluster, marker in (("sch", "$"), ("ei", "%"), ("ie", "&")):
        token = token.replace(cluster, marker)
    token = replxx.sub(r"\1*", token)

    may_strip_t = case_insensitive or not starts_upper

    while len(token) > 3:
        # Rules are tried in priority order; the first one that fires restarts
        # the loop so the length conditions are re-evaluated on the new token.
        rules = []
        if len(token) > 5:
            rules.extend((stripemr, stripnd))
        if may_strip_t:
            rules.append(stript)
        rules.append(stripesn)

        for rule in rules:
            token, hits = rule.subn("", token)
            if hits != 0:
                break
        else:
            # No rule matched: nothing left to strip.
            break

    # Expand the placeholders back into their original letter sequences.
    token = replxxback.sub(r"\1\1", token)
    for marker, cluster in (("%", "ei"), ("&", "ie"), ("$", "sch")):
        token = token.replace(marker, cluster)

    return token

def stemmer(title):
    """Remove German stop words from a text and stem the remaining words.

    Every character that is not an ASCII letter, a German umlaut or "ß" is
    replaced by a space; the text is then lower-cased and split on
    whitespace. Tokens found in NLTK's German stop-word list are dropped and
    the rest are passed through ``stem``.

    Because tokens are lower-cased before stemming, ``stem`` never sees a
    capitalised word here.

    Parameters
    ----------
    title : str
        Raw text (despite the name, this is used for titles and bodies alike).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Build the stop-word lookup once per text instead of once per token:
    # the original called stopwords.words('german') inside the comprehension
    # and searched the returned list linearly for every word.
    german_stopwords = frozenset(stopwords.words('german'))

    tokens = re.sub('[^a-zA-ZäöüÄÖÜß]', ' ', title).lower().split()
    return ' '.join(stem(token) for token in tokens if token not in german_stopwords)

## Web Scraping and Stemming

We created a function that takes our dataframe as input and scrapes the title and the body of each article whose URL can be fetched. It applies stop-word removal and stemming to both fields, and discards pages that have no title or whose title is just "Forbidden" (access denied). Finally, it labels each article as 0 (Real) if its Overall_Rating is less than or equal to the threshold, and as 1 (Fake) otherwise.

In [None]:
def create_dataset_fake_news(df, thresshold, timeout=30):
    """Scrape, stem and label every reachable article listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``URL`` and ``Overall_Rating``.
    thresshold : float
        Articles whose ``Overall_Rating`` is less than or equal to this value
        are labelled 0, all others 1. (The misspelt parameter name is kept so
        existing callers continue to work.)
    timeout : float, optional
        Seconds handed to ``requests.get`` so that a single unresponsive host
        cannot stall the whole run. A request that times out is skipped like
        any other failed request.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Text`` and ``Fake-Real`` with a fresh 0..n-1
        index. Pages are left out when the request raised, when the page has
        no ``<title>``, or when the stemmed title is exactly ``"forbidd"``
        (what a bare "Forbidden" title stems to).
    """
    records = []

    # Walk the two columns positionally rather than via df[col][i], which is a
    # label lookup and breaks as soon as `df` does not carry a 0..n-1 index.
    for url, rating in zip(df['URL'], df['Overall_Rating']):
        # Scraping
        try:
            page = requests.get(url, timeout=timeout)
        except Exception:
            # Deliberate best effort: any URL that cannot be fetched is skipped.
            continue

        soup = BeautifulSoup(page.content, 'html5lib')

        # Pages without a title are excluded from the result, so skip them
        # before paying for stemming the whole body.
        title_tag = soup.find('title')
        if title_tag is None:
            continue

        # Stemming and stopword removal
        title = stemmer(title_tag.get_text())
        if title == "forbidd":
            # Access-forbidden placeholder page, not an article.
            continue

        body = soup.body
        text = stemmer(body.get_text()) if body is not None else ''

        label = 0 if rating <= thresshold else 1
        records.append((title, text, label))

    # Creating final dataframe (filtered rows were never collected)
    return pd.DataFrame(records, columns=['Title', 'Text', 'Fake-Real'])

## Creating new dataset

In [30]:
# Scrape and label every article; ratings above 0.5 are labelled 1, the rest 0.
df_preprocessed = create_dataset_fake_news(df, thresshold=0.5)

In [33]:
# Inspect the scraped, stemmed and labelled frame returned by create_dataset_fake_news.
df_preprocessed

Unnamed: 0,Title,Text,Fake-Real
0,bild journali julia ropck nazifreu blau bot ma...,inhal spring blau bot magazi wissenschaf statt...,1
1,angela merkel lass rock ring abbrech blau bot ...,inhal spring blau bot magazi wissenschaf statt...,1
2,gruppenvergewaltigung sex jihadi vergewaltig j...,such hom tagesschau truth original meld abou u...,1
3,ozapf is turk versuch koreaneri wies vergewalt...,vergewaltigung asyla fluchtling migra rapefuge...,1
4,katalonie tag diktatur spanisch zentralstaa un...,katalonie tag diktatur spanisch zentralstaa un...,1
...,...,...,...
302,mehr sexattack armutsasyla uberfall frau krefe...,such hom tagesschau truth original meld abou u...,1
303,komm egal erzahl deutsch regierung berei umstr...,inhal spring fr feb th guido grand publizi aut...,1
304,amtlich meis terrori komm afghanista somalia s...,inhal spring fr feb th guido grand publizi aut...,1
305,sex dschihad ess fluchtling sturm hallowee par...,deutschlandinternationalmeinunghintergrundemed...,1


## Saving our dataset

In [34]:
# Per-column non-null counts for each class: label 0 is reported as "Real", label 1 as "Fake".
print("Real\n", df_preprocessed.loc[df_preprocessed['Fake-Real'].eq(0)].count())
print("Fake\n", df_preprocessed.loc[df_preprocessed['Fake-Real'].eq(1)].count())

In [21]:
# Persist the preprocessed frame as CSV (the target file name has no extension; the index is written as the first column).
df_preprocessed.to_csv(path_or_buf='../../Datasets/GermanFakeNC/df_preprocessed_GermanFakeNC')

Fake-Real    77
dtype: int64
Fake-Real    236
dtype: int64
