# Clean the dataset

In [1]:
import os
import json

# Collect every article folder under gossipcop/fake that has no "news content.json".
missing_files_list = []

# os.path.join keeps the path free of backslash escape sequences and portable across platforms.
directory = os.path.join("fakenewsnet_dataset", "gossipcop", "fake")
for filename in os.listdir(directory):
    file = os.path.join(directory, filename, "news content.json")
    try:
        with open(file, "r") as f:
            json_file = json.load(f)  # content is not used here; only a missing file matters
    except FileNotFoundError:
        # Remember the article folder itself (not the JSON path) so it can be deleted later.
        missing_files_list.append(os.path.join(directory, filename))

In [8]:
len(os.listdir(directory))

5323

In [11]:
missing_files_list[0]

'fakenewsnet_dataset\\gossipcop\\fake\\gossipcop-1000908841'

In [13]:
# Delete each folder collected in missing_files_list; os.rmdir only succeeds on empty directories.
for empty_dir in missing_files_list:
    os.rmdir(empty_dir)

In [15]:
# Collect every article folder under gossipcop/real that has no "news content.json".
missing_files_list = []

# os.path.join keeps the path free of backslash escape sequences and portable across platforms.
directory = os.path.join("fakenewsnet_dataset", "gossipcop", "real")
for filename in os.listdir(directory):
    file = os.path.join(directory, filename, "news content.json")
    try:
        with open(file, "r") as f:
            json_file = json.load(f)  # content is not used here; only a missing file matters
    except FileNotFoundError:
        # Remember the article folder itself (not the JSON path) so it can be deleted later.
        missing_files_list.append(os.path.join(directory, filename))

In [17]:
len(os.listdir(directory))

16817

In [31]:
len(missing_files_list)

1674

In [32]:
import shutil

# Remove the folders listed in missing_files_list together with anything inside them.
# A folder that cannot be removed is recorded in extra_files instead of stopping the loop.
extra_files = []
for folder in missing_files_list:
    try:
        shutil.rmtree(folder)
    except OSError:
        extra_files.append(folder)

# Make CSVs

## Make text dataset
Use gossipcop

In [40]:
import os
import json
import numpy as np
import pandas as pd

# Build dataset_text.csv: one row per article with its full text and a label
# (1.0 for the "real" folder, 0.0 for the "fake" folder).
text = []
label = []

# Real articles first, then fake ones, so row order is real -> fake.
for subfolder, class_label in (("real", 1.0), ("fake", 0.0)):
    # os.path.join avoids backslash escape sequences in the path literal.
    directory = os.path.join("fakenewsnet_dataset", "gossipcop", subfolder)
    for filename in os.listdir(directory):
        file = os.path.join(directory, filename, "news content.json")
        with open(file, "r") as f:
            json_file = json.load(f)
        text.append(json_file["text"])
        # The label is appended together with its text so both lists always stay aligned.
        label.append(class_label)

data = { "text" : text, "label" : label }

df = pd.DataFrame(data)
df.to_csv("dataset_text.csv")

# Make title dataset
Use gossipcop


In [93]:
# split_text takes (text, length_max, length_min); length_min=0 keeps every non-empty chunk.
x = split_text(df["text"].values.tolist()[1], 512, 0)

In [107]:
# Compare the word count of one article with the word counts of its chunks.
sample_text = df["text"].values.tolist()[1]
print(len(sample_text.split(" ")))
# split_text takes (text, length_max, length_min); length_min=0 keeps every non-empty chunk.
x = split_text(sample_text, 512, 0)
for i in x:
    print(len(i.split(" ")))

705
475
231


In [85]:
len(df["text"].values.tolist()[1].split(" "))

705

In [38]:
import os
import json
import numpy as np
import pandas as pd

# Build dataset_title.csv: one row per article with its title (stored in the "text" column)
# and a label (1.0 for the "real" folder, 0.0 for the "fake" folder).
text = []
label = []

# Real articles first, then fake ones, so row order is real -> fake.
for subfolder, class_label in (("real", 1.0), ("fake", 0.0)):
    # os.path.join avoids backslash escape sequences in the path literal.
    directory = os.path.join("fakenewsnet_dataset", "gossipcop", subfolder)
    for filename in os.listdir(directory):
        file = os.path.join(directory, filename, "news content.json")
        with open(file, "r") as f:
            json_file = json.load(f)
        text.append(json_file["title"])
        # The label is appended together with its title so both lists always stay aligned.
        label.append(class_label)

data = { "text" : text, "label" : label }

df = pd.DataFrame(data)
df.to_csv("dataset_title.csv")

# Data understanding

In [89]:
# Average number of space-separated words per row of the normalized text dataset.
df = pd.read_csv("dataset_text_normalized-m2-128.csv")
word_counts = [len(str(entry).split(" ")) for entry in df["text"].values.tolist()]
np.mean(word_counts)

105.38691784125623

### On average there are 573.6871098237027 words in a non-normalized sentence.
### On average there are 402.1050809489869 words in a sentence normalized to a maximum length of 512.
### On average there are 105.38691784125623 words in a sentence normalized to a maximum length of 128.

In [66]:
# Average number of space-separated words per title.
df = pd.read_csv("dataset_title.csv")
word_counts = [len(str(entry).split(" ")) for entry in df["text"].values.tolist()]
np.mean(word_counts)

10.639314787993808

### On average there are 10.639314787993808 words in a title.

# Make text normalized (to 512 words per sentence) dataset

## Text preprocessing function

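The function replaces tabs and newlines with spaces, removes list numbering, bracketed citations and markers (e.g. `[321]`, `[edit]`), backslashes, parentheses and runs of `=`, `-`, `+`, and finally collapses repeated spaces and repeated dots.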

In [51]:
import re

def preprocessing(text:str) -> str:
    """Clean raw article text before it is split into sentences.

    The substitutions run in a fixed order; every removed fragment is replaced
    by a space, and repeated spaces and repeated dots are collapsed at the end.

    Args:
        text: Raw text of an article or title.

    Returns:
        The cleaned text. It is not stripped, so a single leading or trailing
        space may remain.

    Note:
        A one- or two-digit number that ends a sentence (e.g. "She was 21.")
        cannot be told apart from list numbering and is still removed.
    """
    text = re.sub(r'[\t\n]', ' ', text) # replace tabs and newlines with a space
    # Remove list numbering such as "1." or "12.". The look-arounds leave decimals
    # ("3.5") and longer numbers that end a sentence ("2021.") untouched.
    text = re.sub(r'(?<![0-9])[0-9]{1,2}\.(?![0-9])', ' ', text)
    text = re.sub(r'\[[0-9]+\]', ' ', text) # remove citations - ex: [321]
    text = re.sub(r'\\', ' ', text) # replace '\' (as in word\'s) with a space
    # Remove [word] markers - ex: [edit]. The match is limited to a single bracket
    # pair so the text between two separate markers is kept.
    text = re.sub(r'\[[^\[\]]+\]', ' ', text)
    text = re.sub(r'\(|\)', ' ', text) # remove ( and )
    text = re.sub(r'[=\-\+]+', ' ', text) # remove runs of =, - and +
    text = re.sub(r' +', ' ', text) # reduce spaces to one space
    text = re.sub(r'\.+', '.', text) # replace ... with .
    return text

In [50]:
def split_add_tags_text(text: str) -> str:
    """Wrap a text in markers: "[CLS]" once at the front, "[SEP]" after each sentence.

    The text is cleaned with ``preprocessing`` and split on ". "; every piece
    is written back as " <sentence>. [SEP]".

    Args:
        text: Text made of one or more sentences.

    Returns:
        A single string of the form "[CLS] s1. [SEP] s2. [SEP] ...".
    """
    text = preprocessing(text)
    final_article = "[CLS]"
    for sentence in text.split(". "):
        final_article = f"{final_article} {sentence}. [SEP]"

    # A piece that still carries its own period ends up as "..": collapse it to one.
    final_article = re.sub(r'\.\.', '.', final_article)
    return final_article

In [52]:
def split_text(text: str, length_max: int, length_min: int = 0) -> list[str]:
    """Clean a text and cut it into tagged chunks of limited word count.

    Words are counted by splitting on single spaces, sentences by splitting on
    ". ". A text that fits the limit is returned as one tagged string; a longer
    text is cut at sentence boundaries into several chunks. Every returned
    string has been passed through ``split_add_tags_text``.

    Args:
        text: Raw article text or title.
        length_max: Word limit that triggers and bounds the chunking.
        length_min: The single-piece text, and the last chunk of a split text,
            are kept only if they have more than this many words. Defaults to
            0 so the function can be called with two arguments.

    Returns:
        A list of tagged strings; empty when nothing passes the length filter
        or the cleaned text is blank.

    Note:
        A single sentence longer than the limit is never cut; it becomes a
        chunk of its own that exceeds ``length_max``.
    """
    # The sentence count is taken from the raw text, before cleaning.
    num_tokens = len(text.split(". "))
    text = preprocessing(text)
    if not text.strip():
        return []

    # The limit is reduced by (num_tokens + 1) - presumably to leave room for
    # one [SEP] per sentence plus the [CLS] marker.
    if len(text.split(" ")) <= length_max - (num_tokens + 1):
        if len(text.split(" ")) > length_min:
            return [split_add_tags_text(text)]
        return []

    chunks = []
    total_words = 0
    current = ""
    for sentence in text.split(". "):
        words = len(sentence.split(" "))
        if words + total_words > length_max - 2:
            # Adding this sentence would overflow: close the running chunk first.
            current = current.strip()
            # When the overflowing sentence is the first of a chunk there is
            # nothing to close; do not emit an empty chunk in that case.
            if current:
                chunks.append(current)
            total_words = 0
            current = ""

        total_words += words
        current = f"{current}{sentence}. "

    # Only the trailing chunk is subject to the minimum-length filter.
    current = current.strip()
    if len(current.split(" ")) > length_min:
        chunks.append(current)

    return [split_add_tags_text(chunk) for chunk in chunks]

In [53]:
from typing import Tuple, Literal

def extract(directory: str, length_max:int, length_min:int, component: str, news_type: Literal["fake", "true"]) -> Tuple[list[str], list[float]]:
    """Read one field from every article in a folder and return labeled chunks.

    Each sub-folder of ``directory`` must contain a "news content.json" file.
    The requested field is cut into chunks by ``split_text`` and every chunk
    receives the label of the folder.

    Args:
        directory: Folder whose sub-folders each hold one article.
        length_max: Forwarded to ``split_text``.
        length_min: Forwarded to ``split_text``.
        component: Key of the JSON field to read (e.g. "text" or "title").
        news_type: "fake" labels every chunk 0.0, "true" labels it 1.0.

    Returns:
        A ``(text, label)`` pair of equally long lists.

    Raises:
        ValueError: If ``news_type`` is neither "fake" nor "true".
    """
    # Reject unknown types up front; otherwise chunks would be returned without labels.
    if news_type == "fake":
        class_label = 0.0
    elif news_type == "true":
        class_label = 1.0
    else:
        raise ValueError(f"news_type must be 'fake' or 'true', got {news_type!r}")

    text = []
    label = []
    for filename in os.listdir(directory):
        file = os.path.join(directory, filename, "news content.json")
        with open(file, "r") as f:
            json_file = json.load(f)
        news = split_text(text=json_file[component], length_max=length_max, length_min=length_min)
        # Drop every empty chunk, not just the first one.
        news = [chunk for chunk in news if chunk]
        text.extend(news)
        label.extend([class_label] * len(news))

    return text, label

In [54]:
import os
import json
import numpy as np
import pandas as pd

# Build the normalized text dataset: every article text is cut into tagged chunks
# and each chunk becomes one labeled row.
text = []
label = []
length_max = 512
length_min = 15

# (sub-folder, news_type) pairs; real news first so row order is real -> fake.
for subfolder, news_type in (("real", "true"), ("fake", "fake")):
    # os.path.join avoids backslash escape sequences in the path literal.
    directory = os.path.join("fakenewsnet_dataset", "gossipcop", subfolder)
    text_output, label_output = extract(directory=directory,
                                        length_max=length_max,
                                        length_min=length_min,
                                        component="text",
                                        news_type=news_type,
                                        )
    text.extend(text_output)
    label.extend(label_output)


data = { "text" : text, "label" : label }

df = pd.DataFrame(data)
df.to_csv(f"dataset_text_normalized-m5-M{length_max}-m{length_min}.csv")

This cell takes roughly 10-30 s to run.

In [111]:
import os
import json
import numpy as np
import pandas as pd

# Build the normalized title dataset: every title is cleaned, tagged and stored
# as one labeled row (titles that fail the length filter are skipped).
text = []
label = []
length_max = 128
length_min = 5

# (sub-folder, news_type) pairs; real news first so row order is real -> fake.
for subfolder, news_type in (("real", "true"), ("fake", "fake")):
    # os.path.join avoids backslash escape sequences in the path literal.
    directory = os.path.join("fakenewsnet_dataset", "gossipcop", subfolder)
    text_output, label_output = extract(directory=directory,
                                        length_max=length_max,
                                        length_min=length_min,
                                        component="title",
                                        news_type=news_type,
                                        )
    text.extend(text_output)
    label.extend(label_output)


data = { "text" : text, "label" : label }

df = pd.DataFrame(data)
df = df.dropna(subset=["text"])
# The file name is derived from the limits above so it cannot drift from them.
df.to_csv(f"dataset_title_normalized-M{length_max}-m{length_min}.csv")

This cell takes about 7 s to run.

In [19]:
import pandas as pd

df = pd.read_csv("dataset_text_normalized-m3-128.csv")

In [104]:
text = df.iloc[792]["text"]

In [105]:
text


'When host Andy Cohen said that she was caught in a “megawatt lie,” Parks said, “What more can I do? I apologized already and the person I’m most concerned about is Porsha. I’m sorry that it hurt Kandi as well.” A source previously told Us in late April that the reunion episodes were “disastrous” for Parks. “It would be understandable if Phaedra didn’t come back, but it’s not finalized yet,” the source said. “It wouldn’t be Phaedra’s decision, but Bravo’s.'

In [61]:
re.sub(r'\.+', '.', text) # remove ...


"Still, Kesha's experience is a reminder to fans that things can always get better, even if it doesn't seem that way at first."

In [93]:
preprocessing(text).strip() # 328

'Britton, Swift\'s business savvy has helped her "excel as an authentic personality who establishes direct connections with her audience", "touch as many people as possible", and "generate a kind of advocacy and excitement that no level of advertising could". Swift is one of the most-followed people on social media. As of April 2021, she has approximately 176 million followers on Instagram, 6 million followers on Twitter 1 million subscribers on YouTube, and is also very active on Tumblr. She joined TikTok on August 23, 2021, becoming the fastest user to amass 100,000 followers after her first upload (in 34 minutes), and surpassed 5 million followers in the first day. Swift is known for her frequent and friendly online interactions with her fans. She has visited fans in hospitals and delivered holiday gifts to them by mail and in person, an event dubbed "Swiftmas", and considers it her "responsibility" to be conscious of her influence on young fans. She has called her relationship with