In [16]:
import os
import json
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

nltk.download("stopwords")


def load_pheme_dataset(path_to_pheme):
    data = []

    for event in os.listdir(path_to_pheme):
        event_path = os.path.join(path_to_pheme, event)

        if os.path.isdir(event_path):
            for rumor_type in ["rumours", "non-rumours"]:
                rumor_path = os.path.join(event_path, rumor_type)
                for tweet_id in os.listdir(rumor_path):
                    tweet_file = os.path.join(rumor_path, tweet_id, "source-tweets", f"{tweet_id}.json")

                    if os.path.isfile(tweet_file):
                        with open(tweet_file, "r") as file:
                            tweet_data = json.load(file)
                            text = tweet_data["text"]
                            label = 1 if rumor_type == "rumours" else 0
                            user_mentions = " ".join([mention["screen_name"] for mention in tweet_data["entities"]["user_mentions"]])
                            data.append({"id": tweet_id, "label": label, "text": text, "user_mentions": user_mentions})

    return pd.DataFrame(data)


def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep only alphabetical characters and whitespace
    text = text.lower()  # Convert to lowercase
    return text


def data_pipeline(csv_file):
    data = pd.read_csv(csv_file)
    data["text"] = data["text"].apply(preprocess_text)

    # Only keep label and text columns
    data = data[["label", "text"]]

    # Save the processed data
    data.to_csv("pheme_processed_data.csv", index=False)

    return data


path_to_pheme = "./PHEME/all-rnr-annotated-threads"
data = load_pheme_dataset(path_to_pheme)

# Save the data to a CSV file
data.to_csv("pheme_data.csv", index=False)

# Load data from the CSV file and run the data pipeline
data_pipeline("pheme_data.csv")



[nltk_data] Downloading package stopwords to /Users/Alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,text
0,1,breaking a germanwings airbus a has crashed in...
1,1,breaking germanwings ceo plane victims include...
2,1,a germanwings flight u registration daipx was ...
3,1,germanwings copilot suffered serious depressio...
4,1,passenger plane carrying people crashes in th...
...,...,...
6420,0,franz marc horses updategurlitt nazitainted ar...
6421,0,munich district court has confirmed the applic...
6422,0,where should the gurlitt collection go many p...
6423,0,possible nazi art transfer riles jewish groups...


In [20]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def load_buzzfeed_dataset(buzzfeed_real_path, buzzfeed_fake_path):
    buzzfeed_real = pd.read_csv(buzzfeed_real_path)
    buzzfeed_fake = pd.read_csv(buzzfeed_fake_path)
    buzzfeed_df = pd.concat([buzzfeed_real, buzzfeed_fake])
    del buzzfeed_real, buzzfeed_fake

    buzzfeed_df["type"] = buzzfeed_df["id"].apply(lambda x: x.split("_")[0])
    buzzfeed_df = buzzfeed_df[["id", "title", "text", "source", "type", "images", "movies"]]
    buzzfeed_df["movies"] = buzzfeed_df["movies"].apply(lambda x: 1 if not pd.isna(x) else 0)
    buzzfeed_df["images"] = buzzfeed_df["images"].apply(lambda x: 1 if not pd.isna(x) else 0)

    return buzzfeed_df

def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    return text

def data_pipeline(csv_file):
    data = pd.read_csv(csv_file)
    #print(data['type'])
    #input()
    data["text"] = data["text"].apply(preprocess_text)

    data["label"] = data["type"].apply(lambda x: 1 if x == "Fake" else 0)

    
    # Only keep label and text columns
    data = data[["label", "text"]]

    # Save the processed data
    data.to_csv("buzz_processed_data.csv", index=False)

    return data

buzzfeed_real_path = "./buzzfeed/BuzzFeed_real_news_content.csv"
buzzfeed_fake_path = "./buzzfeed/BuzzFeed_fake_news_content.csv"
data = load_buzzfeed_dataset(buzzfeed_real_path, buzzfeed_fake_path)

# Save the data to a CSV file
data.to_csv("buzzfeed_data.csv", index=False)

# Load data from the CSV file and run the data pipeline
data_pipeline("buzzfeed_data.csv")




Unnamed: 0,label,text
0,0,on saturday september at pm est an explosion...
1,0,less than a day after protests over the police...
2,0,obama to un giving up liberty enhances securit...
3,0,getty images wealth of nations trump vs clinto...
4,0,president obama today vetoed a bill that would...
...,...,...
177,1,hillarys top donor country just auctioned off ...
178,1,advertisement story continues below\n\nthe fi...
179,1,well thats weird if the birther movement is ra...
180,1,\n\ntheres a lot to be discussed about last ni...


In [18]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def load_liar_dataset(train_path, test_path, valid_path):
    col_names = ["id", "label", "statement", "subject", "speaker", "speaker_title", "state", "party",
                 "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts",
                 "context"]
    train_data = pd.read_csv(train_path, sep='\t', header=None, names=col_names)
    test_data = pd.read_csv(test_path, sep='\t', header=None, names=col_names)
    valid_data = pd.read_csv(valid_path, sep='\t', header=None, names=col_names)
    
    data = pd.concat([train_data, test_data, valid_data])
    return data

def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    return text

def data_pipeline(csv_file):
    data = pd.read_csv(csv_file)
    data["statement"] = data["statement"].apply(preprocess_text)

    
    data["text"] = data["statement"]
    data["label"] = data["label"].apply(lambda x: 1 if x == "false" or x == "pants-fire" else 0)

    # Only keep label and text columns
    data = data[["label", "text"]]

    # Save the processed data
    data.to_csv("liar_processed_data.csv", index=False)

    return data

train_path = "./liar_dataset/train.tsv"
test_path = "./liar_dataset/test.tsv"
valid_path = "./liar_dataset/valid.tsv"
data = load_liar_dataset(train_path, test_path, valid_path)

# Save the data to a CSV file
data.to_csv("liar_data.csv", index=False)

# Load data from the CSV file and run the data pipeline
data_pipeline("liar_data.csv")

Unnamed: 0,label,text
0,1,says the annies list political group supports ...
1,0,when did the decline of coal start it started ...
2,0,hillary clinton agrees with john mccain by vot...
3,1,health care reform legislation is likely to ma...
4,0,the economic turnaround started at the end of ...
...,...,...
12786,0,for the first time in more than a decade impor...
12787,0,says donald trump has bankrupted his companies...
12788,0,john mccain and george bush have absolutely no...
12789,1,a new poll shows percent support the presiden...
