In [187]:
import joblib
import pandas as pd
import re
import numpy as np
import statistics
from readability import Readability
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

import pickle

# NOTE(security): joblib.load unpickles the file and can therefore execute
# arbitrary code; only load pipeline files that come from a trusted source.
pipeline = joblib.load("./data/knn_pipeline.joblib")

# Features to be used (column order produced by Content_Predictor):
# verbs_third_person, verbs_others, words_per_sentence_median, num_of_sentences, adverbs_rate, nouns_rate,
# adjectives_rate, verbs_third_person_rate, verbs_others_rate, automatic_readability (stored as the "ari" column)

In [188]:
# convert string into data point
# run prediction
class Content_Predictor:
    """Extract linguistic features from text and feed them to a fitted pipeline.

    Features are accumulated in ``self.data`` (one row per document) by the
    ``load_*`` methods; ``predict`` and ``score`` then pass that table to
    ``self.pipeline``.

    Every ``process_*`` / ``load_*_string`` method accepts either an iterable
    of documents or a single ``str``; a bare ``str`` is treated as one
    document rather than being iterated character by character.
    """

    # Feature columns, in the order the load_* methods arrange them
    # (``label`` is prepended by load_content_data, ``ari`` appended by the
    # readability loaders).
    _FEATURE_COLUMNS = [
        "verbs_third_person",
        "verbs_others",
        "words_per_sentence_median",
        "num_of_sentences",
        "adverbs_rate",
        "nouns_rate",
        "adjectives_rate",
        "verbs_third_person_rate",
        "verbs_others_rate",
    ]

    def __init__(self, pipeline) -> None:
        """Store the fitted ``pipeline`` and set up the POS tag groups."""
        self.pipeline: Pipeline = pipeline
        # Penn Treebank tag groups counted by __count_word_pos.
        self.adverbs = ["RB", "RBR", "RBS", "WRB"]
        self.nouns = ["NN", "NNS"]
        self.proper_nouns = ["NNP", "NNPS"]
        self.adjective = ["JJ", "JJR", "JJS"]
        self.conjunctions = ["CC"]
        # NOTE(review): the two verb group names look swapped relative to the
        # Penn Treebank tags (VBZ is the third-person-singular tag). They are
        # left untouched because the saved pipelines were presumably fitted on
        # features computed with exactly these groups -- confirm before renaming.
        self.verbs_third_person = ["VB", "VBD", "VBG", "VBN"]
        self.verbs_others = ["VBP", "VBZ"]
        self.data = pd.DataFrame()

    @staticmethod
    def _as_documents(text_iterator):
        """Wrap a bare ``str`` in a list so it is handled as one document."""
        if isinstance(text_iterator, str):
            return [text_iterator]
        return text_iterator

    @staticmethod
    def _normalise(text):
        """Collapse space runs, replace newlines, strip and lowercase ``text``.

        The step order (spaces first, then newlines) is kept as it was so the
        derived features stay identical to earlier runs.
        """
        text = re.sub(r" +", " ", text)
        text = re.sub(r"\n", " ", text)
        return text.strip().lower()

    def __combine_columns(self, row):
        """Join a row's ``title`` and ``text`` into a single document string."""
        return str(row["title"]) + ". " + str(row["text"])

    def process_readability(self, text_iterator):
        """Return one ARI score per document.

        ``np.nan`` is returned for a document that is empty after cleaning or
        for which the score cannot be computed (any exception raised while
        cleaning or scoring).
        """
        readability_ls = []
        for text in self._as_documents(text_iterator):
            try:
                # Keep ASCII only (English-language text).
                cleaned = self._normalise(re.sub(r"[^\x00-\x7F]", " ", text))
                if len(cleaned) == 0:
                    readability_ls.append(np.nan)
                else:
                    readability_ls.append(Readability(cleaned).ari().score)
            except Exception:
                # Best effort: an unscorable document must not abort the batch.
                readability_ls.append(np.nan)
        print("Complete process ari")
        return readability_ls

    def process_content_sentences(self, text_iterator):
        """Return ``(median words per sentence, number of sentences)`` lists.

        Sentences are split on ``.``, ``?`` and ``!``. A document with no
        sentences, or one that cannot be processed, contributes ``0`` to both
        lists and the triggering exception is printed.
        """
        sentence_median_length = []
        num_of_sentences = []
        for text in self._as_documents(text_iterator):
            try:
                # Keep alphanumerics and sentence terminators only.
                cleaned = self._normalise(re.sub(r"[^a-zA-Z0-9.?!]", " ", text))
                sentences = [part.strip() for part in re.split(r"[!.?]", cleaned)]
                sentences = [sentence for sentence in sentences if sentence]
                lengths = [len(sentence.split(" ")) for sentence in sentences]
                # statistics.median raises on an empty list (no sentences).
                median_length = statistics.median(lengths)
            except Exception as e:
                print(e)
                sentence_median_length.append(0)
                num_of_sentences.append(0)
                continue
            sentence_median_length.append(median_length)
            num_of_sentences.append(len(sentences))
        print("Complete process sentences")
        return sentence_median_length, num_of_sentences

    def __count_word_pos(self, doc, word_type):
        """Count ``(token, tag)`` pairs in ``doc`` whose tag is in ``word_type``."""
        return sum(1 for word in doc if word[1] in word_type)

    def process_pos(self, text_iterator):
        """Return, per document, the POS-tagged tokens with stop words removed.

        A document that cannot be cleaned or tagged yields an empty list, so
        the result can always be iterated by the tag counters.
        """
        stop_words = set(stopwords.words("english"))
        text_processed = []  # no stop words, as pos tokens
        for text in self._as_documents(text_iterator):
            try:
                # Drop punctuation, then tokenise on single spaces.
                cleaned = self._normalise(re.sub(r"[^a-zA-Z0-9]", " ", text))
                tokens = [
                    token
                    for token in cleaned.split(" ")
                    if token != "" and token not in stop_words
                ]
                text_processed.append(pos_tag(tokens))
            except Exception:
                text_processed.append([])
        return text_processed

    def predict(self, drop_label=False):
        """Run ``self.pipeline.predict`` on the loaded features.

        Rows containing NaN are dropped first. Set ``drop_label=True`` when
        ``self.data`` still carries the ``label`` column.
        """
        data_input = self.data.dropna()
        if drop_label:
            data = data_input.drop(columns="label").values
        else:
            data = data_input.values
        prediction = self.pipeline.predict(data)
        return prediction

    def score(self):
        """Return ``self.pipeline.score`` of the loaded features against ``label``.

        Rows containing NaN are dropped first.

        Raises:
            KeyError: if ``self.data`` has no ``label`` column.
        """
        print(list(self.data.columns))
        if "label" not in self.data.columns:
            raise KeyError(
                "score() needs a 'label' column; call load_content_data() first"
            )
        data_input = self.data.dropna()
        score = self.pipeline.score(
            data_input.drop(columns="label").values, data_input["label"].values
        )
        return score

    def __load_data_inner(self, dirty_merged_data):
        """Compute the sentence and POS features and write them to ``self.data``."""
        dirty_merged_data = self._as_documents(dirty_merged_data)
        medians, sentence_counts = self.process_content_sentences(dirty_merged_data)
        self.data["words_per_sentence_median"] = medians
        self.data["num_of_sentences"] = sentence_counts

        pos_tokens = self.process_pos(dirty_merged_data)
        tag_groups = {
            "adverbs": self.adverbs,
            "nouns": self.nouns,
            "adjectives": self.adjective,
            "verbs_third_person": self.verbs_third_person,
            "verbs_others": self.verbs_others,
        }
        for column, tags in tag_groups.items():
            self.data[column] = [
                self.__count_word_pos(doc, tags) for doc in pos_tokens
            ]

        # Per-sentence rates; documents without sentences get a rate of 0.
        no_sentences = self.data["num_of_sentences"] == 0
        for column in tag_groups:
            self.data[column + "_rate"] = np.where(
                no_sentences,
                0,
                self.data[column] / self.data["num_of_sentences"],
            )
        # Only the rates are kept for these three; the verb counts stay.
        self.data = self.data.drop(columns=["adverbs", "nouns", "adjectives"])
        return

    def load_content_data(self, data: pd.DataFrame) -> None:
        """Build the content features from a frame with ``title``, ``text``, ``label``.

        ``self.data`` is rebuilt from scratch (keeping ``data``'s index), so
        the call is safe to repeat and safe after ``load_content_string``.
        """
        # we take in the dataframe of untouched csv
        self.data = pd.DataFrame({"label": data["label"]})
        dirty_merged_data = data.apply(self.__combine_columns, axis=1)
        self.__load_data_inner(dirty_merged_data)
        # reorganise the table format.
        self.data = self.data.reindex(["label"] + self._FEATURE_COLUMNS, axis=1)
        print("finished load_content_data()")
        return

    def load_readability_data(self, data: pd.DataFrame):
        """Append the ``ari`` column for ``data`` to ``self.data``.

        The content features are (re)built first when none are loaded or when
        the loaded table has a different number of rows than ``data``.
        """
        if len(self.data.columns) == 0 or len(self.data) != len(data):
            print("running load_content_data first")
            self.load_content_data(data)
        dirty_merged_data = data.apply(self.__combine_columns, axis=1)
        readability_ls = self.process_readability(dirty_merged_data)
        self.data["ari"] = readability_ls
        print("finish load_readability_data")
        return

    def load_content_string(self, string: str) -> None:
        """Replace ``self.data`` with the content features of ``string``.

        ``string`` may be a single ``str`` (one row) or an iterable of
        documents (one row each).
        """
        self.data = pd.DataFrame()
        self.__load_data_inner(string)
        print("finish load_content_string")
        # reorganise the table format.
        self.data = self.data.reindex(self._FEATURE_COLUMNS, axis=1)
        return

    def load_readability_string(self, string: str) -> None:
        """Replace ``self.data`` with content features plus ``ari`` for ``string``."""
        print("running load_content_data first")
        self.load_content_string(string)
        readability_ls = self.process_readability(string)
        self.data["ari"] = readability_ls
        print("finish load_readability_string")
        return

In [195]:
predictor = Content_Predictor(pipeline)
# Work on a 1% sample of the dataset. The fixed random_state makes the sample,
# and therefore the scores reported below, reproducible across kernel restarts;
# reading and sampling in one expression keeps the cell idempotent on re-run.
data = pd.read_csv("./data/WELFake_Dataset.csv").sample(frac=0.01, random_state=42)
# Single free-text example used by the string-prediction cells further down.
test = ["The Francis Scott Key Bridge was a steel arch continuous through truss bridge that spanned the lower Patapsco River and outer Baltimore Harbor / Port carrying the Baltimore Beltway (Interstate 695 or I-695) between Hawkins Point, an isolated southern neighborhood of Baltimore, and Dundalk in Maryland, United States. The crossing between Baltimore City and Baltimore County also passed through a small portion of Anne Arundel County. The main spans and part of the northeastern approach of the bridge collapsed on March 26, 2024 after the container ship MV Dali struck one of its piers. n the 1960s, the Maryland State Roads Commission concluded a need for a second harbor crossing after the earlier Baltimore Harbor Thruway and Tunnel opened in 1957. They began planning another single-tube tunnel under the Patapsco River, further to the southeast, downstream from the Baltimore Harbor Tunnel. The proposed site was between Hawkins Point and Sollers Point in the outer harbor. Plans also were under way for a drawbridge to the south over Curtis Creek, replacing an earlier 1931 drawbridge carrying Pennington Avenue over the creek, to connect Hawkins Point to Sollers Point."]
predictor.load_content_data(data)

Complete process sentences
finished load_content_data()


In [196]:
# (rows, columns): the label plus the nine content features built by load_content_data().
print(predictor.data.shape)
# score() prints the column names, drops rows containing NaN and evaluates the
# pipeline the predictor was constructed with against the "label" column.
print(predictor.score())
predictor.data.head()

(721, 10)
['label', 'verbs_third_person', 'verbs_others', 'words_per_sentence_median', 'num_of_sentences', 'adverbs_rate', 'nouns_rate', 'adjectives_rate', 'verbs_third_person_rate', 'verbs_others_rate']
0.9431345353675451


Unnamed: 0,label,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate
17635,1,26,16,28.5,12,0.75,9.333333,4.0,2.166667,1.333333
32078,1,52,25,18.0,35,0.485714,6.2,2.571429,1.485714,0.714286
1510,1,111,67,16.5,80,0.525,3.9875,1.975,1.3875,0.8375
5272,1,47,18,23.5,22,1.0,5.772727,2.454545,2.136364,0.818182
26132,1,99,41,12.0,74,0.540541,3.72973,1.716216,1.337838,0.554054


In [197]:
# Appends the "ari" readability column for the same sampled frame; the content
# features loaded above are reused because predictor.data is already populated.
predictor.load_readability_data(data)

Complete process ari
finish load_readability_data


In [198]:
# Swap in the pipeline saved as knn_pipeline_ari.joblib -- presumably fitted on
# the content features plus "ari"; confirm against the training notebook.
# NOTE(security): joblib.load unpickles the file; only load trusted files.
predictor.pipeline = joblib.load("./data/knn_pipeline_ari.joblib")
# One more column than before: "ari" was added by load_readability_data().
print(predictor.data.shape)
print(predictor.score())
predictor.data.head()

(721, 11)
['label', 'verbs_third_person', 'verbs_others', 'words_per_sentence_median', 'num_of_sentences', 'adverbs_rate', 'nouns_rate', 'adjectives_rate', 'verbs_third_person_rate', 'verbs_others_rate', 'ari']
0.957613814756672


Unnamed: 0,label,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate,ari
17635,1,26,16,28.5,12,0.75,9.333333,4.0,2.166667,1.333333,24.406113
32078,1,52,25,18.0,35,0.485714,6.2,2.571429,1.485714,0.714286,20.58679
1510,1,111,67,16.5,80,0.525,3.9875,1.975,1.3875,0.8375,11.442837
5272,1,47,18,23.5,22,1.0,5.772727,2.454545,2.136364,0.818182,17.051905
26132,1,99,41,12.0,74,0.540541,3.72973,1.716216,1.337838,0.554054,11.77406


In [199]:
# Same pipeline file as the previous cell; reloading it here lets this cell be
# run on its own once `predictor` and `test` exist.
predictor.pipeline = joblib.load("./data/knn_pipeline_ari.joblib")
# Replaces predictor.data with the features of `test` (content features + "ari").
predictor.load_readability_string(test)
print(predictor.data.shape)
# No "label" column is present for free text, so predict() needs no drop_label.
print(predictor.predict())
predictor.data.head()

running load_content_data first
Complete process sentences
finish load_content_string
Complete process ari
finish load_readability_string
(1, 10)
[0]


Unnamed: 0,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate,ari
0,14,7,27,7,0.714286,9.0,2.857143,2.0,1.0,18.610071


In [200]:
# Switch back to the pipeline saved as knn_pipeline.joblib, which is fed the
# nine content features only (no "ari" column).
predictor.pipeline = joblib.load("./data/knn_pipeline.joblib")
# Replaces predictor.data with the content features of `test`.
predictor.load_content_string(test)
print(predictor.data.shape)
print(predictor.predict())
predictor.data.head()

Complete process sentences
finish load_content_string
(1, 9)
[0]


Unnamed: 0,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate
0,14,7,27,7,0.714286,9.0,2.857143,2.0,1.0
