In [39]:
import joblib
import pandas as pd
import re
import numpy as np
import statistics
from readability import Readability
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

import pickle

pipeline = joblib.load("./data/knn_pipeline.joblib")

# features to be used 
# verbs_third_person, verbs_others, words_per_sentence_median, num_of_sentences, adverbs_rate, nouns_rate, 
# adjectives_rate, verbs_third_person_rate, verbs_others_rate, automatic_readability 

In [40]:
# convert string into data point
# run prediction
class Content_Predictor:
    def __init__(self, pipeline) -> None:
        self.pipeline: Pipeline = pipeline
        self.adverbs = ["RB", "RBR", "RBS", "WRB"]
        self.nouns = ["NN", "NNS"]
        self.proper_nouns = ["NNP", "NNPS"]
        self.adjective = ["JJ", "JJR", "JJS"]
        self.conjunctions = ["CC"]
        self.verbs_third_person = ["VB", "VBD", "VBG", "VBN"]
        self.verbs_others = ["VBP", "VBZ"]
        self.data = pd.DataFrame()

    def __combine_columns(self, row):
        return str(row["title"]) + ". " + str(row["text"])

    def process_readability(self, text_iterator):
        readability_ls = []
        for text in text_iterator:
            try:
                # remove non ascii (language english only)
                t = re.sub(r"[^\x00-\x7F]", " ", text)
                # remove multiple spaces
                t = re.sub(r" +", " ", t)
                # remove newline
                t = re.sub(r"\n", " ", t)
                # clear trailing whitespaces
                t = t.strip()
                # lowercase
                t = t.lower()
                if len(t) == 0:
                    readability_ls.append(np.NaN)
                else:
                    r = Readability(t)
                    readability_ls.append(r.ari().score)
            except:
                readability_ls.append(np.NaN)
                # print(text)
                continue
        print("Complete process ari")
        return readability_ls

    def process_content_sentences(self, text_iterator):
        sentence_median_length = []
        num_of_sentences = []
        for text in text_iterator:
            try:
                # remove some supers
                t = re.sub("[^a-zA-Z0-9\.\?\!]", " ", text)
                # remove multiple spaces
                t = re.sub(r" +", " ", t)
                # remove newline
                t = re.sub(r"\n", " ", t)
                # clear trailing whitespaces
                t = t.strip()
                # lowercase
                t = t.lower()
                # tokenise sentences
                t = re.split("\!|\.|\?", t)
                # strip sentence
                t = [item.strip() for item in t]
                # drop empty string
                t = list(filter(lambda x: x != "", t))
                # tokenise words
                sentences_tokenised = []
                sentences_lengths = []
                for sentence in t:
                    # tokenise
                    s = sentence.split(" ")
                    sentences_lengths.append(len(s))
                    sentences_tokenised.append(s)
                # sentence features
                sentence_median_length.append(statistics.median(sentences_lengths))
                num_of_sentences.append(len(sentences_tokenised))
                # print(sentences_tokenised)
            except Exception as e:
                print(e)
                sentence_median_length.append(0)
                num_of_sentences.append(0)
                continue
        print("Complete process sentences")
        return sentence_median_length, num_of_sentences

    def __count_word_pos(self, doc, word_type):
        count = 0
        for word in doc:
            if word[1] in word_type:
                count += 1
        return count

    def process_pos(self, text_iterator):
        stop_words = set(stopwords.words("english"))
        text_processed = []  # no stop words, as pos tokens
        for text in text_iterator:
            try:
                # remove punctuation
                t = re.sub("[^a-zA-Z0-9]", " ", text)
                # remove multiple spaces
                t = re.sub(r" +", " ", t)
                # remove newline
                t = re.sub(r"\n", " ", t)
                # clear trailing whitespaces
                t = t.strip()
                # lowercase
                t = t.lower()
                # tokenise
                t = t.split(" ")
                # drop empty string
                t = list(filter(lambda x: x != "", t))
                no_stop_words = []
                for token in t:
                    if token not in stop_words:
                        no_stop_words.append(token)
                try:
                    text_processed.append(pos_tag(no_stop_words))
                except:
                    text_processed.append([])
            except:
                text_processed.append(None)
                continue
        return text_processed

    def predict(self):
        data_input = self.data.dropna()
        prediction = self.pipeline.predict(data_input.drop(columns="label").values)
        return prediction

    def score(self):
        data_input = self.data.dropna()
        score = self.pipeline.score(
            data_input.drop(columns="label").values, data_input["label"].values
        )
        return score

    def load_data(self, data: pd.DataFrame) -> None:
        # we take in the dataframe of untouched csv
        # df = pd.read_csv("./data/WELFake_Dataset.csv")
        self.data["label"] = data["label"]
        dirty_merged_data = data.apply(self.__combine_columns, axis=1)
        self.data["words_per_sentence_median"], self.data["num_of_sentences"] = (
            self.process_content_sentences(dirty_merged_data)
        )
        pos_tokens = self.process_pos(dirty_merged_data)
        self.data["adverbs"] = [
            self.__count_word_pos(doc, self.adverbs) for doc in pos_tokens
        ]
        self.data["nouns"] = [
            self.__count_word_pos(doc, self.nouns) for doc in pos_tokens
        ]
        self.data["adjectives"] = [
            self.__count_word_pos(doc, self.adjective) for doc in pos_tokens
        ]
        self.data["verbs_third_person"] = [
            self.__count_word_pos(doc, self.verbs_third_person) for doc in pos_tokens
        ]
        self.data["verbs_others"] = [
            self.__count_word_pos(doc, self.verbs_others) for doc in pos_tokens
        ]

        self.data["adverbs_rate"] = np.where(
            self.data["num_of_sentences"] == 0,
            0,
            self.data["adverbs"] / self.data["num_of_sentences"],
        )
        self.data["nouns_rate"] = np.where(
            self.data["num_of_sentences"] == 0,
            0,
            self.data["nouns"] / self.data["num_of_sentences"],
        )
        self.data["adjectives_rate"] = np.where(
            self.data["num_of_sentences"] == 0,
            0,
            self.data["adjectives"] / self.data["num_of_sentences"],
        )
        self.data["verbs_third_person_rate"] = np.where(
            self.data["num_of_sentences"] == 0,
            0,
            self.data["verbs_third_person"] / self.data["num_of_sentences"],
        )
        self.data["verbs_others_rate"] = np.where(
            self.data["num_of_sentences"] == 0,
            0,
            self.data["verbs_others"] / self.data["num_of_sentences"],
        )
        print(self.data.shape)
        self.data = self.data.drop(columns=["adverbs", "nouns", "adjectives"])
        print(self.data.shape)

        # reorganise the table format.
        self.data = self.data.reindex(
            [
                "label",
                "verbs_third_person",
                "verbs_others",
                "words_per_sentence_median",
                "num_of_sentences",
                "adverbs_rate",
                "nouns_rate",
                "adjectives_rate",
                "verbs_third_person_rate",
                "verbs_others_rate",
            ],
            axis=1,
        )

        return

    def load_readability_data(self, data: pd.DataFrame):
        dirty_merged_data = data.apply(self.__combine_columns, axis=1)
        readability_ls = self.process_readability(dirty_merged_data)
        self.data["ari"] = readability_ls
        return 

In [41]:
predictor = Content_Predictor(pipeline)
data = pd.read_csv("./data/WELFake_Dataset.csv")
data = data.sample(frac=0.1)
# test = ["William Shakespeare was born on April 23, 1564, in Stratford-upon-Avon. The son of John Shakespeare and Mary Arden, he was probably educated at the King Edward VI Grammar School in Stratford, where he learned Latin and a little Greek and read the Roman dramatists. At eighteen, he married Anne Hathaway, a woman seven or eight years his senior. Together, they raised two daughters: Susanna, who was born in 1583, and Judith (whose twin brother died in boyhood), born in 1585. Little is known about Shakespeare’s activities between 1585 and 1592. Robert Greene’s A Groatsworth of Wit alludes to him as an actor and playwright. Shakespeare may have taught at school during this period, but it seems more probable that shortly after 1585 he went to London to begin his apprenticeship as an actor. Due to the plague, the London theaters were often closed between June 1592 and April 1594. During that period, Shakespeare probably had some income from his patron, Henry Wriothesley, earl of Southampton, to whom he dedicated his first two poems, Venus and Adonis (1593) and The Rape of Lucrece (1594). The former was a long narrative poem depicting the rejection of Venus by Adonis, his death, and the consequent disappearance of beauty from the world. Despite conservative objections to the poem’s glorification of sensuality, it was immensely popular and was reprinted six times during the nine years following its publication."]
# test = ["this is a test. um oops! not really? haha haha!","thats the rub. whats a rub?? rubber# duck duck!"]
# predictor.process_content_sentences(test)
# print(predictor.process_pos(test))
# predictor.process_readability(test)
predictor.load_data(data)

no median for empty data
no median for empty data
Complete process sentences
(7213, 13)
(7213, 10)


In [46]:
print(predictor.data.shape)
print(predictor.score())
predictor.data.head()

(7213, 10)
0.9606266463330099


Unnamed: 0,label,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate
64586,0,7,7,15.0,9,0.111111,3.666667,1.777778,0.777778,0.777778
22782,0,75,18,18.0,31,0.677419,5.548387,2.096774,2.419355,0.580645
56589,0,119,74,19.0,68,0.558824,5.911765,2.602941,1.75,1.088235
20171,1,6,5,14.0,7,0.285714,3.428571,2.142857,0.857143,0.714286
21010,1,55,19,15.5,38,0.657895,4.815789,1.473684,1.447368,0.5


In [42]:
predictor.load_readability_data(data)

Complete process ari


In [43]:
predictor.pipeline = joblib.load("./data/knn_pipeline_ari.joblib")
print(predictor.data.shape)
print(predictor.score())
predictor.data.head()

(7213, 11)
0.961556347853046


Unnamed: 0,label,verbs_third_person,verbs_others,words_per_sentence_median,num_of_sentences,adverbs_rate,nouns_rate,adjectives_rate,verbs_third_person_rate,verbs_others_rate,ari
64586,0,7,7,15.0,9,0.111111,3.666667,1.777778,0.777778,0.777778,8.595105
22782,0,75,18,18.0,31,0.677419,5.548387,2.096774,2.419355,0.580645,14.083138
56589,0,119,74,19.0,68,0.558824,5.911765,2.602941,1.75,1.088235,14.084243
20171,1,6,5,14.0,7,0.285714,3.428571,2.142857,0.857143,0.714286,8.897773
21010,1,55,19,15.5,38,0.657895,4.815789,1.473684,1.447368,0.5,17.007022
