# Project Clickbait Detection

### Team members: Joshua Burris, Caleb Tong

## Discription



In [1]:
import math
import string
from collections import Counter
class language_model:
    def __init__(self, ngram=1) :
        """
        Initialize a language model
        
        Parameters:
        ngram specifies the type of model:  
        unigram (ngram = 1), bigram (ngram = 2) etc.
        """
        self.ngram = ngram
        
    def train(self, file_name) :
        self.story = self.clean_text(file_name)
        if self.ngram > 1:
            self.bigram = []
            for i in range(len(self.story) - 1):
                self.bigram.append(self.story[i] + ' ' + self.story[i+1])
            self.bigram = Counter(self.bigram)
        if self.ngram > 2:
            self.trigram = []
            for i in range(len(self.story) - self.ngram + 1):
                temp = self.story[i]
                for j in range(1, self.ngram):
                    temp += ' ' + self.story[i+j]
                self.trigram.append(temp)
            self.trigram = Counter(self.trigram)
        self.data_frequency = Counter(self.story)
        self.V = len(self.data_frequency)
        self.total_count = sum(self.data_frequency.values())
        #print(self.total_count, self.V, self.data_frequency)
    
    def test(self, file_name) :
        text = self.clean_text(file_name)
        
        non_entries, entries = 0, 0
        for i in range(len(text) - self.ngram + 1):
            temp = text[i]
            for j in range(1, self.ngram):
                temp += ' ' + text[i+j]
            data = {}
            if self.ngram == 1:
                data = self.data_frequency
            elif self.ngram == 2:
                data = self.bigram
            elif self.ngram == 3:
                data = self.trigram
            if data.setdefault(temp, 0) == 0:
                non_entries += 1
            entries += 1
        
        self.sparsity = non_entries / entries if entries != 0 else 0
        
        return self.perplexity(text)
    
    def probability(self, word1, words):
        if self.ngram == 1:
            return (self.C([word1]) + 1) / (self.total_count + self.V)
        else:
            return (self.C(words + [word1]) + 1) / (self.C(words) + self.V)
    
    def perplexity(self, text):
        return math.pow(2, self.entropy(text))
    
    def entropy(self, text):
        exp = 0
        for i in range(self.ngram - 1, len(text)) :
            prevW = text[i - self.ngram + 1 : i]
            exp += -math.log(self.probability(text[i], prevW), 2)     
        return exp / (len(text) - (self.ngram - 1)) if (len(text) - (self.ngram - 1)) != 0 else 0
    
    def C(self, words):
        size = len(words)
        words = ' '.join(words)
        if size == 1: return self.data_frequency.setdefault(words, 0)
        if size == 2: return self.bigram.setdefault(words, 0)
        if size == 3: return self.trigram.setdefault(words, 0)
        
        return None
    
    def clean_text(self, file_name):
        result = []
        if file_name[-4:] == '.txt':
            with open(file_name, 'r') as f:
                text = f.read()
                text = text.lower()
                result = []
                trantab = str.maketrans("?:!-", "... ")
                text = text.translate(trantab)
                trantab = str.maketrans('', '', string.punctuation.replace('.', ''))
                text = text.translate(trantab)
                text = text.replace('\n\n', '.')
                tokens = text.split('.')
                for token in tokens:
                    result += ['<s>'] + token.split() + [' </s>']
        else:
            result = ["<s>"] + file_name.lower().split() + ["</s>"]
        return result

In [2]:
def language_m(textFiles):
    model = language_model(3)
    model.train(textFiles)
    print('Train:', textFiles)
    print('Perplexity:', model.test(textFiles), '\t(on Test:' + textFiles + ')')
    #print(model.story)
    return model

In [3]:
def check(sentence, origin):
    return sentence in origin

In [4]:
import random
def runLM(dataFiles):
    print("\n<TRAIN>\n")
    cLM = language_m(dataFiles[0])
    ncLM = language_m(dataFiles[1])
    print("\n</TRAIN>\n")
    with open(dataFiles[2], 'r') as file:
        c_titles = file.read().split('\n\n')
    with open(dataFiles[3], 'r') as file:
        nc_titles = file.read().split('\n\n')
    
    titles = c_titles + nc_titles
    random.shuffle(titles)
    length = len(titles)
    i, num = 0, 0
    print("\n<TEST>\n")
    while i < 30:
        index = random.randint(0, length-1)
        sentence = titles[index]
        c_perp = cLM.test(sentence)
        nc_perp = ncLM.test(sentence)
        #print(c_perp, nc_perp, (c_perp - 9900)<150, (nc_perp - 15500)<0)
        
        
        
        ##TODO:Find the perplexity cutting off point substitute 3000 with that value or just change the condition
        if c_perp - 9900 < 150 and nc_perp - 15500 < 0:
            print("Evaluation: CLICKBAIT, Check: " + str(check(sentence, c_titles)) + 
                  "\n\t\tTitle: \"" + sentence + "\"" )#+ " Perp: " + str(abs((nc_perp-c_perp) - nc_perp//2)))
            if check(sentence, c_titles):
                num+=1
        else:
            print("Evaluation: NOT CLICKBAIT, Check: " + str(check(sentence, nc_titles)) + 
                  "\n\t\tTitle: \"" + sentence + "\"" )#+ " Perp: " + str(abs((nc_perp-c_perp) - nc_perp//2)))
            if check(sentence, c_titles):
                num+=1
        i += 1
    print("Accuracy:", num/30*100, "%")
    print("\n</TEST>\n")
    while True:
        sentence = input("Enter a sentence to predict whether it is clickbait or not ('stop' for stopping) : \n")

        if sentence == "stop":
            break;
        c_perp = cLM.test(sentence)
        nc_perp = ncLM.test(sentence)
        ##
        if c_perp - 9900 < 150 and nc_perp - 15500 < 0:
            print("\nEvaluation for title:\n\"" + sentence + "\"-> CLICKBAIT\n")
        else:
            print("\nEvaluation for title:\n\"" + sentence + "\"-> NOT CLICKBAIT\n")
        print("\n" + "____________________" + "\n")

In [5]:
dataFiles = ["train/clickbait_data1.txt", "train/non_clickbait_data1.txt", "test/clickbait_data2.txt", "test/non_clickbait_data2.txt"]

runLM(dataFiles)


<TRAIN>

Train: train/clickbait_data1.txt
Perplexity: 2156.956516117808 	(on Test:train/clickbait_data1.txt)
Train: train/non_clickbait_data1.txt
Perplexity: 4913.472183920156 	(on Test:train/non_clickbait_data1.txt)

</TRAIN>


<TEST>

Evaluation: CLICKBAIT, Check: True
		Title: "Do You Know The Colours Missing From These Flags"
Evaluation: CLICKBAIT, Check: True
		Title: "This Is What Other Countries Truly Think About Halloween In The U.S"
Evaluation: NOT CLICKBAIT, Check: False
		Title: "Radiohead Release Rejected Bond Theme Song And It's Haunting"
Evaluation: NOT CLICKBAIT, Check: True
		Title: "World's smallest car enters Ripley's Believe it or Not museum"
Evaluation: NOT CLICKBAIT, Check: False
		Title: "15 Harper Lee-Inspired Tattoos Because We'll Never Forget"
Evaluation: CLICKBAIT, Check: True
		Title: "How To Make Friends According To Science"
Evaluation: CLICKBAIT, Check: True
		Title: "People Are Outraged Over This Mannequin With Impossibly Thin Legs"
Evaluation: CLICKBAIT

## Results

As seen in the above tests, the accuracy for the test on 30 titles is that are not trained before, is roughly about 73.33%. This is relatively good. The test titles below are handtyped titles that are harder to decipher if it is clickbait or not. For all intense and purposes, the program is able to get almost half right. 

The handtyped titles actual results are:

| Title | Evaluation | Actual |
|-|-|-|
| Brexit Is Going to Get Done. But on Whose Terms? | CLICKBAIT | NOT CLICKBAIT |
| The Yoga Master Who Might Be Chelsea’s Secret Weapon | NOT CLICKBAIT | NOT CLICKBAIT |
| These Stand-Ups Are in an Escapist Mood, Much Like Their Audience | NOT CLICKBAIT | NOT CLICKBAIT |
| Would You Let a Robot Take Care of Your Mother? | CLICKBAIT | NOT CLICKBAIT |
| See the $1.5million Kickstarter - only 1 day left | NOT CLICKBAIT | CLICKBAIT |
| The LAST Shopify Fulfillment Solution You'll Ever Need | CLICKBAIT | CLICKBAIT |
| Live on Kickstarter: Brew World-Class Coffee at a Push of a Button | NOT CLICKBAIT | CLICKBAIT |
| What is YOUR home worth? | CLICKBAIT | CLICKBAIT |
| Snoring and sleep apnea can be fatal | NOT CLICKBAIT | CLICKBAIT |

As a result, it is not the best. But some of the titles 