# Project Clickbait Detection

### Team members: Joshua Burris, Caleb Tong

In [1]:
import math
import string
from collections import Counter
class language_model:
    """Word-level n-gram language model with add-one smoothing.

    ``probability``/``C`` only know how to score unigram, bigram and
    trigram models (``ngram`` of 1, 2 or 3).  After ``train`` the instance
    exposes:

    * ``story``          - token list of the training text
    * ``data_frequency`` - Counter of single tokens
    * ``bigram``         - Counter of space-joined token pairs (ngram > 1)
    * ``trigram``        - Counter of space-joined ngram-sized windows
                           (ngram > 2)
    * ``V``              - number of distinct tokens in the training text
    * ``total_count``    - total number of training tokens

    ``test`` additionally sets ``sparsity``.
    """

    def __init__(self, ngram=1):
        """
        Initialize a language model

        Parameters:
        ngram specifies the type of model:
        unigram (ngram = 1), bigram (ngram = 2) etc.
        """
        self.ngram = ngram

    @staticmethod
    def _ngrams(tokens, n):
        """Return every window of n consecutive tokens, joined by a space."""
        return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    def train(self, file_name):
        """Count the n-grams of the text stored in file_name.

        Every call replaces the counts of any previous call.
        """
        self.story = self.clean_text(file_name)
        if self.ngram > 1:
            # Bigram counts are also the context counts of a trigram model.
            self.bigram = Counter(self._ngrams(self.story, 2))
        if self.ngram > 2:
            self.trigram = Counter(self._ngrams(self.story, self.ngram))
        self.data_frequency = Counter(self.story)
        self.V = len(self.data_frequency)
        self.total_count = sum(self.data_frequency.values())

    def test(self, file_name):
        """Evaluate the trained model on the text stored in file_name.

        Sets ``self.sparsity`` to the fraction of the test text's n-grams
        that have no count in the trained model, and returns the perplexity
        of the test text.

        Raises ZeroDivisionError when the cleaned text holds fewer than
        ``ngram`` tokens.
        """
        text = self.clean_text(file_name)

        if self.ngram == 1:
            data = self.data_frequency
        elif self.ngram == 2:
            data = self.bigram
        elif self.ngram == 3:
            data = self.trigram
        else:
            data = {}

        grams = self._ngrams(text, self.ngram)
        # Read-only lookup: evaluating must not add zero-count entries to
        # the trained counters (they would end up in exported model files).
        non_entries = sum(1 for gram in grams if data.get(gram, 0) == 0)
        self.sparsity = non_entries / len(grams)

        return self.perplexity(text)

    def probability(self, word1, words):
        """Return the add-one smoothed probability of word1.

        words is the list of the ``ngram - 1`` preceding tokens; it is
        ignored by a unigram model.
        """
        if self.ngram == 1:
            return (self.C([word1]) + 1) / (self.total_count + self.V)
        else:
            return (self.C(words + [word1]) + 1) / (self.C(words) + self.V)

    def perplexity(self, text):
        """Return 2 raised to the entropy of the token list text."""
        return math.pow(2, self.entropy(text))

    def entropy(self, text):
        """Return the average negative log2 probability per scored token.

        The first ``ngram - 1`` tokens only serve as context and are not
        scored themselves.
        """
        exp = 0
        for i in range(self.ngram - 1, len(text)):
            prevW = text[i - self.ngram + 1 : i]
            exp += -math.log(self.probability(text[i], prevW), 2)
        return exp / (len(text) - (self.ngram - 1))

    def C(self, words):
        """Return the training count of the token sequence words.

        Sequences of one, two or three tokens are looked up in the unigram,
        bigram or trigram counter; unseen sequences count as 0.  Any other
        length returns None.  The counters are never modified.
        """
        size = len(words)
        key = ' '.join(words)
        if size == 1:
            return self.data_frequency.get(key, 0)
        if size == 2:
            return self.bigram.get(key, 0)
        if size == 3:
            return self.trigram.get(key, 0)

        return None

    def clean_text(self, file_name):
        """Read file_name and return its text as a flat list of tokens.

        The text is lower-cased; '?', ':' and '!' become '.', '-' becomes a
        space, every other punctuation character except '.' is removed, and
        a blank line ('\\n\\n') becomes '.'.  The text is then split on '.'
        and each piece is emitted as '<s>', its whitespace-separated words,
        '</s>'.  A piece without words still yields the two markers.
        """
        with open(file_name, 'r') as f:
            text = f.read()

        text = text.lower()
        text = text.translate(str.maketrans("?:!-", "... "))
        text = text.translate(
            str.maketrans('', '', string.punctuation.replace('.', '')))
        text = text.replace('\n\n', '.')

        result = []
        for sentence in text.split('.'):
            # The end marker carries no leading space, so it is the same
            # "</s>" token the rest of the pipeline looks for.
            result += ['<s>'] + sentence.split() + ['</s>']
        return result

In [2]:
def language_m(textFiles):
    """Train and score unigram, bigram and trigram models on textFiles.

    For each model order 1..3 and each file in textFiles, the model is
    trained on that file alone and its perplexity on every file in
    textFiles is printed.  For orders 2 and 3 the bigram counts of the
    freshly trained model are then written to 'model<order>.txt', one
    'count<TAB>first<TAB>second' line per bigram; the file is rewritten
    after every training file.
    """
    for order in range(1, 4):
        print('\n\nNgram:', order)
        lm = language_model(order)
        for train_file in textFiles:
            lm.train(train_file)
            print('\nTrain:', train_file)
            for test_file in textFiles:
                score = lm.test(test_file)
                print('Perplexity:', score, '\t(on Test:' + test_file + ')')
            if order == 1:
                # Unigram models are not exported.
                continue
            # NOTE: bigram counts are exported for order 3 as well, so
            # 'model3.txt' holds bigrams, not trigrams.
            export_path = 'model' + str(order) + '.txt'
            with open(export_path, encoding='utf-8', mode='w') as out:
                for pair, freq in lm.bigram.items():
                    tabbed = pair.replace(' ', '\t')
                    out.write('{}\t{}\n'.format(freq, tabbed))

In [3]:
import pickle, os, json
from collections import OrderedDict

def getProb(fileName):
    """Build bigram and unigram probability tables from a bigram count file.

    fileName is read as ISO-8859-1 text.  Each usable line has the form
    'count<TAB>first<TAB>second'.  Empty fields are dropped, and lines that
    are left with fewer than three fields (for example blank lines) are
    skipped.

    Files written to the current directory:

    * 'bigram_json.txt'              - JSON list of the parsed lines,
                                       e.g. [["31", "<s>", "should"], ...]
    * 'unigramDict_json.txt'         - JSON dict: first word -> summed count
    * 'bigramsList.p'                - pickle of the parsed lines
    * 'unigramsDict.p'               - pickle of first word -> summed count
    * 'conditionalProbabilityDict.p' - pickle of 'first second' ->
                                       count / summed count of first
    * 'unigramProbDict.p'            - pickle of first word ->
                                       summed count / total of all counts

    Returns None.
    """
    corpusPath = ""
    conditionalProbabilityFile = "conditionalProbabilityDict.p"
    bigramsListPath = "bigramsList.p"
    unigramProbFile = "unigramProbDict.p"
    unigramsDictPath = "unigramsDict.p"

    with open(corpusPath+fileName, encoding = "ISO-8859-1") as f:
        lines = f.readlines()

    bigramsList = [] # parsed lines: [["24", "hello", "world"], ...]
    unigramsDict = OrderedDict() # key : first word W(i-1), value : count

    for line in lines:
        # Strip the line ending, split on tabs and drop every empty field
        # (a token written with a leading space leaves one behind).
        fields = [field
                  for field in line.replace('\r','').replace('\n','').split('\t')
                  if field != '']
        if len(fields) < 3:
            # Blank or truncated line: it carries no bigram.
            continue
        bigramsList.append(fields)
        # Accumulate the count of the bigram's first word.
        unigramsDict[fields[1]] = unigramsDict.get(fields[1], 0) + int(fields[0])

    with open('bigram_json.txt', 'w') as outfile:
        json.dump(bigramsList, outfile)
    with open('unigramDict_json.txt', 'w') as outfile:
        json.dump(unigramsDict, outfile)

    # Conditional probability of each bigram given its first word.
    conditionalProbabilityDict = OrderedDict() # key:bigram , value:probability
    for bigram in bigramsList:
        firstWord = bigram[1]
        secondWord = bigram[2]
        count = int(bigram[0])
        cProb = count*1.0 / unigramsDict[firstWord] if unigramsDict[firstWord] > 0 else 0
        conditionalProbabilityDict[firstWord+" "+secondWord] = cProb

    # Relative frequency of each first word; 0 when every count is 0,
    # mirroring the guard used for the conditional probabilities.
    unigramProbDict = OrderedDict()
    unigram_total = sum(unigramsDict.values())
    for unigram, count in unigramsDict.items():
        unigramProbDict[unigram] = count/unigram_total if unigram_total else 0

    with open(conditionalProbabilityFile,"wb") as file:
        pickle.dump(conditionalProbabilityDict,file)

    with open(bigramsListPath,"wb") as file:
        pickle.dump(bigramsList,file)

    with open(unigramProbFile, "wb") as file:
        pickle.dump(unigramProbDict, file)

    with open(unigramsDictPath, "wb") as file:
        pickle.dump(unigramsDict, file)
    

In [4]:
import pickle
from collections import OrderedDict
import heapq # for getting top 5

def getTopBigram():
    """Interactively score sentences with the pickled bigram tables.

    Loads the four pickle files written by getProb, then repeatedly prompts
    for a sentence.  The sentence is lower-cased, split on whitespace and
    wrapped in '<s>' ... '</s>'; its score is the unigram probability of
    '<s>' multiplied by the conditional probability of every consecutive
    word pair.  The loop ends when the user enters 'stop', and also after
    the first sentence containing a word pair that is missing from the
    table (each missing pair is printed; no smoothing is applied).
    """
    # NOTE: pickle can execute arbitrary code while loading; these files
    # must only ever be the ones produced locally by getProb.
    with open("conditionalProbabilityDict.p", "rb") as handle:
        pair_probs = pickle.load(handle)

    # The next two tables are loaded but not consulted below.
    with open("bigramsList.p", "rb") as handle:
        bigramsList = pickle.load(handle)

    with open("unigramProbDict.p", "rb") as handle:
        word_probs = pickle.load(handle)

    with open("unigramsDict.p", "rb") as handle:
        unigramsDict = pickle.load(handle)

    prompt = "Enter a sentence to predict whether it is clickbait or not ('stop' for stopping) : \n"
    while True:
        sentence = input(prompt)
        if sentence == "stop":
            break

        tokens = ["<s>"] + sentence.lower().split() + ["</s>"]
        score = word_probs["<s>"]

        # Pair every token with its successor; an end marker never starts
        # a pair.
        pairs = [left + " " + right
                 for left, right in zip(tokens, tokens[1:])
                 if left != "</s>"]

        unseen_pair = False
        for pair in pairs:
            if pair not in pair_probs:
                print(pair)
                print("TODO: needs smoothing")
                unseen_pair = True
                continue
            score *= pair_probs[pair]

        if unseen_pair:
            break

        print("\n\nProbability of:\n" + "\t" + sentence + "\nbeing clickbait is:", score)
        print("\n" + "____________________" + "\n")

In [5]:
# Driver cell: train on the clickbait corpus, build the probability tables
# from the last exported model file, then start the interactive prompt.
clickbait_dataFiles = ["train/clickbait_data1"]
# Defined for completeness; none of the calls below use this list.
non_clickbait_dataFiles = ["non_clickbait_data" + str(part) for part in range(1, 6)]
fileName = ["model1.txt", "model3.txt"]

language_m(clickbait_dataFiles)
getProb(fileName[-1])  # i.e. "model3.txt"
getTopBigram()



Ngram: 1

Train: train/clickbait_data1
Perplexity: 584.5619719069464 	(on Test:train/clickbait_data1)


Ngram: 2

Train: train/clickbait_data1
Perplexity: 702.3791240615541 	(on Test:train/clickbait_data1)


Ngram: 3

Train: train/clickbait_data1
Perplexity: 2156.956516117808 	(on Test:train/clickbait_data1)
Enter a sentence to predict whether it is clickbait or not ('stop' for stopping) : 
If Disney Princesses Were From Florida


Probability of:
	If Disney Princesses Were From Florida
being clickbait is: 4.5151277394698045e-13

____________________

Enter a sentence to predict whether it is clickbait or not ('stop' for stopping) : 
Who Is Your Celebrity Ex Based On Your Zodiac


Probability of:
	Who Is Your Celebrity Ex Based On Your Zodiac
being clickbait is: 1.022632013180572e-13

____________________

Enter a sentence to predict whether it is clickbait or not ('stop' for stopping) : 
How To Be A Genderqueer Feminist
a genderqueer
TODO: needs smoothing
genderqueer feminist
TODO: n