# Project Clickbait Detection

### Team members: Joshua Burris, Caleb Tong

In [1]:
import math
import string
from collections import Counter
class language_model:
    """Add-one smoothed n-gram language model trained from a text file.

    Tokens are lower-cased words; every sentence is wrapped in ``<s>`` and
    ``</s>`` markers (see ``clean_text``).  N-grams are stored as
    space-joined strings in ``collections.Counter`` objects.

    Attributes set by ``train``:
        story: token list of the training file.
        data_frequency: unigram counts.
        bigram: bigram counts (only when ``ngram > 1``).
        trigram: trigram counts (only when ``ngram > 2``).
        ngram_counts: dict mapping each order ``1..ngram`` to its Counter.
        V: number of distinct unigrams seen in training.
        total_count: total number of training tokens.

    Attribute set by ``test``:
        sparsity: fraction of the test n-grams with a zero training count.
    """

    def __init__(self, ngram=1):
        """
        Initialize a language model

        Parameters:
        ngram specifies the type of model:
        unigram (ngram = 1), bigram (ngram = 2) etc.
        """
        self.ngram = ngram

    @staticmethod
    def _ngrams(tokens, order):
        """Return every run of ``order`` consecutive tokens, space-joined."""
        return [
            ' '.join(tokens[start:start + order])
            for start in range(len(tokens) - order + 1)
        ]

    def train(self, file_name):
        """Count the n-grams of every order up to ``self.ngram`` in a file.

        Parameters:
        file_name path of the training text, read through ``clean_text``.
        """
        self.story = self.clean_text(file_name)
        self.data_frequency = Counter(self.story)

        # One table per order: probability() needs both the full n-gram
        # and its (n-1)-gram history, whatever the configured order is.
        self.ngram_counts = {1: self.data_frequency}
        for order in range(2, self.ngram + 1):
            self.ngram_counts[order] = Counter(self._ngrams(self.story, order))

        # Public aliases kept for callers that read these attributes.
        if self.ngram > 1:
            self.bigram = self.ngram_counts[2]
        if self.ngram > 2:
            self.trigram = self.ngram_counts[3]

        self.V = len(self.data_frequency)
        self.total_count = sum(self.data_frequency.values())

    def test(self, file_name):
        """Score a file against the trained model.

        Stores the share of unseen n-grams in ``self.sparsity`` and
        returns the perplexity of the file.  The trained counts are only
        read, never modified.

        Parameters:
        file_name path of the text to evaluate.
        """
        text = self.clean_text(file_name)

        known = self.ngram_counts[self.ngram]
        grams = self._ngrams(text, self.ngram)
        # Counter indexing yields 0 for a missing key without inserting it.
        unseen = sum(1 for gram in grams if known[gram] == 0)
        self.sparsity = unseen / len(grams) if grams else 0.0

        return self.perplexity(text)

    def probability(self, word1, words):
        """Return the add-one smoothed probability of ``word1``.

        Parameters:
        word1 the token being predicted.
        words list of the ``ngram - 1`` preceding tokens (ignored for a
              unigram model).
        """
        if self.ngram == 1:
            return (self.C([word1]) + 1) / (self.total_count + self.V)
        return (self.C(words + [word1]) + 1) / (self.C(words) + self.V)

    def perplexity(self, text):
        """Return 2 raised to the per-token entropy of ``text``."""
        return math.pow(2, self.entropy(text))

    def entropy(self, text):
        """Return the average negative log2 probability per scored token.

        The first ``ngram - 1`` tokens only serve as history and are not
        scored.

        Raises:
        ValueError if ``text`` has no token with a complete history.
        """
        scored = len(text) - (self.ngram - 1)
        if scored <= 0:
            raise ValueError(
                'text is too short to score with a %d-gram model' % self.ngram
            )
        total = 0
        for i in range(self.ngram - 1, len(text)):
            history = text[i - self.ngram + 1:i]
            total += -math.log(self.probability(text[i], history), 2)
        return total / scored

    def C(self, words):
        """Return the training count of the n-gram given as a token list.

        Returns 0 for an unseen n-gram and None when no table of that
        order was built by ``train``.  Lookups never add entries.
        """
        table = self.ngram_counts.get(len(words))
        if table is None:
            return None
        return table[' '.join(words)]

    def clean_text(self, file_name):
        """Read a file and return its tokens with sentence markers.

        The text is lower-cased; ``?``, ``:`` and ``!`` become sentence
        breaks, ``-`` becomes a space, all other punctuation except ``.``
        is removed, and a blank line also ends a sentence.  Each piece
        between breaks is emitted as ``<s>``, its words, ``</s>``.
        """
        with open(file_name, 'r') as f:
            text = f.read()
        text = text.lower()
        text = text.translate(str.maketrans("?:!-", "... "))
        keep_periods = string.punctuation.replace('.', '')
        text = text.translate(str.maketrans('', '', keep_periods))
        text = text.replace('\n\n', '.')

        result = []
        for sentence in text.split('.'):
            # The end marker must not carry a stray space: n-gram keys are
            # space-joined and later split on that separator.
            result += ['<s>'] + sentence.split() + ['</s>']
        return result

In [2]:
def language_m(textFiles):
    """Train and evaluate unigram, bigram and trigram models on a file set.

    For each order 1..3 one model object is created.  It is retrained on
    every file in ``textFiles`` in turn, and after each training its
    perplexity on every file in ``textFiles`` is printed.

    For orders above 1 the model's bigram counts are then written to
    ``model<order>.txt`` as ``count<TAB>word<TAB>word`` rows.  The file is
    rewritten after each training file, so it reflects the last one.
    """
    for order in (1, 2, 3):
        print('\n\nNgram:', order)
        model = language_model(order)
        for train_file in textFiles:
            model.train(train_file)
            print('\nTrain:', train_file)
            for test_file in textFiles:
                score = model.test(test_file)
                print('Perplexity:', score, '\t(on Test:' + test_file + ')')

            if order <= 1:
                # A unigram model has no bigram table to export.
                continue

            # Note: the bigram table is exported for the trigram model too.
            outfile = 'model' + str(order) + '.txt'
            with open(outfile, encoding='utf-8', mode='w') as fp:
                for tag, count in model.bigram.items():
                    row = tag.replace(' ', '\t')
                    fp.write('{}\t{}\n'.format(count, row))

In [3]:
import pickle, os, json
from collections import OrderedDict

def getProb(fileName):
    """Derive bigram conditional probabilities from a bigram count file.

    Each line of ``fileName`` is expected to hold tab-separated fields
    ``count<TAB>first word<TAB>second word``.  Lines with fewer than three
    fields (for example blank lines) are skipped.

    For every bigram the function computes
    ``count / (sum of counts of all bigrams sharing the first word)``.

    Files written to the current directory:
        bigram_json.txt: the parsed rows as JSON.
        unigramDict_json.txt: first-word totals as JSON.
        conditionalProbabilityDict.p: pickled OrderedDict mapping
            ``"first second"`` to its conditional probability.
        bigramsList.p: pickled list of the parsed rows.

    Parameters:
        fileName: name of the count file, opened as ISO-8859-1.
    """
    corpusPath = ""
    conditionalProbabilityFile = "conditionalProbabilityDict.p"
    bigramsListPath = "bigramsList.p"

    bigramsList = []  # parsed rows, e.g. [["24", "hello", "world"], ...]
    unigramsDict = OrderedDict()  # first word -> summed bigram count

    with open(corpusPath + fileName, encoding="ISO-8859-1") as f:
        for line in f:
            # Strip the line terminator, then split the row on tabs.
            fields = line.replace('\r', '').replace('\n', '').split('\t')
            if len(fields) < 3:
                # Not a complete count/word/word row; nothing to record.
                continue
            bigramsList.append(fields)
            first = fields[1]
            unigramsDict[first] = unigramsDict.get(first, 0) + int(fields[0])

    with open('bigram_json.txt', 'w') as outfile:
        json.dump(bigramsList, outfile)
    with open('unigramDict_json.txt', 'w') as outfile:
        json.dump(unigramsDict, outfile)

    conditionalProbabilityDict = OrderedDict()  # "w1 w2" -> P(w2 | w1)
    for bigram in bigramsList:
        firstWord = bigram[1]
        secondWord = bigram[2]
        count = int(bigram[0])
        total = unigramsDict[firstWord]
        # Guard against a first word whose rows all carry a zero count.
        cProb = count * 1.0 / total if total > 0 else 0
        conditionalProbabilityDict[firstWord + " " + secondWord] = cProb

    # Context managers make sure both pickles are flushed and closed
    # before anything else tries to read them.
    with open(conditionalProbabilityFile, "wb") as file:
        pickle.dump(conditionalProbabilityDict, file)

    with open(bigramsListPath, "wb") as file:
        pickle.dump(bigramsList, file)

In [4]:
import pickle
from collections import OrderedDict
import heapq # for getting top 5

def getTopBigram():
    """Interactively print the five most probable bigrams for a word.

    Loads ``conditionalProbabilityDict.p`` (mapping ``"first second"`` to
    a probability) and ``bigramsList.p`` (rows whose fields 1 and 2 are
    the two words) from the current directory.  It then repeatedly
    prompts for a word and prints up to five bigrams starting with that
    word, highest probability first.  Typing ``stop`` or reaching the end
    of input ends the loop.
    """
    conditionalProbabilityFile = "conditionalProbabilityDict.p"
    bigramsListPath = "bigramsList.p"

    # NOTE: unpickling can execute arbitrary code; these files must come
    # from a trusted source.
    with open(conditionalProbabilityFile, "rb") as file:
        conditionalProbabilityDict = pickle.load(file)

    with open(bigramsListPath, "rb") as file:
        bigramsList = pickle.load(file)

    # Index the bigrams by first word once, so a query does not have to
    # rescan the whole list.  List order is preserved per word.
    bigramsByFirstWord = {}
    for bigram in bigramsList:
        bigramsByFirstWord.setdefault(bigram[1], []).append(
            bigram[1] + " " + bigram[2]
        )

    while True:
        try:
            checkForThisBigram = input("Enter a word to predict its next probable words ('stop' for stopping) : ")
        except EOFError:
            # No more input (e.g. piped stdin ran out): behave like 'stop'.
            break

        if checkForThisBigram == "stop":
            break

        matchedBigrams = bigramsByFirstWord.get(checkForThisBigram, [])
        topDict = {
            singleBigram: conditionalProbabilityDict[singleBigram]
            for singleBigram in matchedBigrams
        }

        topBigrams = heapq.nlargest(5, topDict, key=topDict.get)
        for b in topBigrams:
            print(b + " : " + str(topDict[b]) + "\n")

        print("\n" + "____________________" + "\n")

In [6]:
# Corpus files: one clickbait file and five numbered non-clickbait files.
clickbait_dataFiles = ["clickbait_data1"]
non_clickbait_dataFiles = [
    "non_clickbait_data" + str(part) for part in range(1, 6)
]

# Count files that language_m writes for the bigram and trigram runs.
fileName = ["model" + str(order) + ".txt" for order in (2, 3)]

# 1. Train/evaluate on the clickbait corpus, which also writes the files
#    named in fileName.
language_m(clickbait_dataFiles)
# 2. Convert the counts in model3.txt into conditional probabilities.
getProb(fileName[1])
# 3. Start the interactive next-word lookup.
getTopBigram()



Ngram: 1

Train: clickbait_data1
Perplexity: 584.5619719069464 	(on Test:clickbait_data1)


Ngram: 2

Train: clickbait_data1
Perplexity: 702.3791240615541 	(on Test:clickbait_data1)


Ngram: 3

Train: clickbait_data1
Perplexity: 2156.956516117808 	(on Test:clickbait_data1)
Enter a word to predict its next probable words ('stop' for stopping) : how
how well : 0.23160762942779292

how to : 0.15803814713896458

how much : 0.0885558583106267

how many : 0.0667574931880109

how you : 0.04632152588555858


____________________

Enter a word to predict its next probable words ('stop' for stopping) : to
to be : 0.05602782071097372

to make : 0.05564142194744977

to the : 0.044435857805255025

to know : 0.03207109737248841

to get : 0.030525502318392583


____________________

Enter a word to predict its next probable words ('stop' for stopping) : flirt
flirt with : 0.5

flirt on : 0.5


____________________

Enter a word to predict its next probable words ('stop' for stopping) : stop
