In [None]:
import numpy as np
import re

# Task 1: build the dictionary of the (up to) 20,000 most frequent corpus words.

corpus = sc.textFile("s3://chrisjermainebucket/comp330_A5/TrainingDataOneLinePerDoc.txt")
# Keep only lines that carry every marker the slicing below relies on; a bare
# 'id' substring test lets through lines on which str.index() would raise.
validLines = corpus.filter(lambda x: 'id="' in x and '" url=' in x and '">' in x)

# (docID, text): the id sits between 'id="' and '" url=', the text follows the first '">'.
keyAndText = validLines.map(lambda x: (x[x.index('id="') + 4: x.index('" url=')], x[x.index('">') + 2:]))

# Replace every non-letter with a space, lowercase, and split into words.
regex = re.compile('[^a-zA-Z]')
keyAndListOfWords = keyAndText.map(lambda x: (str(x[0]), regex.sub(' ', x[1]).lower().split()))

# Corpus-wide occurrence count of every word.
allWords = keyAndListOfWords.flatMap(lambda x: ((j, 1) for j in x[1]))
allCounts = allWords.reduceByKey(lambda a, b: a + b)

# top() returns at most 20000 (word, count) pairs, highest count first.
topWords = allCounts.top(20000, lambda x: x[1])
# Size the index range from what top() actually returned, so a corpus with
# fewer than 20000 distinct words does not raise IndexError in the map below.
twentyK = sc.parallelize(range(len(topWords)))

# (word, rank) pairs; rank 0 is the most frequent word.
dictionary = twentyK.map(lambda x: (topWords[x][0], x))

mapDict = dictionary.collectAsMap()

print('applicant:', mapDict['applicant'])
print('and:', mapDict['and'])
print('attack:', mapDict['attack'])
print('protein:', mapDict['protein'])
print('car:', mapDict['car'])

In [None]:
# Task 2
# Number of training documents (one (docID, words) pair per document).
numDocs = keyAndListOfWords.count()

def docWordCounts(doc):
    """Count how often each dictionary word occurs in one document.

    Args:
        doc: ``(docID, wordList)`` pair.

    Returns:
        ``(docID, counts)`` where ``counts`` is a length-20000 float array and
        ``counts[mapDict[word]]`` is the number of occurrences of ``word``.
        Words missing from ``mapDict`` are ignored.
    """
    docKey, tokens = doc
    counts = np.zeros(20000)
    for token in tokens:
        slot = mapDict.get(token)
        if slot is not None:
            counts[slot] += 1
    return (docKey, counts)

# Turn every (docID, word list) pair into (docID, per-dictionary-word count vector).
docWords = keyAndListOfWords.map(docWordCounts)

def binaryFreq(doc):
    """Convert a count vector into a 0/1 presence vector.

    Args:
        doc: ``(docID, wordCounts)`` pair.

    Returns:
        ``(docID, binary)`` where ``binary[i]`` is 1 if ``wordCounts[i] > 0``
        and 0 otherwise (integer array).
    """
    docID, wordCounts = doc
    # Vectorized comparison instead of a Python-level loop over every entry.
    binary = (np.asarray(wordCounts) > 0).astype(int)
    return (docID, binary)

# Document frequency: for each dictionary word, the number of documents containing it.
binaries = docWords.map(binaryFreq)
freq = binaries.map(lambda pair: pair[1]).reduce(lambda left, right: left + right)
# Inverse document frequency, one entry per dictionary word.
idf = np.log(numDocs / freq)

def tfIDF(doc):
    """Compute the TF-IDF vector of one document.

    Args:
        doc: ``(docID, wordCounts)`` pair.

    Returns:
        ``(docID, tf_idf)`` where ``tf_idf = (wordCounts / sum(wordCounts)) * idf``.
        A document containing no dictionary words yields an all-zero vector.
    """
    docID, wordCounts = doc
    total = np.sum(wordCounts)
    if total == 0:
        # 0 / 0 would produce NaN, which then propagates through every
        # reduction (mean, std) computed over these vectors.
        return (docID, np.zeros(np.shape(wordCounts)))
    tf = wordCounts / total
    tf_idf = np.array(tf * idf)
    return (docID, tf_idf)

# (docID, TF-IDF vector) per document; cached because several later reductions reuse it.
tfVectors = docWords.map(tfIDF).cache()

In [None]:
# Regularize the TF IDF Vectors

# Mean vector: element-wise sum over all documents divided by the document count.
tfs = tfVectors.map(lambda pair: pair[1])
summed_tf = tfs.reduce(lambda left, right: left + right)
mean_tf = np.asarray(summed_tf) / numDocs

# Standard-deviation vector
# Built from the element-wise squared differences to the mean.
def squareDiff(vec):
    """Return a new array holding the element-wise square of ``vec``."""
    values = np.array(vec)
    return np.multiply(values, values)

# Centre every vector on the mean, square element-wise, and sum over documents.
diff_tf = tfs.map(lambda vec: vec - mean_tf)
squared_tf = diff_tf.map(squareDiff)
total_tf = squared_tf.reduce(lambda left, right: left + right)

# Population standard deviation per dictionary word.
std_tf = np.sqrt(total_tf / numDocs)

def regularize(doc):
    """Standardize one TF-IDF vector and attach its class label.

    Args:
        doc: ``(docID, tfVect)`` pair.

    Returns:
        ``(label, reg)`` where ``reg = (tfVect - mean_tf) / std_tf`` with entries
        forced to 0.0 wherever ``std_tf`` is 0, and ``label`` is 1 when ``docID``
        starts with ``'AU'`` and 0 otherwise.
    """
    docID, tfVect = doc
    centered = tfVect - mean_tf
    # The division still runs on zero-std columns; those entries are then replaced by 0.0.
    reg = np.where(std_tf == 0.0, 0.0, centered / std_tf)
    label = 1 if docID[:2] == 'AU' else 0
    return (label, reg)

# (label, standardized vector) per training document; cached for the iterative training below.
regularVect = tfVectors.map(regularize).cache()

In [None]:
# Task 3) Gradient Descent

# llh and gradientF below divide their results by numDocs; the author adopted
# that scaling to avoid NaN issues.
# r: coefficient vector, one weight per dictionary word, starting at zero.
r = np.zeros(20000)
# z: weight of the squared-coefficient penalty used in the loss and the gradient.
z = 0.0001

def llh(doc):
    y, x = doc
    theta = np.dot(x, r)
    yTheta = y * theta
    logTerm = np.log(1 + np.exp(theta))
    return np.sum(yTheta - logTerm) / numDocs
 
def gradientF(doc):
    y, x = doc
    gradient = np.zeros(20000)
    theta = np.dot(x, r)
    last = 2 * z * r
    sig = np.exp(theta) / (1 + np.exp(theta))
    total = -1 * x * y + np.dot(x.T, sig)
    gradient += total
    gradient += (2 * z * r)
    return gradient / numDocs

In [None]:
# Sample gradient: warm up r with gradient descent on a 10% sample.
# A fixed seed makes the sample reproducible across runs, and cache() keeps
# it from being re-derived from regularVect on every pass of the loop below.
sampleVect = regularVect.sample(False, 0.1, 330).cache()

rate = 0.1

# Penalized negative log-likelihood of the sample at the current r. The
# penalty is included here too, so the first comparison in the loop is
# like-for-like even if this cell is re-run with a non-zero r.
regSampleTerm = z * np.sum(np.square(r))
currSampleLoss = sampleVect.map(lambda x: -1 * llh(x)).reduce(lambda a,b: a + b) + regSampleTerm
# An infinite sentinel guarantees the first iteration runs whatever the initial loss is.
prevSampleLoss = np.inf

# Stop once the loss changes by at most 10e-4 (i.e. 1e-3) between iterations.
while abs(currSampleLoss - prevSampleLoss) > 10e-4:
    
    prevSampleLoss = currSampleLoss
    sample_gradient = sampleVect.map(lambda x: gradientF(x)).reduce(lambda a,b: a + b)
    r = r - rate * sample_gradient
    
    regSampleTerm = z * np.sum(np.square(r))
    currSampleLoss = sampleVect.map(lambda x: -1 * llh(x)).reduce(lambda a,b: a + b) + regSampleTerm
    
    # Halve the rate when the loss went up, otherwise grow it by 10%.
    if currSampleLoss > prevSampleLoss: 
        rate = rate * 0.5
    else:
        rate = rate * 1.1
    
    print(currSampleLoss)

In [None]:
# Gradient descent on the full training set. r and rate are not reset in this
# cell, so the run continues from whatever values they currently hold.
# The starting loss must include the penalty term: r may already be non-zero
# here, and every later loss value in the loop includes it.
regTerm = z * np.sum(np.square(r))
currLoss = regularVect.map(lambda x: -1 * llh(x)).reduce(lambda a,b: a + b) + regTerm
# An infinite sentinel guarantees the first iteration runs whatever the initial loss is.
prevLoss = np.inf

# Stop once the loss changes by at most 10e-4 (i.e. 1e-3) between iterations.
while abs(currLoss - prevLoss) > 10e-4:
    
    prevLoss = currLoss
    gradient = regularVect.map(lambda x: gradientF(x)).reduce(lambda a,b: a + b)
    r = r - rate * gradient
    
    regTerm = z * np.sum(np.square(r))
    currLoss = regularVect.map(lambda x: -1 * llh(x)).reduce(lambda a,b: a + b) + regTerm
    
    # Halve the rate when the loss went up, otherwise grow it by 10%.
    if currLoss > prevLoss: 
        rate = rate * 0.5
    else:
        rate = rate * 1.1
    
    print(currLoss)


In [None]:
# Show the current coefficient vector.
print(r)

In [None]:
# Dictionary positions of the 50 largest coefficients (ascending by coefficient).
indices = np.argsort(r)[-50:]
# Invert the word -> position map once instead of scanning `indices` for every word.
indexToWord = {value: key for key, value in mapDict.items()}
# List the words from largest coefficient to smallest; positions with no
# dictionary word are skipped.
words = [indexToWord[i] for i in indices[::-1] if i in indexToWord]
print(words)

In [None]:
# Load the test documents and tokenize them with the same steps as the training corpus.
testingData = sc.textFile("s3://chrisjermainebucket/comp330_A5/TestingDataOneLinePerDoc.txt")
testLines = testingData.filter(lambda line: 'id' in line)
testkeyAndText = testLines.map(
    lambda line: (line[line.index('id="') + 4: line.index('" url=')],
                  line[line.index('">') + 2:]))

regex = re.compile('[^a-zA-Z]')
testWords = testkeyAndText.map(
    lambda pair: (str(pair[0]), regex.sub(' ', pair[1]).lower().split()))

# Count vectors over the training dictionary.
testDoc = testWords.map(docWordCounts)
# Per-word document frequency on the test set (not read again in the cells shown here).
testBin = testDoc.map(binaryFreq)
testFreq = testBin.map(lambda pair: pair[1]).reduce(lambda left, right: left + right)

# TF-IDF computed by tfIDF, which uses the idf vector defined earlier.
testTFIDF = testDoc.map(tfIDF)

def testRegularize(doc):
    docID, tfVect = doc
    temp1 = tfVect - mean_tf
        
    reg = temp1 / (10 * std_tf)
    
    reg[std_tf == 0.0] = 0.0
    
    label = 0
    
    if docID[:2] == 'AU':
        label = 1
    
    return (docID, label, reg)

# (docID, label, standardized vector) per test document.
testReg = testTFIDF.map(testRegularize)


def predictLabel(doc, cutoff=0.6):
    """Score one test document and flag how the prediction compares to its label.

    Args:
        doc: ``(docID, y, x)`` with true label ``y`` (0 or 1) and feature vector ``x``.
        cutoff: a document is predicted positive when ``x . r > cutoff``.

    Returns:
        ``(docID, success, numActuallyTrue, numWeSayTrue, falsePos, score, y)``:
        ``success`` is 1 for a true positive, ``numActuallyTrue`` is 1 when the
        label is positive (recall denominator), ``numWeSayTrue`` is 1 when the
        prediction is positive (precision denominator), ``falsePos`` is 1 for a
        false positive, and ``score`` is ``x . r``.
    """
    docID, y, x = doc

    # Compute the score once instead of once per comparison.
    score = np.dot(x, r)

    predictedPositive = bool(score > cutoff)
    # The true label is tested directly; comparing it against the score cutoff
    # only works while the cutoff happens to lie between 0 and 1.
    actuallyPositive = (y == 1)

    numWeSayTrue = int(predictedPositive)
    numActuallyTrue = int(actuallyPositive)
    success = int(predictedPositive and actuallyPositive)
    falsePos = int(predictedPositive and not actuallyPositive)

    return (docID, success, numActuallyTrue, numWeSayTrue, falsePos, score, y)

# One result tuple per test document, as produced by predictLabel.
testResults = testReg.map(predictLabel)

In [None]:
# find F1

# Sum result-tuple fields 1, 2 and 3 over the test set: true positives,
# actual positives, and predicted positives.
numSuccesses, numActuallyTrues, numWeSayTrues = [
    testResults.map(lambda rec, col=col: rec[col]).reduce(lambda left, right: left + right)
    for col in (1, 2, 3)
]

print(numWeSayTrues, numActuallyTrues, numSuccesses)

In [None]:
# Precision, recall and F1. Each ratio falls back to 0.0 when its denominator
# is zero (no predicted positives, no actual positives, or no true positives)
# instead of raising ZeroDivisionError.
precision = numSuccesses / numWeSayTrues if numWeSayTrues else 0.0
recall = numSuccesses / numActuallyTrues if numActuallyTrues else 0.0

F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

print("F1:",F1)
print("Precision:", precision)
print("Recall:", recall)

In [None]:
# Total number of false positives (field 4 of each result tuple).
numFP = testResults.map(lambda rec: rec[4]).reduce(lambda left, right: left + right)
print(numFP)

In [None]:
def getFP(doc):
    """Unpack a seven-field result tuple and return ``None``.

    NOTE(review): the body ends in a bare ``return`` and nothing in the cells
    shown here calls this function - it looks like an unfinished stub; confirm
    before relying on it.
    """
    docID, success, numActuallyTrue, numWeSayTrue, falsePos, val, y = doc
    return 
    
# Document ids of up to three false positives.
fpDocs = testResults.filter(lambda rec: rec[4] > 0).map(lambda rec: rec[0]).take(3)

In [None]:
# Show the false-positive document ids collected above.
print(fpDocs)

In [None]:
# Print the word list(s) stored under this hard-coded test document id.
print(testWords.lookup('35797415'))

In [None]:
# Print the word list(s) stored under this hard-coded test document id.
print(testWords.lookup('3470592'))

In [None]:
# Print the word list(s) stored under this hard-coded test document id.
print(testWords.lookup('19505797'))