In [132]:
import gzip
import random
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import time
import scipy
import scipy.optimize
from sklearn import linear_model
import statistics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [3]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline() # Skip Header
    for l in f:
        yield l.strip(). split(',')

In [4]:
bookData = []
for user, book, rating in readCSV("train_Interactions.csv.gz"):
    bookData.append([user, book, int(rating)])

In [5]:
bookData[:10]

[['u79354815', 'b14275065', 4],
 ['u56917948', 'b82152306', 5],
 ['u97915914', 'b44882292', 5],
 ['u49688858', 'b79927466', 5],
 ['u08384938', 'b05683889', 2],
 ['u13530776', 'b86375465', 4],
 ['u46307273', 'b92838791', 5],
 ['u18524450', 'b35165110', 2],
 ['u69700998', 'b17128180', 5],
 ['u43359569', 'b34596567', 5]]

## Tasks (Read prediction)

### 1. Although we have built a validation set, it only consists of positive samples.
For this task we also need examples of user/item pairs that weren’t read. For each entry (user,book) in the validation set, sample a negative entry by randomly choosing a book that user hasn’t read. Evaluate the performance (accuracy) of the baseline model on the validation set you have built.

Ans: <br>
Accurarcy of validation set: 0.7484

In [6]:
# Separate traing and validation set
print(len(bookData))
numTrainSet = 190000
bookDataTrain = bookData[:numTrainSet]
bookDataYTrain =  [1] * len(bookDataTrain)
bookDataValidPos = bookData[numTrainSet:]
bookDataYValidPos = [1] * len(bookDataValidPos)
print(len(bookDataTrain))
print(len(bookDataYTrain))
print(len(bookDataValidPos))

200000
190000
190000
10000


In [7]:
bookUniqueIds = set()
for user, book, rating in bookData:
    bookUniqueIds.add(book)
bookUniqueIdsList = list(bookUniqueIds)
#bookUniqueIdsList
len(bookUniqueIdsList)

7170

In [44]:
userUniqueIds = set()
for user, book, rating in bookData:
    userUniqueIds.add(user)
userUniqueIdsList = list(userUniqueIds)
#bookUniqueIdsList
len(userUniqueIdsList)

11357

In [8]:
bookReadByUserIds = {}
for user, book, rating in bookData:
    if user in bookReadByUserIds:
        bookReadByUserIds[user].add(book)
    else:
        bookReadByUserIds[user] = set()
        bookReadByUserIds[user].add(book)
#bookReadByUserIds

In [9]:
def getNegativeEntries():
    bookDataValidNeg = []
    bookDataYValidNeg = [0] * len(bookDataValidPos)
    
    for user, book, rating in bookDataValidPos:
        #while True:
            #unreadBookId = random.choice(bookUniqueIdsList)
        # For consistent validation set
        for unreadBookId in bookUniqueIdsList:
            if unreadBookId not in bookReadByUserIds[user]:
                bookDataValidNeg.append([user, unreadBookId, "-1"])
                break
    return bookDataValidNeg, bookDataYValidNeg

In [48]:
def getNegativeEntries(numNegativeEntries):
    bookDataNeg = []
    bookDataYNeg = [0] * numNegativeEntries
    
    for i in range(numNegativeEntries):
        # Randomly select a user
        userId = random.choice(userUniqueIdsList)

        # Randomly select a haven't read book
        while True:
            unreadBookId = random.choice(bookUniqueIdsList)
            if unreadBookId not in bookReadByUserIds[userId]:
                bookDataNeg.append([userId, unreadBookId, "-1"])
                break

    return bookDataNeg, bookDataYNeg

In [10]:
bookDataValidNeg, bookDataYValidNeg = getNegativeEntries()
bookDataValid = bookDataValidPos + bookDataValidNeg
bookDataYValid = bookDataYValidPos + bookDataYValidNeg
print(len(bookDataValid))
print(len(bookDataYValid))

20000
20000


In [11]:
def writeOutValidationSet(nameTag):
    timestr = time.strftime("%Y%m%d_%H%M%S")
    fileName = "validation_set_" + timestr + "_validMSE_" + nameTag + ".txt"
    print("FileName: %s" % (fileName))
    outFile = open(fileName, 'w')
    
    # Write out current validation set
    for data, y in zip(bookDataValid, bookDataYValid):
        outFile.write(data[0] + '-' + data[1] + "," + str(y) + "\n")

    outFile.close()

In [12]:
def getBaselinePred(Xdata, threshold):
    ### Would-read baseline: just rank which books are popular and which are not, and return '1' if a book is among the top-ranked
    bookCount = defaultdict(int)
    totalRead = 0
    
    for user, book, _ in readCSV("train_Interactions.csv.gz"):
        bookCount[book] += 1
        totalRead += 1
        
    mostPopular = [(bookCount[x], x) for x in bookCount]
    mostPopular.sort()
    mostPopular.reverse()
    
    return1 = set()
    count = 0
    for bkc, bkId in mostPopular:
        count += bkc
        return1.add(bkId)
        if count > totalRead/threshold: break
            
    # Make prediction
    prediction = []
    for uId, bId, rating in Xdata:
        if bId in return1:
            prediction.append(1)
        else:
            prediction.append(0)
    
    return prediction

In [64]:
def getPercentile():
    bookCount = defaultdict(int)
    totalRead = 0
    bookPercentile = defaultdict(float)
    
    for user, book, _ in readCSV("train_Interactions.csv.gz"):
        bookCount[book] += 1
        totalRead += 1
        
    mostPopular = [(bookCount[x], x) for x in bookCount]
    mostPopular.sort()
    mostPopular.reverse()
    print(mostPopular[:10])
    
    count = 0
    for bkc, bkId in mostPopular:
        bookPercentile[bkId] = float(count)/float(totalRead)
        count += bkc
    
    return bookPercentile

In [65]:
bookPercentile = getPercentile()

[(402, 'b25543219'), (333, 'b21517939'), (331, 'b76915592'), (285, 'b55315814'), (280, 'b02830492'), (279, 'b75885962'), (274, 'b52453648'), (264, 'b25118404'), (262, 'b39244888'), (249, 'b87250311')]


In [79]:
def getPopularity():
    bookCount = defaultdict(int)
    bookPopularity = defaultdict(float)
    
    for user, book, _ in readCSV("train_Interactions.csv.gz"):
        bookCount[book] += 1
        
    mostPopular = [(bookCount[x], x) for x in bookCount]
    mostPopular.sort()
    mostPopular.reverse()
    
    maxReadCount = mostPopular[0][0]
    minReadCount = mostPopular[-1][0]
    
    for bkc, bkId in mostPopular:
        bookPopularity[bkId] = float(bkc-minReadCount)/float(maxReadCount-minReadCount)
    
    return bookPopularity

In [83]:
bookPopularity = getPopularity()

In [15]:
def getAcc(pred, golden):
    correctPredictions = [p==y for p, y in zip(pred, golden)]
    return sum(correctPredictions) / len(golden)

In [16]:
def getTPR(pred, golden):
    TP = sum([(p and l) for (p,l) in zip(pred, golden)])
    FN = sum([(not p and l) for (p,l) in zip(pred, golden)])
    return TP / (TP + FN)

In [17]:
def getTNR(pred, golden):
    FP = sum([(p and not l) for (p,l) in zip(pred, golden)])
    TN = sum([(not p and not l) for (p,l) in zip(pred, golden)])
    return TN / (TN + FP)

In [18]:
def getMetrics(pred, golden):
    TNR = getTNR(pred, golden)
    TPR = getTPR(pred, golden)
    acc = getAcc(pred, golden)
    return (acc, TPR, TNR)

In [19]:
predBookDataYValid = getBaselinePred(bookDataValid, 2.0)
print(len(predBookDataYValid))
print(len(bookDataYValid))

# Accurarcy
predBookDataValidMSE = getMetrics(predBookDataYValid, bookDataYValid)
print("acc=%f, TPR=%f, TNR=%f" % (predBookDataValidMSE[0], predBookDataValidMSE[1], predBookDataValidMSE[2]) )

20000
20000
acc=0.748400, TPR=0.496800, TNR=1.000000


In [20]:
writeOutValidationSet(str(predBookDataValidMSE[0]))

FileName: validation_set_20191116_121125_validMSE_0.7484.txt


### 2. The existing ‘read prediction’ baseline just returns True if the item in question is ‘popular,’ using a threshold of the 50th percentile of popularity (totalRead/2).
Assuming that the ‘non-read’ test examples are a random sample of user-book pairs, this threshold may not be the best one. See if you can find a better threshold and report its performance on your validatin set.

Ans: <br>
Threshold = 1.250000, i.e. 80th percentile of popularity <br>
Accurarcy on validation set: 0.899000 <br>

In [35]:
for thres in np.arange(1, 3, 0.05):
    #predBookDataYTrain = getBaselinePred(bookDataTrain, thres)
    # Accurarcy for training set
    #correctPredictions = [p==y for p, y in zip(predBookDataYTrain, bookDataYTrain)]
    #print("Training: t=%f, acc=%f" % (thres, sum(correctPredictions) / len(bookDataYTrain)) )
    
    predBookDataYValid = getBaselinePred(bookDataValid, thres)
    # Accurarcy for validation set
    acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
    print("Validataion: t=%f, percentile=%.2f acc=%f, TPR=%f, TNR=%f" % (thres, 1.0/thres, acc, TPR, TNR) )

Validataion: t=1.000000, percentile=1.00 acc=0.500000, TPR=1.000000, TNR=0.000000
Validataion: t=1.050000, percentile=0.95 acc=0.478050, TPR=0.954800, TNR=0.001300
Validataion: t=1.100000, percentile=0.91 acc=0.454450, TPR=0.907600, TNR=0.001300
Validataion: t=1.150000, percentile=0.87 acc=0.433650, TPR=0.866000, TNR=0.001300
Validataion: t=1.200000, percentile=0.83 acc=0.915750, TPR=0.831500, TNR=1.000000
Validataion: t=1.250000, percentile=0.80 acc=0.899000, TPR=0.798000, TNR=1.000000
Validataion: t=1.300000, percentile=0.77 acc=0.881850, TPR=0.763700, TNR=1.000000
Validataion: t=1.350000, percentile=0.74 acc=0.866700, TPR=0.733400, TNR=1.000000
Validataion: t=1.400000, percentile=0.71 acc=0.853350, TPR=0.706700, TNR=1.000000
Validataion: t=1.450000, percentile=0.69 acc=0.841500, TPR=0.683000, TNR=1.000000
Validataion: t=1.500000, percentile=0.67 acc=0.829650, TPR=0.659300, TNR=1.000000
Validataion: t=1.550000, percentile=0.65 acc=0.818400, TPR=0.636800, TNR=1.000000
Validataion: t=1

In [24]:
def writeOutBaselinePred(threshold):
    ### Would-read baseline: just rank which books are popular and which are not, and return '1' if a book is among the top-ranked
    bookCount = defaultdict(int)
    totalRead = 0
    
    for user, book, _ in readCSV("train_Interactions.csv.gz"):
        bookCount[book] += 1
        totalRead += 1
    
    mostPopular = [(bookCount[x], x) for x in bookCount]
    mostPopular.sort()
    mostPopular.reverse()
    
    return1 = set()
    count = 0
    for bkc, bkId in mostPopular:
        count += bkc
        return1.add(bkId)
        if count > totalRead/threshold: break
    
    fileName = "predictions_Read_" + str(threshold) + ".txt"
    predOutFile = open(fileName, 'w')
    for l in open("pairs_Read.txt", 'r'):
        if l.startswith("userID"):
            #header
            predOutFile.write(l)
            continue
        uId, bId = l.strip().split('-')
        if bId in return1:
            predOutFile.write(uId + '-' + bId + ",1\n")
        else:
            predOutFile.write(uId + '-' + bId + ",0\n")
    predOutFile.close()

In [34]:
writeOutBaselinePred(1.55)

### 3. A stronger baseline than the one provided might make use of the Jaccard similarity (or another similarity metric). Given a pair (u, b) in the validation set, consider all training items b′ that user u has read.
For each, compute the Jaccard similarity between b and b′, i.e., users (in the training set) who have read ′
b and users who have read b . Predict as ‘read’ if the maximum of these Jaccard similarities exceeds a threshold (you may choose the threshold that works best). Report the performance on your validation set (1 mark).

Ans: <br>
Choose the threshold with the best accurarcy on validation set: threshold = 0.010000<br>
Accurarcy on the validation set: 0.689850 <br>

In [174]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)

In [175]:
bookDataTrain[:10]

[['u79354815', 'b14275065', 4],
 ['u56917948', 'b82152306', 5],
 ['u97915914', 'b44882292', 5],
 ['u49688858', 'b79927466', 5],
 ['u08384938', 'b05683889', 2],
 ['u13530776', 'b86375465', 4],
 ['u46307273', 'b92838791', 5],
 ['u18524450', 'b35165110', 2],
 ['u69700998', 'b17128180', 5],
 ['u43359569', 'b34596567', 5]]

In [176]:
#for d in bookDataTrain:
for d in bookData:
    usersPerItem[d[1]].add(d[0])
    itemsPerUser[d[0]].add(d[1])

In [130]:
#bookUniqueIdsList
#userUniqueIdsList
usersPerItemR = defaultdict(list)
itemsPerUserR = defaultdict(list)

for bookId in bookUniqueIdsList:
    usersPerItemR[bookId] = [0]*len(userUniqueIdsList)

for userId in userUniqueIdsList:
    itemsPerUserR[userId] = [0]*len(bookUniqueIdsList)
    
for userId, bookId, r in bookDataTrain:
    usersPerItemR[bookId][userUniqueIdsList.index(userId)] = r
    itemsPerUserR[userId][bookUniqueIdsList.index(bookId)] = r

In [157]:
def CosineSim(v1, v2):
    # vectors
    a = np.array(v1)
    b = np.array(v2)

    # manually compute cosine similarity
    #dot = np.dot(a, b)
    #norma = np.linalg.norm(a)
    #normb = np.linalg.norm(b)
    #cos = dot / (norma * normb)
    
    aa = a.reshape(1,len(v1))
    ba = b.reshape(1,len(v2))
    cos_lib = cosine_similarity(aa, ba)
    
    return cos_lib

In [24]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [142]:
def pairSimilarity(u, b):
    similarities = []
    users = usersPerItem[b]
    candidateItems = itemsPerUser[u]
    for b2 in candidateItems:
        if b2 == b: continue
        sim = Jaccard(users, usersPerItem[b2])
        similarities.append((sim,b2))
    similarities.sort(reverse=True)
    return similarities

In [143]:
def pairCosineSimilarity(u, b):
    similarities = []
    candidateItems = itemsPerUser[u]
    for b2 in candidateItems:
        if b2 == b: continue
        sim = CosineSim(usersPerItemR[b], usersPerItemR[b2])
        similarities.append((sim,b2))
    similarities.sort(reverse=True)
    return similarities

In [144]:
def pairSimilarityByUser(u, b):
    similarities = []
    items = itemsPerUser[u]
    candidateUsers = usersPerItem[b]
    for u2 in candidateUsers:
        if u2 == u: continue
        sim = Jaccard(items, itemsPerUser[u2])
        similarities.append((sim,u2))
    similarities.sort(reverse=True)
    return similarities

In [145]:
def pairCosineSimilarityByUser(u, b):
    similarities = []
    candidateUsers = usersPerItem[b]
    for u2 in candidateUsers:
        if u2 == u: continue
        sim = CosineSim(itemsPerUserR[u], itemsPerUserR[u2])
        similarities.append((sim,u2))
    similarities.sort(reverse=True)
    return similarities

In [146]:
def getJaccardPred(Xdata, threshold):      
    # Make prediction
    prediction = []
    for uId, bId, rating in Xdata:
        #print("Query: userId: %s, bookId: %s" % (uId, bId))
        #print(itemsPerUser[uId])
        sim = pairSimilarity(uId, bId)
        #print(sim[0][0])
        
        if sim and sim[0][0] > threshold:
            prediction.append(1)
        else:
            prediction.append(0)
    return prediction

In [84]:
for thres in np.arange(0, 0.03, 0.001):
    predBookDataYValid = getJaccardPred(bookDataValid, thres)
    # Accurarcy for validation set
    acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
    print("Validataion: t=%f, acc=%f, TPR=%f, TNR=%f" % (thres, acc, TPR, TNR) )

Validataion: t=0.000000, acc=0.665700, TPR=0.920400, TNR=0.411000
Validataion: t=0.001000, acc=0.665700, TPR=0.920400, TNR=0.411000
Validataion: t=0.002000, acc=0.665700, TPR=0.920400, TNR=0.411000
Validataion: t=0.003000, acc=0.671850, TPR=0.919100, TNR=0.424600
Validataion: t=0.004000, acc=0.670100, TPR=0.915600, TNR=0.424600
Validataion: t=0.005000, acc=0.672850, TPR=0.908800, TNR=0.436900
Validataion: t=0.006000, acc=0.677800, TPR=0.898900, TNR=0.456700
Validataion: t=0.007000, acc=0.682100, TPR=0.885500, TNR=0.478700
Validataion: t=0.008000, acc=0.686800, TPR=0.869300, TNR=0.504300
Validataion: t=0.009000, acc=0.689350, TPR=0.848700, TNR=0.530000
Validataion: t=0.010000, acc=0.689850, TPR=0.819700, TNR=0.560000
Validataion: t=0.011000, acc=0.681400, TPR=0.788900, TNR=0.573900
Validataion: t=0.012000, acc=0.679250, TPR=0.756100, TNR=0.602400
Validataion: t=0.013000, acc=0.667650, TPR=0.713700, TNR=0.621600
Validataion: t=0.014000, acc=0.654550, TPR=0.674100, TNR=0.635000
Validataio

In [42]:
def writeOutJaccardPred(threshold):  
    predOutFile = open("predictions_Read.txt", 'w')
    bookDataTest = []
    
    # Read Testing set
    for l in open("pairs_Read.txt", 'r'):
        if l.startswith("userID"):
            #header
            predOutFile.write(l)
            continue
        uId, bId = l.strip().split('-')
        bookDataTest.append([uId, bId, -1])
    
    # Predict by Jaccard
    bookDataYTest = getJaccardPred(bookDataTest, threshold)
    
    # Write out prediction result
    for data, y in zip(bookDataTest, bookDataYTest):
        predOutFile.write(data[0] + '-' + data[1] + "," + str(y) + "\n")

    predOutFile.close()

In [43]:
writeOutJaccardPred(0.011)

In [27]:
def getClassificationData(Xdata):
    bookFeatures = []
    for uId, bId, rating in Xdata:
        sim = pairSimilarity(uId, bId)
        if sim:
            bookFeatures.append([sim[0][0], bookPercentile[bId]])
        else:
            bookFeatures.append([0, bookPercentile[bId]])
    return bookFeatures

In [66]:
def getClassificationData2(Xdata):
    bookFeatures = []
    for uId, bId, rating in Xdata:
        sim = pairSimilarity(uId, bId)
        sim2 = pairSimilarityByUser(uId, bId)
        
        feature = []
        if sim:
            feature.append(sim[0][0])
        else:
            feature.append(0)

        if sim2:
            feature.append(sim2[0][0])
        else:
            feature.append(0)
            
        feature.append(bookPercentile[bId])
        
        bookFeatures.append(feature)
    return bookFeatures

In [96]:
def getClassificationData3(Xdata):
    bookFeatures = []
    for uId, bId, rating in Xdata:
        sim = pairSimilarity(uId, bId)
        sim2 = pairSimilarityByUser(uId, bId)
        if sim:
            sim = statistics.mean([x[0] for x in sim])
        else:
            sim = 0

        if sim2:
            sim2 = statistics.mean([x[0] for x in sim2])
        else:
            sim2 = 0
        
        # Feature
        feature = [sim, sim2, bookPopularity[bId]]
        bookFeatures.append(feature)
    return bookFeatures

In [98]:
def getClassificationData3(Xdata, topK):
    bookFeatures = []
    for uId, bId, rating in Xdata:
        sim = pairSimilarity(uId, bId)
        sim2 = pairSimilarityByUser(uId, bId)
        if sim:
            sim = statistics.mean([x[0] for x in sim[:topK]])
        else:
            sim = 0

        if sim2:
            sim2 = statistics.mean([x[0] for x in sim2[:topK]])
        else:
            sim2 = 0
        
        # Feature
        feature = [sim, sim2, bookPopularity[bId]]
        bookFeatures.append(feature)
    return bookFeatures

In [159]:
def getClassificationData4(Xdata, topK):
    bookFeatures = []
    for uId, bId, rating in Xdata:
        sim = pairSimilarity(uId, bId)
        sim2 = pairSimilarityByUser(uId, bId)
        sim3 = pairCosineSimilarity(uId, bId)
        sim4 = pairCosineSimilarityByUser(uId, bId)
        if sim:
            sim = statistics.mean([x[0] for x in sim[:topK]])
        else:
            sim = 0

        if sim2:
            sim2 = statistics.mean([x[0] for x in sim2[:topK]])
        else:
            sim2 = 0
            
        if sim3:
            #sim3 = statistics.mean([x[0] for x in sim3[:topK]])
            sim3 = sim3[0][0]
        else:
            sim3 = 0

        if sim4:
            #sim4 = statistics.mean([x[0] for x in sim4[:topK]])
            sim4 = sim4[0][0]
        else:
            sim4 = 0
        
        # Feature
        feature = [sim, sim2, sim3, sim4, bookPopularity[bId]]
        bookFeatures.append(feature)
    return bookFeatures

In [28]:
def MSE(model, X, y):
    predictions = model.predict(X)
    differences = [(a-b)**2 for (a,b) in zip(predictions, y)]
    return sum(differences) / len(differences)

In [38]:
def MSE(predictions, y):
    differences = [(a-b)**2 for (a,b) in zip(predictions, y)]
    return sum(differences) / len(differences)

In [172]:
#Try Tuning top K + # of Negatives
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = [100777]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    for topK in range(8,13):
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cVal in [0.000001, 0.000005, 0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005,0.01,0.05,0.1,0.5,1,5,10,50,100]:
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear", class_weight="balanced")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK), getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100777
# total training data: 290777, # Neg: 100777

Running Top K: 8
0
0
0.0
0.10661075036075036
0.14856150793650794
1.0
Training: C=0.000001, acc=0.654618, TPR=0.999995, TNR=0.003463, MSE=0.345382
Validataion: C=0.000001, acc=0.499700, TPR=0.999400, TNR=0.000000, MSE=0.500300
Training: C=0.000005, acc=0.757391, TPR=0.999932, TNR=0.300118, MSE=0.242609
Validataion: C=0.000005, acc=0.453500, TPR=0.905300, TNR=0.001700, MSE=0.546500
Training: C=0.000010, acc=0.821489, TPR=0.960937, TNR=0.558580, MSE=0.178511
Validataion: C=0.000010, acc=0.376500, TPR=0.751000, TNR=0.002000, MSE=0.623500
Training: C=0.000050, acc=0.693542, TPR=0.633826, TNR=0.806126, MSE=0.306458
Validataion: C=0.000050, acc=0.757600, TPR=0.515300, TNR=0.999900, MSE=0.242400
Training: C=0.000100, acc=0.667904, TPR=0.578358, TNR=0.836729, MSE=0.332096
Validataion: C=0.000100, acc=0.733650, TPR=0.467300, TNR=1.000000, MSE=0.266350
Training: C=0.000500, acc=0.659499, TPR=0.549700, TNR=0.866507, MSE=0.340501


[(0.7576, 0.5153, 0.9999, 0.2424, 100777, 5e-05, 8),
 (0.757, 0.5141, 0.9999, 0.243, 100777, 5e-05, 9),
 (0.75635, 0.5128, 0.9999, 0.24365, 100777, 5e-05, 10),
 (0.7557, 0.5114, 1.0, 0.2443, 100777, 5e-05, 11),
 (0.75505, 0.5101, 1.0, 0.24495, 100777, 5e-05, 12),
 (0.73365, 0.4673, 1.0, 0.26635, 100777, 0.0001, 8),
 (0.73325, 0.4665, 1.0, 0.26675, 100777, 0.0001, 9),
 (0.7328, 0.4656, 1.0, 0.2672, 100777, 0.0001, 10),
 (0.73215, 0.4643, 1.0, 0.26785, 100777, 0.0001, 11),
 (0.7318, 0.4636, 1.0, 0.2682, 100777, 0.0001, 12)]

In [167]:
topK_numNegative_result[:100]

[(0.7385, 0.5473, 0.9297, 0.2615, 135000, 0.005, 12),
 (0.73325, 0.5467, 0.9198, 0.26675, 135000, 0.005, 11),
 (0.72895, 0.5462, 0.9117, 0.27105, 135000, 0.005, 10),
 (0.72665, 0.5165, 0.9368, 0.27335, 120000, 0.01, 12),
 (0.7252, 0.4939, 0.9565, 0.2748, 125000, 0.01, 12),
 (0.72475, 0.5717, 0.8778, 0.27525, 130000, 0.005, 12),
 (0.7237, 0.5479, 0.8995, 0.2763, 135000, 0.005, 9),
 (0.7231, 0.5399, 0.9063, 0.2769, 115000, 0.01, 12),
 (0.7231, 0.5176, 0.9286, 0.2769, 120000, 0.01, 11),
 (0.72225, 0.4956, 0.9489, 0.27775, 125000, 0.01, 11),
 (0.72115, 0.4735, 0.9688, 0.27885, 130000, 0.01, 12),
 (0.72085, 0.5705, 0.8712, 0.27915, 130000, 0.005, 11),
 (0.71955, 0.5397, 0.8994, 0.28045, 115000, 0.01, 11),
 (0.71945, 0.4755, 0.9634, 0.28055, 130000, 0.01, 11),
 (0.71935, 0.5505, 0.8882, 0.28065, 135000, 0.005, 8),
 (0.71885, 0.4968, 0.9409, 0.28115, 125000, 0.01, 10),
 (0.71835, 0.571, 0.8657, 0.28165, 130000, 0.005, 10),
 (0.71825, 0.519, 0.9175, 0.28175, 120000, 0.01, 10),
 (0.7173, 0.4786

In [165]:
topK_numNegative_result = predBookResult

In [164]:
#Try Tuning top K + # of Negatives
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = range(70000, 140000, 5000)
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    for topK in range(8,13):
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cVal in [0.005,0.01,0.05,0.1,0.5,1,5,10,50,100]:
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK), getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 70000
# total training data: 280000, # Neg: 70000

Running Top K: 8
0
0
0.0
0.10661075036075036
0.14856150793650794
1.0
Training: C=0.005000, acc=0.714286, TPR=1.000000, TNR=0.000000, MSE=0.285714
Validataion: C=0.005000, acc=0.500000, TPR=1.000000, TNR=0.000000, MSE=0.500000
Training: C=0.010000, acc=0.792504, TPR=0.995465, TNR=0.285100, MSE=0.207496
Validataion: C=0.010000, acc=0.455100, TPR=0.909400, TNR=0.000800, MSE=0.544900
Training: C=0.050000, acc=0.940564, TPR=0.975300, TNR=0.853725, MSE=0.059436
Validataion: C=0.050000, acc=0.687750, TPR=0.506100, TNR=0.869400, MSE=0.312250
Training: C=0.100000, acc=0.949911, TPR=0.971750, TNR=0.895312, MSE=0.050089
Validataion: C=0.100000, acc=0.673100, TPR=0.435100, TNR=0.911100, MSE=0.326900
Training: C=0.500000, acc=0.956829, TPR=0.967270, TNR=0.930725, MSE=0.043171
Validataion: C=0.500000, acc=0.643300, TPR=0.345500, TNR=0.941100, MSE=0.356700
Training: C=1.000000, acc=0.957614, TPR=0.966185, TNR=0.936187, MSE=0.042386
Va

[(0.7385, 0.5473, 0.9297, 0.2615, 135000, 0.005, 12),
 (0.73325, 0.5467, 0.9198, 0.26675, 135000, 0.005, 11),
 (0.72895, 0.5462, 0.9117, 0.27105, 135000, 0.005, 10),
 (0.72665, 0.5165, 0.9368, 0.27335, 120000, 0.01, 12),
 (0.7252, 0.4939, 0.9565, 0.2748, 125000, 0.01, 12),
 (0.72475, 0.5717, 0.8778, 0.27525, 130000, 0.005, 12),
 (0.7237, 0.5479, 0.8995, 0.2763, 135000, 0.005, 9),
 (0.7231, 0.5399, 0.9063, 0.2769, 115000, 0.01, 12),
 (0.7231, 0.5176, 0.9286, 0.2769, 120000, 0.01, 11),
 (0.72225, 0.4956, 0.9489, 0.27775, 125000, 0.01, 11)]

In [160]:
# Similarity => Use Train bookData
#Includes Another Similarity feature [simItem, simUser, cosineSimItem, cosineSimUser, new percentile]
numNegativeExamples = [100168]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    #bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    #bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    #bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    bookDataArt = bookDataValid
    bookDataYArt = bookDataYValid
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    for topK in [1]:
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData4(bookDataArt, topK)
        bookFeaturesValid = getClassificationData4(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cExp in range(-3,3):
            cVal = pow(10,cExp)
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK)+"_simOnTrain_CosineSim", getClassificationData4, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100168
# total training data: 20000, # Neg: 100168

Running Top K: 1
0.0
0.0
[[0.]]
0.09523809523809523
0.13636363636363635
[[0.22980099]]
Training: C=0.001000, acc=0.719150, TPR=0.569000, TNR=0.869300, MSE=0.280850
Validataion: C=0.001000, acc=0.719150, TPR=0.569000, TNR=0.869300, MSE=0.280850
Training: C=0.010000, acc=0.734000, TPR=0.501600, TNR=0.966400, MSE=0.266000
Validataion: C=0.010000, acc=0.734000, TPR=0.501600, TNR=0.966400, MSE=0.266000
Training: C=0.100000, acc=0.720400, TPR=0.532900, TNR=0.907900, MSE=0.279600
Validataion: C=0.100000, acc=0.720400, TPR=0.532900, TNR=0.907900, MSE=0.279600
Training: C=1.000000, acc=0.714850, TPR=0.558100, TNR=0.871600, MSE=0.285150
Validataion: C=1.000000, acc=0.714850, TPR=0.558100, TNR=0.871600, MSE=0.285150
Training: C=10.000000, acc=0.722500, TPR=0.572800, TNR=0.872200, MSE=0.277500
Validataion: C=10.000000, acc=0.722500, TPR=0.572800, TNR=0.872200, MSE=0.277500
Training: C=100.000000, acc=0.722550, TPR=0.585000, TNR=0.

[(0.734, 0.5016, 0.9664, 0.266, 100168, 0.01, 1),
 (0.72255, 0.585, 0.8601, 0.27745, 100168, 100, 1),
 (0.7225, 0.5728, 0.8722, 0.2775, 100168, 10, 1),
 (0.7204, 0.5329, 0.9079, 0.2796, 100168, 0.1, 1),
 (0.71915, 0.569, 0.8693, 0.28085, 100168, 0.001, 1),
 (0.71485, 0.5581, 0.8716, 0.28515, 100168, 1, 1)]

In [152]:
# Similarity => Use whole bookData to Train
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = [100168]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    for topK in [1]:
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cExp in range(-3,3):
            cVal = pow(10,cExp)
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK)+"_simOnAll", getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100168
# total training data: 310168, # Neg: 100168

Running Top K: 1
0
0
0.0
0.19047619047619047
0.3076923076923077
1.0
Training: C=0.001000, acc=0.644812, TPR=1.000000, TNR=0.000000, MSE=0.355188
Validataion: C=0.001000, acc=0.500000, TPR=1.000000, TNR=0.000000, MSE=0.500000
Training: C=0.010000, acc=0.874371, TPR=0.999525, TNR=0.647166, MSE=0.125629
Validataion: C=0.010000, acc=0.799500, TPR=0.999100, TNR=0.599900, MSE=0.200500
Training: C=0.100000, acc=0.941880, TPR=0.990580, TNR=0.853469, MSE=0.058120
Validataion: C=0.100000, acc=0.934500, TPR=0.991500, TNR=0.877500, MSE=0.065500
Training: C=1.000000, acc=0.951500, TPR=0.980635, TNR=0.898609, MSE=0.048500
Validataion: C=1.000000, acc=0.947050, TPR=0.982200, TNR=0.911900, MSE=0.052950
Training: C=10.000000, acc=0.951568, TPR=0.974940, TNR=0.909139, MSE=0.048432
Validataion: C=10.000000, acc=0.947650, TPR=0.975400, TNR=0.919900, MSE=0.052350
Training: C=100.000000, acc=0.951433, TPR=0.973680, TNR=0.911045, MSE=0.0485

[(0.94765, 0.9754, 0.9199, 0.05235, 100168, 10, 1),
 (0.9476, 0.9738, 0.9214, 0.0524, 100168, 100, 1),
 (0.94705, 0.9822, 0.9119, 0.05295, 100168, 1, 1),
 (0.9345, 0.9915, 0.8775, 0.0655, 100168, 0.1, 1),
 (0.7995, 0.9991, 0.5999, 0.2005, 100168, 0.01, 1),
 (0.5, 1.0, 0.0, 0.5, 100168, 0.001, 1)]

In [179]:
# Similarity => Use whole bookData to Train
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = [100168]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    #for topK in range(1,11):
    for topK in [5,8,9,10,11]:
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cVal in [0.005,0.05]:
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK)+"_simOnAll_123", getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100168
# total training data: 310168, # Neg: 100168

Running Top K: 5


KeyboardInterrupt: 

In [120]:
# Similarity => Use whole bookData to Train
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = [100168]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    #for topK in range(1,11):
    for topK in [5,10]:
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cExp in range(-3,3):
            cVal = pow(10,cExp)
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK)+"_simOnAll", getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100168
# total training data: 310168, # Neg: 100168

Running Top K: 5
0
0
0.0
0.12277777777777778
0.1638095238095238
1.0
Training: C=0.001000, acc=0.644812, TPR=1.000000, TNR=0.000000, MSE=0.355188
Validataion: C=0.001000, acc=0.500000, TPR=1.000000, TNR=0.000000, MSE=0.500000
Training: C=0.010000, acc=0.905100, TPR=0.999990, TNR=0.732835, MSE=0.094900
Validataion: C=0.010000, acc=0.840950, TPR=0.999900, TNR=0.682000, MSE=0.159050
Training: C=0.100000, acc=0.968075, TPR=0.999875, TNR=0.910346, MSE=0.031925
Validataion: C=0.100000, acc=0.963200, TPR=0.999900, TNR=0.926500, MSE=0.036800
Training: C=1.000000, acc=0.984634, TPR=0.998740, TNR=0.959026, MSE=0.015366
Validataion: C=1.000000, acc=0.977500, TPR=0.998900, TNR=0.956100, MSE=0.022500
Training: C=10.000000, acc=0.989151, TPR=0.996785, TNR=0.975292, MSE=0.010849
Validataion: C=10.000000, acc=0.983900, TPR=0.997700, TNR=0.970100, MSE=0.016100
Training: C=100.000000, acc=0.990121, TPR=0.995395, TNR=0.980548, MSE=0.0098

[(0.9938, 0.9982, 0.9894, 0.0062, 100168, 100, 10),
 (0.99285, 0.9988, 0.9869, 0.00715, 100168, 10, 10),
 (0.99185, 0.9994, 0.9843, 0.00815, 100168, 1, 10),
 (0.988, 0.9999, 0.9761, 0.012, 100168, 0.1, 10),
 (0.98495, 0.9957, 0.9742, 0.01505, 100168, 100, 5),
 (0.9839, 0.9977, 0.9701, 0.0161, 100168, 10, 5),
 (0.9775, 0.9989, 0.9561, 0.0225, 100168, 1, 5),
 (0.9632, 0.9999, 0.9265, 0.0368, 100168, 0.1, 5),
 (0.8715, 0.9999, 0.7431, 0.1285, 100168, 0.01, 10),
 (0.84095, 0.9999, 0.682, 0.15905, 100168, 0.01, 5)]

In [113]:
# Logistic Regression + Feature Scaling
numNegativeExamples = [100777]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )    
    
    #for topK in range(4,13):
    for topK in [5,10]:
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print("x[0]: min: %f, max %f" % (min([x[0] for x in bookFeaturesArt]), max([x[0] for x in bookFeaturesArt])))
        print("x[1]: min: %f, max %f" % (min([x[1] for x in bookFeaturesArt]), max([x[1] for x in bookFeaturesArt])))
        print("x[2]: min: %f, max %f" % (min([x[2] for x in bookFeaturesArt]), max([x[2] for x in bookFeaturesArt])))
        
        # Feature Scaling
        scaler = StandardScaler()
        scaler.fit(bookFeaturesArt)
        bookFeaturesArt = scaler.transform(bookFeaturesArt)
        bookFeaturesValid = scaler.transform(bookFeaturesValid)
        print("x[0]: min: %f, max %f" % (min([x[0] for x in bookFeaturesArt]), max([x[0] for x in bookFeaturesArt])))
        print("x[1]: min: %f, max %f" % (min([x[1] for x in bookFeaturesArt]), max([x[1] for x in bookFeaturesArt])))
        print("x[2]: min: %f, max %f" % (min([x[2] for x in bookFeaturesArt]), max([x[2] for x in bookFeaturesArt])))
        

        for cExp in range(-3,5):
            cVal = pow(10,cExp)
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK)+"_FtScaling", getClassificationData3, topK, scaler)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100777
# total training data: 290777, # Neg: 100777

Running Top K: 5
x[0]: min: 0.000000, max 0.117547
x[1]: min: 0.000000, max 0.163810
x[2]: min: 0.000000, max 1.000000
x[0]: min: -1.300082, max 6.337370
x[1]: min: -1.445028, max 4.696226
x[2]: min: -0.922947, max 7.081173
Training: C=0.001000, acc=0.990477, TPR=0.998789, TNR=0.974806, MSE=0.009523
Validataion: C=0.001000, acc=0.565450, TPR=0.165400, TNR=0.965500, MSE=0.434550
Training: C=0.010000, acc=0.991987, TPR=0.997079, TNR=0.982387, MSE=0.008013
Validataion: C=0.010000, acc=0.547350, TPR=0.123100, TNR=0.971600, MSE=0.452650
Training: C=0.100000, acc=0.992176, TPR=0.996053, TNR=0.984868, MSE=0.007824
Validataion: C=0.100000, acc=0.541500, TPR=0.110100, TNR=0.972900, MSE=0.458500
Training: C=1.000000, acc=0.992180, TPR=0.995795, TNR=0.985364, MSE=0.007820
Validataion: C=1.000000, acc=0.540350, TPR=0.107800, TNR=0.972900, MSE=0.459650
Training: C=10.000000, acc=0.992152, TPR=0.995742, TNR=0.985384, MSE=0.007848
V

[(0.56545, 0.1654, 0.9655, 0.43455, 100777, 0.001, 5),
 (0.54735, 0.1231, 0.9716, 0.45265, 100777, 0.01, 5),
 (0.5461, 0.1037, 0.9885, 0.4539, 100777, 0.001, 10),
 (0.5415, 0.1101, 0.9729, 0.4585, 100777, 0.1, 5),
 (0.54035, 0.1078, 0.9729, 0.45965, 100777, 1, 5),
 (0.54035, 0.1077, 0.973, 0.45965, 100777, 10000, 5),
 (0.54035, 0.1077, 0.973, 0.45965, 100777, 1000, 5),
 (0.54035, 0.1077, 0.973, 0.45965, 100777, 100, 5),
 (0.5403, 0.1077, 0.9729, 0.4597, 100777, 10, 5),
 (0.52935, 0.069, 0.9897, 0.47065, 100777, 0.01, 10)]

In [110]:
# KNN
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = [100168]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )    
    
    #for topK in range(4,13):
    for topK in [5]:
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print("x[0]: min: %f, max %f" % (min([x[0] for x in bookFeaturesArt]), max([x[0] for x in bookFeaturesArt])))
        print("x[1]: min: %f, max %f" % (min([x[1] for x in bookFeaturesArt]), max([x[1] for x in bookFeaturesArt])))
        print("x[2]: min: %f, max %f" % (min([x[2] for x in bookFeaturesArt]), max([x[2] for x in bookFeaturesArt])))
        
        # Feature Scaling
        scaler = StandardScaler()
        scaler.fit(bookFeaturesArt)
        bookFeaturesArt = scaler.transform(bookFeaturesArt)
        bookFeaturesValid = scaler.transform(bookFeaturesValid)
        print("x[0]: min: %f, max %f" % (min([x[0] for x in bookFeaturesArt]), max([x[0] for x in bookFeaturesArt])))
        print("x[1]: min: %f, max %f" % (min([x[1] for x in bookFeaturesArt]), max([x[1] for x in bookFeaturesArt])))
        print("x[2]: min: %f, max %f" % (min([x[2] for x in bookFeaturesArt]), max([x[2] for x in bookFeaturesArt])))
        
        for numNeighbors in range(1,10):
            model = KNeighborsClassifier(n_neighbors=numNeighbors)
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            #writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK), getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: numNeighbors=%d, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (numNeighbors, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: numNeighbors=%d, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (numNeighbors, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100168
# total training data: 290168, # Neg: 100168

Running Top K: 5
x[0]: min: 0.000000, max 0.117547
x[1]: min: 0.000000, max 0.163810
x[2]: min: 0.000000, max 1.000000
x[0]: min: -1.301944, max 6.334979
x[1]: min: -1.447231, max 4.694284
x[2]: min: -0.921960, max 7.069110
Training: numNeighbors=1, acc=1.000000, TPR=1.000000, TNR=1.000000, MSE=0.000000
Validataion: numNeighbors=1, acc=0.546650, TPR=0.112500, TNR=0.980800, MSE=0.453350
Training: numNeighbors=2, acc=0.994803, TPR=0.992063, TNR=1.000000, MSE=0.005197
Validataion: numNeighbors=2, acc=0.534800, TPR=0.084700, TNR=0.984900, MSE=0.465200
Training: numNeighbors=3, acc=0.994848, TPR=0.997726, TNR=0.989388, MSE=0.005152
Validataion: numNeighbors=3, acc=0.546600, TPR=0.111700, TNR=0.981500, MSE=0.453400
Training: numNeighbors=4, acc=0.994214, TPR=0.995758, TNR=0.991285, MSE=0.005786
Validataion: numNeighbors=4, acc=0.539600, TPR=0.095500, TNR=0.983700, MSE=0.460400
Training: numNeighbors=5, acc=0.994162, TPR=0.9

[(0.54665, 0.1125, 0.9808, 0.45335, 100168, 10000, 5),
 (0.5466, 0.1117, 0.9815, 0.4534, 100168, 10000, 5),
 (0.54645, 0.1119, 0.981, 0.45355, 100168, 10000, 5),
 (0.54635, 0.1123, 0.9804, 0.45365, 100168, 10000, 5),
 (0.54635, 0.1122, 0.9805, 0.45365, 100168, 10000, 5),
 (0.54255, 0.1037, 0.9814, 0.45745, 100168, 10000, 5),
 (0.54155, 0.1012, 0.9819, 0.45845, 100168, 10000, 5),
 (0.5396, 0.0955, 0.9837, 0.4604, 100168, 10000, 5),
 (0.5348, 0.0847, 0.9849, 0.4652, 100168, 10000, 5)]

In [104]:
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = [100168]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    for topK in range(1,11):
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cExp in range(-3,5):
            cVal = pow(10,cExp)
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK), getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100168
# total training data: 310168, # Neg: 100168

Running Top K: 1
0
0
0.0
0.19047619047619047
0.36363636363636365
1.0
Training: C=0.001000, acc=0.644812, TPR=1.000000, TNR=0.000000, MSE=0.355188
Validataion: C=0.001000, acc=0.500000, TPR=1.000000, TNR=0.000000, MSE=0.500000
Training: C=0.010000, acc=0.858538, TPR=0.986215, TNR=0.626752, MSE=0.141462
Validataion: C=0.010000, acc=0.642350, TPR=0.727100, TNR=0.557600, MSE=0.357650
Training: C=0.100000, acc=0.920408, TPR=0.969905, TNR=0.830550, MSE=0.079592
Validataion: C=0.100000, acc=0.668750, TPR=0.483500, TNR=0.854000, MSE=0.331250
Training: C=1.000000, acc=0.930925, TPR=0.961570, TNR=0.875290, MSE=0.069075
Validataion: C=1.000000, acc=0.637250, TPR=0.386400, TNR=0.888100, MSE=0.362750
Training: C=10.000000, acc=0.932214, TPR=0.959160, TNR=0.883296, MSE=0.067786
Validataion: C=10.000000, acc=0.630400, TPR=0.366700, TNR=0.894100, MSE=0.369600
Training: C=100.000000, acc=0.932472, TPR=0.958875, TNR=0.884540, MSE=0.067

[(0.6761, 0.6248, 0.7274, 0.3239, 100168, 0.01, 9),
 (0.67595, 0.6251, 0.7268, 0.32405, 100168, 0.01, 8),
 (0.67575, 0.6265, 0.725, 0.32425, 100168, 0.01, 10),
 (0.6754, 0.6306, 0.7202, 0.3246, 100168, 0.01, 7),
 (0.6714, 0.6361, 0.7067, 0.3286, 100168, 0.01, 6),
 (0.66875, 0.4835, 0.854, 0.33125, 100168, 0.1, 1),
 (0.6672, 0.468, 0.8664, 0.3328, 100168, 0.1, 2),
 (0.66645, 0.647, 0.6859, 0.33355, 100168, 0.01, 5),
 (0.66635, 0.4397, 0.893, 0.33365, 100168, 0.1, 4),
 (0.66485, 0.4547, 0.875, 0.33515, 100168, 0.1, 3)]

In [105]:
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = [100168]
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    print("# total training data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    for topK in range(10,20):
        print("\nRunning Top K: %d" % (topK))
        
        # Data to Features [simItem, simUser, percentile]
        bookFeaturesArt = getClassificationData3(bookDataArt, topK)
        bookFeaturesValid = getClassificationData3(bookDataValid, topK)
        print(min([x[0] for x in bookFeaturesArt]))
        print(min([x[1] for x in bookFeaturesArt]))
        print(min([x[2] for x in bookFeaturesArt]))
        print(max([x[0] for x in bookFeaturesArt]))
        print(max([x[1] for x in bookFeaturesArt]))
        print(max([x[2] for x in bookFeaturesArt]))

        for cExp in range(-3,5):
            cVal = pow(10,cExp)
            model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
            model.fit(bookFeaturesArt, bookDataYArt)

            predBookDataYArt = model.predict(bookFeaturesArt)
            mseTrain = MSE(predBookDataYArt, bookDataYArt)
            predBookDataYValid = model.predict(bookFeaturesValid)
            mseValid = MSE(predBookDataYValid, bookDataYValid)

            writeOutClassificationPredWithFunction(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal)+"_topK_"+str(topK), getClassificationData3, topK)

            acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
            print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
            acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
            print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
            predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal, topK))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 100168
# total training data: 310168, # Neg: 100168

Running Top K: 10
0
0
0.0
0.10098090798090797
0.1404040404040404
1.0
Training: C=0.001000, acc=0.644812, TPR=1.000000, TNR=0.000000, MSE=0.355188
Validataion: C=0.001000, acc=0.500000, TPR=1.000000, TNR=0.000000, MSE=0.500000
Training: C=0.010000, acc=0.895605, TPR=0.981325, TNR=0.739988, MSE=0.104395
Validataion: C=0.010000, acc=0.675600, TPR=0.626600, TNR=0.724600, MSE=0.324400
Training: C=0.100000, acc=0.956814, TPR=0.966380, TNR=0.939447, MSE=0.043186
Validataion: C=0.100000, acc=0.647650, TPR=0.327700, TNR=0.967600, MSE=0.352350
Training: C=1.000000, acc=0.961708, TPR=0.962010, TNR=0.961159, MSE=0.038292
Validataion: C=1.000000, acc=0.604850, TPR=0.240400, TNR=0.969300, MSE=0.395150
Training: C=10.000000, acc=0.962024, TPR=0.961350, TNR=0.963247, MSE=0.037976
Validataion: C=10.000000, acc=0.595000, TPR=0.227100, TNR=0.962900, MSE=0.405000
Training: C=100.000000, acc=0.962033, TPR=0.961605, TNR=0.962811, MSE=0.037

[(0.6756, 0.6266, 0.7246, 0.3244, 100168, 0.01, 10),
 (0.6724, 0.6269, 0.7179, 0.3276, 100168, 0.01, 11),
 (0.6683, 0.6286, 0.708, 0.3317, 100168, 0.01, 12),
 (0.6628, 0.6303, 0.6953, 0.3372, 100168, 0.01, 13),
 (0.656, 0.6328, 0.6792, 0.344, 100168, 0.01, 14),
 (0.64875, 0.6364, 0.6611, 0.35125, 100168, 0.01, 15),
 (0.64765, 0.3277, 0.9676, 0.35235, 100168, 0.1, 10),
 (0.6444, 0.3141, 0.9747, 0.3556, 100168, 0.1, 11),
 (0.64095, 0.3024, 0.9795, 0.35905, 100168, 0.1, 12),
 (0.6391, 0.6392, 0.639, 0.3609, 100168, 0.01, 16)]

In [76]:
#Includes Another Similarity feature [simItem, simUser, new percentile]
numNegativeExamples = range(70000, 190000, 5000)
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataValid + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYValid + bookDataYNeg
    print("# total trainging data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    # Data to Features [simItem, simUser, percentile]
    bookFeaturesArt = getClassificationData2(bookDataArt)
    bookFeaturesValid = getClassificationData2(bookDataValid)
    
    for cExp in range(-3,5):
        cVal = pow(10,cExp)
        model = linear_model.LogisticRegression(C=cVal, solver="liblinear")
        model.fit(bookFeaturesArt, bookDataYArt)

        predBookDataYArt = model.predict(bookFeaturesArt)
        mseTrain = MSE(predBookDataYArt, bookDataYArt)
        predBookDataYValid = model.predict(bookFeaturesValid)
        mseValid = MSE(predBookDataYValid, bookDataYValid)

        writeOutClassificationPred2(model, "numNeg_"+str(numNeg)+"_C_"+str(cVal))

        acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
        print("Training: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseTrain) )
        acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
        print("Validataion: C=%f, acc=%f, TPR=%f, TNR=%f, MSE=%f" % (cVal, acc, TPR, TNR, mseValid) )
        predBookResult.append((acc, TPR, TNR, mseValid, numNeg, cVal))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 70000
# total trainging data: 280000, # Neg: 70000
Training: C=0.000100, acc=0.714286, TPR=1.000000, TNR=0.000000, MSE=0.285714
Validataion: C=0.000100, acc=0.500000, TPR=1.000000, TNR=0.000000, MSE=0.500000
Training: C=0.001000, acc=0.724993, TPR=0.999695, TNR=0.038238, MSE=0.275007
Validataion: C=0.001000, acc=0.496950, TPR=0.993900, TNR=0.000000, MSE=0.503050
Training: C=0.010000, acc=0.836514, TPR=0.991140, TNR=0.449950, MSE=0.163486
Validataion: C=0.010000, acc=0.512250, TPR=0.832700, TNR=0.191800, MSE=0.487750
Training: C=0.100000, acc=0.911889, TPR=0.980530, TNR=0.740287, MSE=0.088111
Validataion: C=0.100000, acc=0.671350, TPR=0.622300, TNR=0.720400, MSE=0.328650
Training: C=1.000000, acc=0.934425, TPR=0.971515, TNR=0.841700, MSE=0.065575
Validataion: C=1.000000, acc=0.656550, TPR=0.473900, TNR=0.839200, MSE=0.343450
Training: C=10.000000, acc=0.937775, TPR=0.969030, TNR=0.859638, MSE=0.062225
Validataion: C=10.000000, acc=0.648700, TPR=0.438700, TNR=0.858700, MS

KeyboardInterrupt: 

In [62]:
#bookDataValid = bookDataValidPos + bookDataValidNeg
#bookDataYValid = bookDataYValidPos + bookDataYValidNeg
numNegativeExamples = range(5000, 250000, 5000)
predBookResult = []

for numNeg in numNegativeExamples:
    print("\nRunning # Neg: %d" % (numNeg) )
    bookDataNeg, bookDataYNeg = getNegativeEntries(numNeg)
    bookDataArt = bookDataTrain + bookDataNeg
    bookDataYArt = bookDataYTrain + bookDataYNeg
    print("# total trainging data: %d, # Neg: %d" % (len(bookDataArt), len(bookDataNeg)) )
    
    # Data to Features
    bookFeaturesArt = getClassificationData(bookDataArt)
    
    model = linear_model.LogisticRegression(solver="liblinear")
    model.fit(bookFeaturesArt, bookDataYArt)

    predBookDataYArt = model.predict(bookFeaturesArt)
    mseTrain = MSE(predBookDataYArt, bookDataYArt)
    predBookDataYValid = model.predict(bookFeaturesValid)
    mseValid = MSE(predBookDataYValid, bookDataYValid)

    writeOutClassificationPred(model, "numNeg_"+str(numNeg))

    acc, TPR, TNR = getMetrics(predBookDataYArt, bookDataYArt)
    print("Training: acc=%f, TPR=%f, TNR=%f, MSE=%f" % (acc, TPR, TNR, mseTrain) )
    acc, TPR, TNR = getMetrics(predBookDataYValid, bookDataYValid)
    print("Validataion: acc=%f, TPR=%f, TNR=%f, MSE=%f" % (acc, TPR, TNR, mseValid) )
    predBookResult.append((acc, TPR, TNR, mseValid, numNeg))
    
predBookResult.sort(reverse = True)
predBookResult[:10]


Running # Neg: 5000
# total trainging data: 195000, # Neg: 5000
Training: acc=0.980626, TPR=0.999995, TNR=0.244600, MSE=0.019374
Validataion: acc=0.471350, TPR=0.942200, TNR=0.000500, MSE=0.528650

Running # Neg: 10000
# total trainging data: 200000, # Neg: 10000
Training: acc=0.971860, TPR=0.999879, TNR=0.439500, MSE=0.028140
Validataion: acc=0.531900, TPR=0.872800, TNR=0.191000, MSE=0.468100

Running # Neg: 15000
# total trainging data: 205000, # Neg: 15000
Training: acc=0.965210, TPR=0.999568, TNR=0.530000, MSE=0.034790
Validataion: acc=0.560550, TPR=0.828000, TNR=0.293100, MSE=0.439450

Running # Neg: 20000
# total trainging data: 210000, # Neg: 20000
Training: acc=0.958614, TPR=0.999184, TNR=0.573200, MSE=0.041386
Validataion: acc=0.588200, TPR=0.798100, TNR=0.378300, MSE=0.411800

Running # Neg: 25000
# total trainging data: 215000, # Neg: 25000
Training: acc=0.953637, TPR=0.998689, TNR=0.611240, MSE=0.046363
Validataion: acc=0.609250, TPR=0.770800, TNR=0.447700, MSE=0.390750

R

[(0.67725, 0.5011, 0.8534, 0.32275),
 (0.67575, 0.5687, 0.7828, 0.32425),
 (0.6742, 0.5291, 0.8193, 0.3258),
 (0.67385, 0.5483, 0.7994, 0.32615),
 (0.67345, 0.5784, 0.7685, 0.32655),
 (0.6729, 0.4924, 0.8534, 0.3271),
 (0.67195, 0.5183, 0.8256, 0.32805),
 (0.6714, 0.6003, 0.7425, 0.3286),
 (0.67, 0.5572, 0.7828, 0.33),
 (0.6699, 0.4732, 0.8666, 0.3301)]

In [55]:
def writeOutClassificationPred(model, nameTag):
    fileName = "predictions_Read_classification_" + nameTag + ".txt"
    predOutFile = open(fileName, 'w')
    bookDataTest = []
    
    # Read Testing set
    for l in open("pairs_Read.txt", 'r'):
        if l.startswith("userID"):
            #header
            predOutFile.write(l)
            continue
        uId, bId = l.strip().split('-')
        bookDataTest.append([uId, bId, -1])
    
    bookFeaturesTest = getClassificationData(bookDataTest)
    bookFeaturesYTest = model.predict(bookFeaturesTest)
    
    # Write out prediction result
    for data, y in zip(bookDataTest, bookFeaturesYTest):
        predOutFile.write(data[0] + '-' + data[1] + "," + str(y) + "\n")

    predOutFile.close()

In [70]:
def writeOutClassificationPred2(model, nameTag):
    fileName = "predictions_Read_classification_" + nameTag + ".txt"
    predOutFile = open(fileName, 'w')
    bookDataTest = []
    
    # Read Testing set
    for l in open("pairs_Read.txt", 'r'):
        if l.startswith("userID"):
            #header
            predOutFile.write(l)
            continue
        uId, bId = l.strip().split('-')
        bookDataTest.append([uId, bId, -1])
    
    bookFeaturesTest = getClassificationData2(bookDataTest)
    bookFeaturesYTest = model.predict(bookFeaturesTest)
    
    # Write out prediction result
    for data, y in zip(bookDataTest, bookFeaturesYTest):
        predOutFile.write(data[0] + '-' + data[1] + "," + str(y) + "\n")

    predOutFile.close()

In [85]:
def writeOutClassificationPredWithFunction(model, nameTag, featureFunction):
    fileName = "predictions_Read_classification_" + nameTag + ".txt"
    predOutFile = open(fileName, 'w')
    bookDataTest = []
    
    # Read Testing set
    for l in open("pairs_Read.txt", 'r'):
        if l.startswith("userID"):
            #header
            predOutFile.write(l)
            continue
        uId, bId = l.strip().split('-')
        bookDataTest.append([uId, bId, -1])
    
    bookFeaturesTest = featureFunction(bookDataTest)
    bookFeaturesYTest = model.predict(bookFeaturesTest)
    
    # Write out prediction result
    for data, y in zip(bookDataTest, bookFeaturesYTest):
        predOutFile.write(data[0] + '-' + data[1] + "," + str(y) + "\n")

    predOutFile.close()

In [119]:
def writeOutClassificationPredWithFunction(model, nameTag, featureFunction, topK):
    fileName = "predictions_Read_classification_" + nameTag + ".txt"
    predOutFile = open(fileName, 'w')
    bookDataTest = []
    
    # Read Testing set
    for l in open("pairs_Read.txt", 'r'):
        if l.startswith("userID"):
            #header
            predOutFile.write(l)
            continue
        uId, bId = l.strip().split('-')
        bookDataTest.append([uId, bId, -1])
    
    bookFeaturesTest = featureFunction(bookDataTest, topK)
    bookFeaturesYTest = model.predict(bookFeaturesTest)
    
    # Write out prediction result
    for data, y in zip(bookDataTest, bookFeaturesYTest):
        predOutFile.write(data[0] + '-' + data[1] + "," + str(y) + "\n")

    predOutFile.close()

In [111]:
def writeOutClassificationPredWithFunction(model, nameTag, featureFunction, topK, scaler):
    fileName = "predictions_Read_classification_" + nameTag + ".txt"
    predOutFile = open(fileName, 'w')
    bookDataTest = []
    
    # Read Testing set
    for l in open("pairs_Read.txt", 'r'):
        if l.startswith("userID"):
            #header
            predOutFile.write(l)
            continue
        uId, bId = l.strip().split('-')
        bookDataTest.append([uId, bId, -1])
    
    bookFeaturesTest = featureFunction(bookDataTest, topK)
    bookFeaturesTest = scaler.transform(bookFeaturesTest)
    bookFeaturesYTest = model.predict(bookFeaturesTest)
    
    # Write out prediction result
    for data, y in zip(bookDataTest, bookFeaturesYTest):
        predOutFile.write(data[0] + '-' + data[1] + "," + str(y) + "\n")

    predOutFile.close()