In [1]:
import gzip
from collections import defaultdict

def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

### Rating baseline: compute averages for each user, or return the global average if we've never seen the user before

allRatings = []
userRatings = defaultdict(list)
for l in readGz("train.json.gz"):
  user,business = l['userID'],l['businessID']
  allRatings.append(l['rating'])
  userRatings[user].append(l['rating'])

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
  userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

predictions = open("predictions_Rating.txt", 'w')
for l in open("pairs_Rating.txt"):
  if l.startswith("userID"):
    #header
    predictions.write(l)
    continue
  u,i = l.strip().split('-')
  if u in userAverage:
    predictions.write(u + '-' + i + ',' + str(userAverage[u]) + '\n')
  else:
    predictions.write(u + '-' + i + ',' + str(globalAverage) + '\n')

predictions.close()

### Would-visit baseline: just rank which businesses are popular and which are not, and return '1' if a business is among the top-ranked

businessCount = defaultdict(int)
totalPurchases = 0

for l in readGz("train.json.gz"):
  user,business = l['userID'],l['businessID']
  businessCount[business] += 1
  totalPurchases += 1

mostPopular = [(businessCount[x], x) for x in businessCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
  count += ic
  return1.add(i)
  if count > totalPurchases/2: break

predictions = open("predictions_Visit.txt", 'w')
for l in open("pairs_Visit.txt"):
  if l.startswith("userID"):
    #header
    predictions.write(l)
    continue
  u,i = l.strip().split('-')
  if i in return1:
    predictions.write(u + '-' + i + ",1\n")
  else:
    predictions.write(u + '-' + i + ",0\n")

predictions.close()

### Category prediction baseline: Just consider some of the most common words from each category

catDict = {
  "American Restaurant": 0,
  "Bar": 1,
  "Asian Restaurant": 2,
  "European Restaurant": 3,
  "Italian Restaurant": 4,
  "Fast Food Restaurant": 5,
  "Mexican Restaurant": 6,
  "Seafood Restaurant": 7,
  "Coffee Shop": 8,
  "Sandwich Shop": 9
}

predictions = open("predictions_Category.txt", 'w')
predictions.write("userID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
  cat = catDict['American Restaurant'] # If there's no evidence, just choose the most common category in the dataset
  words = l['reviewText'].lower()
  if 'america' in words:
    cat = catDict['American Restaurant']
  if 'bar' in words or 'beer' in words:
    cat = catDict['Bar']
  if 'asia' in words:
    cat = catDict['Asian Restaurant']
  if 'europe' in words:
    cat = catDict['European Restaurant']
  if 'italian' in words:
    cat = catDict['Italian Restaurant']
  if 'fast' in words:
    cat = catDict['Fast Food Restaurant']
  if 'mexic' in words:
    cat = catDict['Mexican Restaurant']
  if 'coffee' in words:
    cat = catDict['Coffee Shop']
  if 'sandwich' in words:
    cat = catDict['Sandwich Shop']
  predictions.write(l['userID'] + '-' + l['reviewHash'] + "," + str(cat) + "\n")

predictions.close()

In [2]:
alluser = list()
allbus = set()
added = dict()
count = 0
valid = open('valid1.txt','w')
valid2 = open('valid2.txt','w')
valid.write('userID-businessID,actual,prediction\n')
valid2.write('userID-businessID,actual,prediction\n')
for l in readGz("train.json.gz"):
    user,business = l['userID'],l['businessID']
    if user in alluser:
        added[user].append(business)
    else:
        added[user]= list()
        added[user].append(business)
        alluser.append(user)
    allbus.add(business)
    if count >= 100000 and count<200000:
        valid.write(user + '-' + business +',1'+"\n")
    count +=1
for i in range(100000):
    u = i%len(alluser)
    user = alluser[u]
    diffbus = allbus.difference(added[user])
    if len(diffbus)>0:
        newb = diffbus.pop()
        added[user].append(newb)
        valid2.write(user + '-' + newb +",0" +"\n")
valid.close()
valid2.close()

In [3]:
print 'Question 1:'
businessCount = defaultdict(int)
totalPurchases = 0
for l in readGz("train.json.gz"):
    user,business = l['userID'],l['businessID']
    businessCount[business] += 1
    totalPurchases += 1
    if totalPurchases >= 100000:
        break
mostPopular = [(businessCount[x], x) for x in businessCount]
mostPopular.sort()
mostPopular.reverse()
return1 = set()
count = 0
for ic, i in mostPopular:    
    count += ic
    return1.add(i)
    if count > totalPurchases/2: 
        break
samples = float(0)
correct = float(0)
for l in open("valid1.txt"):
    samples += 1
    if l.startswith("userID"):
        #header
        samples -= 1 
        continue
    u,i = l.strip().split('-')
    i = i.split(',')[0]
    if i in return1:
        correct += 1      
for l in open("valid2.txt"):
    samples += 1
    if l.startswith("userID"):
        #header
        samples -= 1 
        continue
    u,i = l.strip().split('-')
    i = i.split(',')[0]
    if not i in return1:
        correct += 1
print 'accuracy: ', correct/samples



Question 1:
accuracy:  0.503035


In [4]:
print 'Question 2:'
businessCount = defaultdict(int)
totalPurchases = 0
for l in readGz("train.json.gz"):
    user,business = l['userID'],l['businessID']
    businessCount[business] += 1
    totalPurchases += 1
    if totalPurchases >= 100000:
        break
mostPopular = [(businessCount[x], x) for x in businessCount]
mostPopular.sort()
mostPopular.reverse()
return1 = set()
count = 0
for ic, i in mostPopular:    
    count += ic
    return1.add(i)
    if count > totalPurchases*(0.23): 
        break
samples = float(0)
correct = float(0)
for l in open("valid1.txt"):
    samples += 1
    if l.startswith("userID"):
        #header
        samples -= 1 
        continue
    u,i = l.strip().split('-')
    i = i.split(',')[0]
    if i in return1:
        correct += 1      
for l in open("valid2.txt"):
    samples += 1
    if l.startswith("userID"):
        #header
        samples -= 1 
        continue
    u,i = l.strip().split('-')
    i = i.split(',')[0]
    if not i in return1:
        correct += 1
print 'accuracy: ', correct/samples
print 'using threhold of the 23rd percentile of populatity has better accuracy'

Question 2:
accuracy:  0.605045
using threhold of the 23rd percentile of populatity has better accuracy


In [5]:
print 'Question 3:'
print 'revisit baseline '
def revisit_baseline(userID,busID,allusercats,allbuscats):
    if not userID in allusercats.keys():
        return 0
    if not busID in allbuscats.keys():
        return 0
    usercats = allusercats[userID]
    buscats = allbuscats[busID]
    for bc in buscats:
        if bc in usercats:
            return 1
    return 0
    

Question 3:
revisit baseline 


In [6]:
print 'Question 4:'
allusercats = dict()
allbuscats = dict()
for l in readGz("train.json.gz"):
    user,bus = l['userID'],l['businessID']
    if not user in allusercats.keys():
        allusercats[user] = set()
    if not bus in allbuscats.keys():
        allbuscats[bus] = set()
    for c in l['categories']:
        allusercats[user].add(c)
        allbuscats[bus].add(c)
predictions = open("predictions_revisit.txt", 'w')
for l in open("pairs_Visit.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if revisit_baseline(u,i,allusercats,allbuscats):
        predictions.write(u + '-' + i + ",1\n")
    else:
        predictions.write(u + '-' + i + ",0\n")
predictions.close()
print 'user name: ah'
print 'score: 0.66490'
print 'email: kyhor@ucsd.edu'

Question 4:
user name: ah
score: 0.66490
email: kyhor@ucsd.edu


In [7]:
print 'prepare data for Question 5:'
def mostpop(cidcounts):
    mostpop = list()
    for i in range(len(cidcount)):
        mostpop.append((cidcount[i],i))
    mostpop = sorted(mostpop)
    return mostpop

with_cid = list()
for l in readGz('train.json.gz'):
    if 'categoryID' in l.keys():
        with_cid.append(l)
train = with_cid[:len(with_cid)/2]
valid = with_cid[len(with_cid)/2:]
cidcount = [0,0,0,0,0,0,0,0,0,0] 
for t in train:
    cidcount[int(t['categoryID'])] += 1
 
most_popular = mostpop(cidcount)
poprank = dict()
for i in range(1,11):
    poprank[most_popular[-i][1]] = i


prepare data for Question 5:


In [8]:
print 'prepare data for Question 5:'
train_user_cids = dict()
predict_visit = dict()
for t in train:
    u,c = t['userID'],t['categoryID']
    if not u in train_user_cids.keys():
        train_user_cids[u] = [0,0,0,0,0,0,0,0,0,0]
    train_user_cids[u][int(c)] += 1
    
for u in train_user_cids.keys():
    mp = mostpop(train_user_cids[u])
    if mp[-1][0] != mp[-2][0]:
        predict_visit[u] = mp[-1][1]
    else:
        if poprank[mp[-1][1]] < poprank[mp[-2][1]]:
            predict_visit[u] = mp[-1][1]
        else:
            predict_visit[u] = mp[-2][1]    
    
        

prepare data for Question 5:


In [9]:
print 'continue Question 5:'
samples = float(0)
correct = float(0)
nosee = 0
for t in valid:
    samples += 1
    u,c  = t['userID'],t['categoryID']
    if not u in predict_visit.keys():
        predict_cid  = 0
        nosee +=1
    else:
        predict_cid = predict_visit[u]
    if int(c) == int(predict_cid):
        correct += 1
print 'accuracy: ',correct/samples

continue Question 5:
accuracy:  0.303265143313


In [10]:
import string
print 'Question 6:'

def feq_500_word_in_cat(cat):
    wordCount = defaultdict(int)
    punctuation = set(string.punctuation)
    for t in train:
        if int(t['categoryID']) == int(cat):
            r = ''.join([c for c in t['reviewText'].lower() if not c in punctuation])
            for w in r.split():
                wordCount[w] += 1
    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort()
    counts.reverse()
    words500 = [x[1] for x in counts[:500]]
    counts500 = [x[0] for x in counts[:500]]
    return counts[:500],wordCount

def more_feq_in(cat,wordCount,total_count_500):
    print 'in category ',cat
    morefq = list()
    ci,wc = feq_500_word_in_cat(cat)
    wi_500 =  [x[1] for x in ci[:500]]
    ci_500 = [x[0] for x in ci[:500]]
    ci_total500 = sum(ci_500)
    for i in range(500):
        w = wi_500[i]
        c_in_i = wc[w]
        fq_in_i = float(c_in_i)/ci_total500
        c_w_app = wordCount[w]
        fq_w = float(c_w_app)/total_count_500
        morefq.append((fq_in_i - fq_w ,w))
    morefq.sort()
    morefq.reverse()
    print 'the 10 most freq words compare to other category:'
    print morefq[:10]


wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for t in train:
    r = ''.join([c for c in t['reviewText'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words500 = [x[1] for x in counts[:500]]
counts500 = [x[0] for x in counts[:500]]
total_count_500 = sum(counts500)

for i in range(10):
    more_feq_in(i,wordCount,total_count_500)


Question 6:
in category  0
the 10 most freq words compare to other category:
[(0.002175286194542249, u'was'), (0.001500068333356113, u'brunch'), (0.0013399477708666598, u'food'), (0.0011746499194647225, u'breakfast'), (0.0011005589218836542, u'the'), (0.0009329331391129273, u'menu'), (0.0009089905274235052, u'had'), (0.000886761295411112, u'we'), (0.0008312637692096166, u'service'), (0.0007225987727119151, u'cheese')]
in category  1
the 10 most freq words compare to other category:
[(0.00701462999249837, u'a'), (0.006821227082725281, u'bar'), (0.004088571612044616, u'beer'), (0.003733732714768682, u'drinks'), (0.003436296934870655, u'to'), (0.0023448711410517186, u'music'), (0.0022085462291689412, u'drink'), (0.002184483474731238, u'place'), (0.0019377070246698307, u'great'), (0.0019194253737134272, u'on')]
in category  2
the 10 most freq words compare to other category:
[(0.0036505782016691274, u'sushi'), (0.0034279808752587784, u'thai'), (0.00283088925374733, u'food'), (0.00188192303

In [11]:
from sklearn import svm
from collections import defaultdict
import string

print 'Question 7:'

def featQ7 (t,word500Pos):
    feat = [0]*500
    punctuation = set(string.punctuation)
    r = ''.join([c for c in t['reviewText'].lower() if not c in punctuation])
    for w in r.split():
        feat[word500Pos[w]] = 1
    return feat

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for t in train:
    r = ''.join([c for c in t['reviewText'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words500 = [x[1] for x in counts[:500]]
word500Pos = defaultdict(int)
for i in range(len(words500)):
    word500Pos[words500[i]] = i
X_train = [featQ7(t,word500Pos) for t in train]
X_valid = [featQ7(t,word500Pos) for t in valid]
y_train = [ 1 == int(t['categoryID'])for t in train]
y_valid = [ int(t['categoryID']) for t in valid]
bestc = -1
bestacc = -1
print 'data is ready'
for c in [0.01,0.1,1,10,100]:    
    clf = svm.SVC(C=c)
    clf.fit(X_train, y_train)
    valid_pred = clf.predict(X_valid)
    acc = [(pre and val) for (pre,val) in zip(valid_pred,y_valid)]
    correct = sum (acc)
    accuracy = float(correct)/len(acc)
    print 'c = ',c,' with accuracy: ', accuracy
    if accuracy > bestacc:
        bestacc = accuracy
        bestc = c
print 'c = ',bestc,' has highest accuracy:',bestacc

Question 7:
data is ready
c =  0.01  with accuracy:  0.0
c =  0.1  with accuracy:  0.0
c =  1  with accuracy:  2.8491651946e-05
c =  10  with accuracy:  0.0359849564078


KeyboardInterrupt: 

In [None]:
print 'Question 8:'

def featQ8(t,word500Pos):
    feat = [0]*500
    punctuation = set(string.punctuation)
    r = ''.join([c for c in t['reviewText'].lower() if not c in punctuation])
    for w in r.split():
        feat[word500Pos[w]] = 1
    return feat

def train_catsvm(cat,train,c,X_train):
    y_train = [int(t['categoryID']) == int(cat) for t in train]
    clf = svm.SVC(C=c)
    clf.fit(X_train,y_train)
    return clf

def find_best_c_for_catsvm(tarin,valid,cat,X_train,X_valid):
    y_valid = [int(cat) == int(t['categoryID'])for t in valid]
     #y_valid = [ int(t['categoryID'])for t in valid]
    bestc = -1
    bestacc = -1
    for c in [0.01,0.1,1,10,100]:    
        clf = train_catsvm(cat,train,c,X_train)
        valid_pred = clf.predict(X_valid)
        acc = [(pre == val) for (pre,val) in zip(valid_pred,y_valid)]
        correct = sum (acc)
        accuracy = float(correct)/len(acc)
        if accuracy > bestacc:
            bestacc = accuracy
            bestc = c
            
    return bestc

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for t in train:
    r = ''.join([c for c in t['reviewText'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
word500 = [x[1] for x in counts[:500]]
word500Pos = defaultdict(int)
for i in range(len(words500)):
    word500Pos[words500[i]] = i
X_train = [featQ8(t,word500Pos) for t in train]
X_valid = [featQ8(t,word500Pos) for t in valid]
y_valid_cid = [int(rev['categoryID']) for rev in valid]
best_c = [find_best_c_for_catsvm(train,valid,cat,X_train,X_valid) for cat in range(10)]
best_catsvm = [train_catsvm(cat,train,best_c[cat],X_train)for cat in range(10)]
print 'Data ready'
correct = flaot(0)
samples = len(y_valid_cid) 
for i in len(y_valid):
    best_score = -1
    best_guess = -1
    for cat in range(10):
        clf = best_catsvm[cat]
        score = clf.decision_function(X_valid[i])
        if score > best_score:
            best_score = score
            best_guess = cat
    if y_valid_cid[i] == best_guess:
        correct += 1
print 'Accuracy: ',correct/samples
    

Question 8:
