# Programming Assignment 4
## 605.744 Information Retrieval
### Justin Ely

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [124]:
def concat(*args):
    """ Concatenate any numer of input streams """
    items = []
    
    for item in args:
        if isinstance(item, str):
            items.append(item)
            
    return ' '.join(items)

In [47]:
#-----------------------------------------------------------------------------------------

def precision(predicted, truth, verbose=True):
    """ Calculate precision of predicted values """
    total = 0
    correct = 0
    
    for p, t in zip(predicted, truth):
        if p == 1:
            total += 1
            
            if t == 1:
                correct += 1

    if verbose:
        print("Precision {}/{} = {}".format(correct, total, correct/total))
    
    return correct/total
                
#-----------------------------------------------------------------------------------------
    
def recall(predicted, truth, verbose=True):
    """ Calculate recal of predicted values """
    total = 0
    correct = 0
    
    for p, t in zip(predicted, truth):
        if t == 1:
            total += 1
            
            if p == 1:
                correct += 1
    if verbose:
        print("Recal {}/{} = {}".format(correct, total, correct/total))
    
    return correct/total

#-----------------------------------------------------------------------------------------
        
def f1(predicted, truth):
    """ Calculate F1 score """
    p = precision(predicted, truth, False)
    r = recall(predicted, truth, False)
    
    return 2*p*r/(p+r)

#-----------------------------------------------------------------------------------------

### Data loading

Here we load the supplied data for initial training, development, and final testing.  We use the pandas library to parse the TSV files and load them each into separate dataframes

In [2]:
train = pd.read_csv('phase1.train.shuf.tsv',
                   sep='\t',
                   header=None,
                   names=["assessment", "docid", "title", "authors", "journal", "issn", "year", "language", "abstract", "keywords"])

In [3]:
dev = pd.read_csv('phase1.dev.shuf.tsv',
                   sep='\t',
                   header=None,
                   names=["assessment", "docid", "title", "authors", "journal", "issn", "year", "language", "abstract", "keywords"])

In [140]:
test = pd.read_csv('phase1.test.shuf.tsv',
                   sep='\t',
                   header=None,
                   names=["assessment", "docid", "title", "authors", "journal", "issn", "year", "language", "abstract", "keywords"])

In [49]:
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
#all_abs = [item if isinstance(item, str) else '' for item in train['title']]
#train_counts = vectorizer.fit_transform(all_abs)

In [7]:
dev_text = vectorizer.transform(dev['title'])
print(dev_text[0])

  (0, 2937)	1
  (0, 5316)	1
  (0, 7586)	1
  (0, 8059)	2
  (0, 9061)	1
  (0, 10792)	1
  (0, 11906)	1
  (0, 12570)	1


In [165]:
clf = svm.SVC().fit(train_counts, train['assessment'])
print(clf)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [168]:
preds = clf.predict(dev_text)
preds[preds>0]

array([], dtype=int64)

## Bayes

#### Title Data

In [50]:
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(train['title'])

In [59]:
#clf = MultinomialNB().fit(train_counts, train['assessment'])
clf = BernoulliNB(alpha=.001).fit(train_counts, train['assessment'])

BernoulliNB(alpha=0.001, binarize=0.0, class_prior=None, fit_prior=True)


In [60]:
preds = clf.predict(train_counts)

recall(preds, train['assessment'])
precision(preds, train['assessment'])
f1(preds, train['assessment'])

Recal 633/695 = 0.9107913669064748
Precision 633/1069 = 0.5921421889616464


0.717687074829932

In [80]:
dev_text = vectorizer.transform(dev['title'])
preds = clf.predict(dev_text)

recall(preds, dev['assessment'])
precision(preds, dev['assessment'])
f1(preds, dev['assessment'])

Recal 25/150 = 0.16666666666666666
Precision 25/39 = 0.6410256410256411


0.2645502645502646

In [181]:
clf.predict(vectorizer.transform([train.iloc[133]['abstract']]))

array([-1])

### Title + Abstract + Keywords

In [133]:
train_data = [concat(*item) for item in zip(train['title'], train['abstract'], train['keywords'])]

vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(train_data)

clf = BernoulliNB(alpha=.001).fit(train_counts, train['assessment'])

In [134]:
train_data = [concat(*item) for item in zip(dev['title'] + dev['abstract'] + dev['keywords'])]
dev_text = vectorizer.transform(train_data)
preds = clf.predict(dev_text)

recall(preds, dev['assessment'])
precision(preds, dev['assessment'])
f1(preds, dev['assessment'])

Recal 61/150 = 0.4066666666666667
Precision 61/158 = 0.3860759493670886


0.3961038961038961

## SVM

In [86]:
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(train['title'])

#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
#all_abs = [item if isinstance(item, str) else '' for item in train['title']]
#train_counts = vectorizer.fit_transform(train['title'])

In [96]:
clf = svm.LinearSVC(C=1).fit(train_counts, train['assessment'])
print(clf)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [97]:
preds = clf.predict(train_counts)

recall(preds, train['assessment'])
precision(preds, train['assessment'])
f1(preds, train['assessment'])

Recal 605/695 = 0.8705035971223022
Precision 605/616 = 0.9821428571428571


0.9229595728451564

In [98]:
dev_text = vectorizer.transform(dev['title'])
preds = clf.predict(dev_text)

recall(preds, dev['assessment'])
precision(preds, dev['assessment'])
f1(preds, dev['assessment'])

Recal 41/150 = 0.2733333333333333
Precision 41/110 = 0.37272727272727274


0.3153846153846154