In [272]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.core.display import HTML
import matplotlib.pyplot as plt

In [273]:
#reading the abstracts in to a Dataframes
mfgDf = pd.DataFrame.from_csv('../../data/positive_data.csv', index_col=None)
nsfDf = pd.DataFrame.from_csv('../../data/negative_data.csv', index_col=None)

In [274]:
len(nsfDf)

29347

In [275]:
#draw samples of 1000 from MFG and NON-MFG and add the flags
mfgDfSample = mfgDf.sample(1500)
mfgDfSample["sample_type"] = "MFG"

nsfDfSample = nsfDf.sample(1500)
nsfDfSample["sample_type"] = "NON-MFG"

In [276]:
#merging two dataframes to get a master data frame
#from which we will draw train and test samples
df = mfgDfSample.append(nsfDfSample)
df["txt"] = df["title"] + " " + df["abstract"]

In [277]:
df.head(3)

Unnamed: 0,id,title,abstract,event_name,event_year,sample_type,txt
567,NSF_20130101_1331989,I/UCRC FRP: Collaborative Research: The Physic...,Program Director's Recommendation <br/>Center ...,NSF,2013.0,MFG,I/UCRC FRP: Collaborative Research: The Physic...
785,NSF_20130101_1344201,INSPIRE Track 1: UDiscoverIt: Integrating Expe...,This INSPIRE award is partially funded by the ...,NSF,2013.0,MFG,INSPIRE Track 1: UDiscoverIt: Integrating Expe...
3244,NSF_20150101_1551138,A Vision-Based Technique for Damage Assessment...,Structural health monitoring is an important s...,NSF,2015.0,MFG,A Vision-Based Technique for Damage Assessment...


In [278]:
#training and test hold out set
from sklearn.cross_validation import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(df['txt'].values.astype('U'), 
                                              (df["sample_type"]=="MFG").values,random_state=5)

In [279]:
#count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer =  CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
        vocabulary=None)

In [290]:
# vectorizing the training corpus
corpus = Xtrain
X = vectorizer.fit_transform(corpus)

In [291]:
X.shape

(2250, 306110)

In [282]:
#vectorizer.vocabulary_.get(u'finite')

In [294]:
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer   

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [295]:
tfidf = transformer.fit_transform(X.toarray())

In [296]:
X_tfidf = tfidf.toarray()

In [297]:
X_tfidf

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [298]:
y_target = ytrain

In [299]:
X_tfidf.shape

(2250, 306110)

In [300]:
ytrain.shape

(2250,)

In [301]:
#LOGIT
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_tfidf,ytrain)
#print(accuracy_score(clf.predict(transformer.fit_transform(vectorizer.fit_transform(Xtest).toarray()).toarray()),ytest))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [307]:
test_vect = vectorizer.fit_transform(Xtest)

In [311]:
test_tfidf = transformer.fit_transform(test_vect.toarray())
x_test_tfidf = test_tfidf.toarray()

In [320]:
x_test_tfidf.shape

(750, 136116)

In [321]:
from sklearn.metrics import accuracy_score
#accuracy_score(clf.predict(x_test_tfidf),ytest)
## The number of features in the train doesnt match train
## Is this why we need dimensionality reduction

In [197]:
#dimensionality reduction using Univariate statistics/selectKBest chi2
#http://scikit-learn.org/stable/modules/feature_selection.html
#??? IS THIS NECESSARY? Can the pipeline take care of this?
#??? IF I do this, how do I know the same features are selected in test or real life text?
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2
#X_new = SelectKBest(chi2, k=15).fit_transform(X_tfidf, y_target)
#X_new.shape

In [198]:
############################################################################
### trying the pipeline using Multinomial classifier
############################################################################

In [199]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])

In [200]:
text_clf = text_clf.fit(Xtrain, ytrain)

In [201]:
###evaluating prediction performance

In [202]:
import numpy as np

In [203]:
predicted = text_clf.predict(Xtest)

In [204]:
np.mean(predicted == ytest)

0.85733333333333328

In [205]:
############################################################################
### trying the pipeline using SGD classifier
############################################################################

In [206]:
from sklearn.linear_model import SGDClassifier
text_clf_sg = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
 ])

In [218]:
text_clf_sg = text_clf_sg.fit(Xtrain, ytrain)
predicted = text_clf_sg.predict(Xtest)
np.mean(predicted == ytest)    

0.86266666666666669

In [219]:
# reding the metrics
from sklearn import metrics
print(metrics.classification_report(ytest, predicted))

             precision    recall  f1-score   support

      False       0.92      0.78      0.85       367
       True       0.82      0.94      0.87       383

avg / total       0.87      0.86      0.86       750



In [221]:
## reading the confusion matrix
metrics.confusion_matrix(ytest, predicted)

array([[288,  79],
       [ 24, 359]])

In [222]:
##################################################
# Parameter tuning using grid search             #
##################################################

In [245]:
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
}

In [253]:
gs_clf = GridSearchCV(text_clf_sg, parameters, n_jobs=-1)

In [255]:
gs_clf = gs_clf.fit(Xtrain, ytrain)

In [256]:
gs_clf.predict(["he police chief said the video did not definitively show the victim pointing a gun, but said it supported the police’s account of events."])

array([ True], dtype=bool)

In [259]:
###This doenst look right
gs_clf.predict(["ddadfa asdd"])

array([ True], dtype=bool)

In [265]:
gs_clf.predict(["Holographic imaging and deep learning diagnose malaria"])

array([ True], dtype=bool)

In [266]:
gs_clf.predict(["Doctor: Hillary Clinton 'Healthy and Fit to Serve as President'"])

array([ True], dtype=bool)

In [260]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [261]:
score              

0.88177777777777777