In [70]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.core.display import HTML
import matplotlib.pyplot as plt

In [71]:
# Read the abstracts into DataFrames: positive_data.csv feeds the MFG pool,
# negative_data.csv the NON-MFG pool.
# pd.read_csv replaces DataFrame.from_csv, which was deprecated in pandas 0.21
# and removed in 1.0. index_col=None keeps every CSV column as a regular column.
mfgDf = pd.read_csv('../../data/positive_data.csv', index_col=None)
nsfDf = pd.read_csv('../../data/negative_data.csv', index_col=None)

In [72]:
# Number of rows available in the negative (NON-MFG) pool before sampling.
len(nsfDf)

29347

In [73]:
# Draw equally sized random samples from the MFG and NON-MFG pools and flag them.
# (An earlier comment said 1000; the code draws 1500 rows per class.)
SAMPLE_SIZE = 1500   # rows drawn per class
SAMPLE_SEED = 5      # fixed seed so Restart & Run All reproduces the same sample

# .assign returns a new frame with the extra column, leaving the source pools untouched.
mfgDfSample = mfgDf.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED).assign(sample_type="MFG")
nsfDfSample = nsfDf.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED).assign(sample_type="NON-MFG")

In [74]:
# Merge the two labelled samples into the master frame from which the
# train and test sets are drawn.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0);
# like append, it keeps each sample's original row labels.
df = pd.concat([mfgDfSample, nsfDfSample])

# The classifier input is the title only; appending the abstract is currently disabled.
df["txt"] = df["title"]
# To include the abstract as well: df["txt"] = df["title"] + " " + df["abstract"]

In [75]:
# Preview the first three rows of the merged frame (including the new sample_type and txt columns).
df.head(3)

Unnamed: 0,id,title,abstract,event_name,event_year,sample_type,txt
2572,NSF_20150101_1545641,Student Travel Fellowships for the 2015 IEEE I...,"""Big Data"" has emerged as a new approach to co...",NSF,2015.0,MFG,Student Travel Fellowships for the 2015 IEEE I...
3567,CIRP_20090101_2009-58-1-0053-C,6-Axis control ultraprecision microgrooving on...,New optical devices with multi-functions such ...,CIRP,2009.0,MFG,6-Axis control ultraprecision microgrooving on...
310,NSF_20130101_1314287,SBIR Phase I: Breast Cancer Risk Test for Atyp...,This Small Business Innovation Research (SBIR)...,NSF,2013.0,MFG,SBIR Phase I: Breast Cancer Risk Test for Atyp...


In [76]:
# Training and hold-out test split.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split

# Features: the text column as a NumPy unicode array.
# Labels: boolean array, True for "MFG" rows and False otherwise.
# random_state pins the shuffle so the split is repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df['txt'].values.astype('U'),
    (df["sample_type"] == "MFG").values,
    random_state=5,
)

In [77]:
# Bag-of-words vectorizer over unigrams and bigrams.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    ngram_range=(1, 2),         # single words and adjacent word pairs
    token_pattern=r'\b\w+\b',   # custom pattern: any run of word characters, one-letter tokens included
    min_df=1,                   # keep terms that appear in at least one document
)
vectorizer  # echo the full configuration

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
        vocabulary=None)

In [78]:
# vectorizing the training corpus
# The vocabulary is fitted on the training texts only; X holds the resulting
# term-count matrix (one row per training document).
corpus = Xtrain
X = vectorizer.fit_transform(corpus)

In [79]:
# (training documents, vocabulary terms) of the count matrix.
X.shape

(2250, 23498)

In [80]:
#vectorizer.vocabulary_.get(u'finite')

In [81]:
#TFIDF
# Stand-alone TF-IDF transformer with default settings; the bare name on the
# last line echoes its configuration.
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer   

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [82]:
# Fit TF-IDF weights on the training counts.
# TfidfTransformer accepts the sparse count matrix directly; densifying it first
# with X.toarray() only allocated a large (documents x terms) array for no benefit.
tfidf = transformer.fit_transform(X)

In [83]:
# Dense copy of the TF-IDF matrix.
# NOTE(review): in this view only the display cell and the commented-out SelectKBest
# experiment use it; a dense (documents x terms) array is memory-hungry.
X_tfidf = tfidf.toarray()

In [84]:
# Peek at the dense TF-IDF matrix (NumPy abbreviates the display).
X_tfidf

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [85]:
# Alias for the training labels; in this view it is referenced only by the
# commented-out SelectKBest experiment.
y_target = ytrain

In [86]:
#dimensionality reduction using Univariate statistics/selectKBest chi2
#http://scikit-learn.org/stable/modules/feature_selection.html
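# TODO: Is this step necessary? Could it simply be another stage of the Pipeline?
# TODO: If it is done here, how do we guarantee that the same features are selected
#       for the test set and for real-world text? (A SelectKBest step fitted inside the
#       Pipeline would reuse the features chosen on the training data at predict time.)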
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2
#X_new = SelectKBest(chi2, k=15).fit_transform(X_tfidf, y_target)
#X_new.shape

In [87]:
############################################################################
### trying the pipeline using Multinomial classifier
############################################################################

In [88]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Baseline text classifier: token counts -> TF-IDF weights -> multinomial Naive Bayes.
# Every stage is constructed with its default settings.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)

In [89]:
# Train the Naive Bayes pipeline on the raw training texts and their boolean labels.
text_clf = text_clf.fit(Xtrain, ytrain)

In [90]:
###evaluating prediction performance

In [91]:
import numpy as np

In [92]:
# Naive Bayes predictions for the hold-out texts (True = MFG).
predicted = text_clf.predict(Xtest)

In [93]:
# Hold-out accuracy: fraction of predictions that match the true labels.
np.mean(predicted == ytest)

0.82399999999999995

In [94]:
############################################################################
### trying the pipeline using SGD classifier
############################################################################

In [95]:
from sklearn.linear_model import SGDClassifier

# Linear classifier with hinge loss and L2 penalty, trained by SGD on TF-IDF features.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21. `max_iter=5`
# together with `tol=None` reproduces the old behaviour of exactly five passes over
# the training data (without tol=None, newer releases may stop earlier).
text_clf_sg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, random_state=42)),
])

In [96]:
# Train the SGD pipeline, predict the hold-out set and show the accuracy.
# Note: `predicted` now holds the SGD predictions, replacing the Naive Bayes ones.
text_clf_sg.fit(Xtrain, ytrain)
predicted = text_clf_sg.predict(Xtest)
(predicted == ytest).mean()

0.83066666666666666

In [97]:
# Reading the metrics: per-class precision, recall and F1 on the hold-out set.
# `predicted` was last assigned from the SGD pipeline, so this report describes it.
from sklearn import metrics
print(metrics.classification_report(ytest, predicted))

             precision    recall  f1-score   support

      False       0.82      0.83      0.83       367
       True       0.84      0.83      0.83       383

avg / total       0.83      0.83      0.83       750



In [98]:
## Reading the confusion matrix: rows are the true labels, columns the predicted
## labels, both ordered False (NON-MFG) then True (MFG).
metrics.confusion_matrix(ytest, predicted)

array([[306,  61],
       [ 66, 317]])

In [99]:
##################################################
# Parameter tuning using grid search             #
##################################################

In [100]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV

# Smaller alternative grid (disabled):
#   {'vect__ngram_range': [(1, 1), (1, 2)],
#    'tfidf__use_idf': (True, False),
#    'clf__alpha': (1e-2, 1e-3)}

# SGDClassifier's epoch count is called `n_iter` in old releases and `max_iter`
# in current ones (`n_iter` was removed in 0.21). Tune whichever one the SGD
# pipeline actually sets, so the search is neither rejected as an unknown
# parameter nor silently overridden by a leftover `n_iter`.
iter_param = ('clf__n_iter'
              if text_clf_sg.get_params().get('clf__n_iter') is not None
              else 'clf__max_iter')

# Keys follow the '<pipeline step>__<parameter>' convention.
# The grid has 3 * 4 * 2 * 2 * 2 * 2 * 2 * 3 = 1152 combinations, each of which
# is fitted once per cross-validation fold.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    iter_param: (10, 50, 80),
}

In [101]:
# Exhaustive search over `parameters` for the SGD pipeline, using all CPU cores (n_jobs=-1).
# Neither cv nor scoring is passed, so the library defaults apply.
gs_clf = GridSearchCV(text_clf_sg, parameters, n_jobs=-1)

In [102]:
# Run the search on the training split only; the hold-out set stays unseen.
# This can take a while given the size of the grid.
gs_clf = gs_clf.fit(Xtrain, ytrain)

In [103]:
# Spot-check the tuned model on a general news sentence (True = MFG, False = NON-MFG).
gs_clf.predict(["he police chief said the video did not definitively show the victim pointing a gun, but said it supported the police’s account of events."])

array([False], dtype=bool)

In [104]:
### This doesn't look right
# NOTE(review): it is unclear which prediction the remark above refers to — TODO confirm.
gs_clf.predict(["does googling help you find more things"])

array([False], dtype=bool)

In [105]:
# Spot-check: a CNC micromachining title (manufacturing-flavoured input).
gs_clf.predict(["Trajectory generation and control of a 9 axis CNC micromachining center"])

array([ True], dtype=bool)

In [106]:
# Spot-check: a 5-axis tool-path title (manufacturing-flavoured input).
gs_clf.predict(["Process simulation integrated tool axis selection for 5-axis tool path generation"])

array([ True], dtype=bool)

In [107]:
# Spot-check: a sports headline.
gs_clf.predict(["The NFL joins the data revolution in sports"])

array([False], dtype=bool)

In [108]:
# Spot-check: a person-machine communication headline; the recorded run labelled it True (MFG).
gs_clf.predict(["Better person-machine communication designed to help prevent accidents"])

array([ True], dtype=bool)

In [109]:
# Report the winning parameter combination and keep its mean cross-validated score.
# best_params_ / best_score_ replace the manual max over grid_scores_, an attribute
# that was deprecated in scikit-learn 0.18 and removed in 0.20.
best_parameters, score = gs_clf.best_params_, gs_clf.best_score_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 1e-05
clf__n_iter: 50
clf__penalty: 'l2'
tfidf__norm: 'l2'
tfidf__use_idf: True
vect__max_df: 0.5
vect__max_features: None
vect__ngram_range: (1, 2)


In [110]:
# Spot-check: a medical-imaging headline; the recorded run labelled it True (MFG).
# NOTE(review): this looks like a false positive.
gs_clf.predict(["Holographic imaging and deep learning diagnose malaria"])

array([ True], dtype=bool)

In [111]:
# Spot-check: a political news headline.
gs_clf.predict(["Doctor: Hillary Clinton 'Healthy and Fit to Serve as President'"])

array([False], dtype=bool)

In [112]:
# Spot-check: a production-line order allocation title (manufacturing-flavoured input).
gs_clf.predict(["Multi-objective allocation of customized orders to production-line networks"])

array([ True], dtype=bool)

In [118]:
# Best grid-search score captured when the best parameters were unpacked.
score              

0.82222222222222219

In [114]:
# Spot-check: a manufacturing / Internet-of-Things headline.
gs_clf.predict(["The Internet of Things Will Make Manufacturing Smarter"])

array([ True], dtype=bool)

In [128]:
# Spot-check: a heart-surgery risk title; the recorded run labelled it True (MFG).
# NOTE(review): this looks like a false positive.
gs_clf.predict(["A method of uniform stratification of risk for evaluating the results of surgery in acquired adult heart disease."])

array([ True], dtype=bool)

In [125]:
# Spot-check: another heart-surgery title.
gs_clf.predict(["Adult open heart surgery in New York State: an analysis of risk factors and hospital mortality rates"])

array([False], dtype=bool)

In [117]:
# Spot-check: a mathematics (fractals) title; the recorded run labelled it True (MFG).
# NOTE(review): this looks like a false positive.
gs_clf.predict(["The beauty of fractals: images of complex dynamical systems"])

array([ True], dtype=bool)

In [122]:
# Hold-out accuracy of the grid-searched model: share of test predictions matching the labels.
gspredicted = gs_clf.predict(Xtest)
(gspredicted == ytest).mean()

0.83333333333333337

In [130]:
# Spot-check: a medical research guidance title.
gs_clf.predict(["Developing and evaluating complex interventions: the new Medical Research Council guidance"])

array([False], dtype=bool)

In [131]:
# Spot-check: a lathe / cutting-tools title (manufacturing-flavoured input).
gs_clf.predict(["Lathe Operations Types and Cutting Tools"])

array([ True], dtype=bool)