In [11]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.core.display import HTML
import matplotlib.pyplot as plt

In [12]:
#reading the abstracts in to a Dataframes
mfgDf = pd.DataFrame.from_csv('../../data/positive_data.csv', index_col=None)
nsfDf = pd.DataFrame.from_csv('../../data/negative_data.csv', index_col=None)

In [13]:
len(nsfDf)

29347

In [14]:
#draw samples of 1000 from MFG and NON-MFG and add the flags
mfgDfSample = mfgDf.sample(1500)
mfgDfSample["sample_type"] = "MFG"

nsfDfSample = nsfDf.sample(1500)
nsfDfSample["sample_type"] = "NON-MFG"

In [15]:
#merging two dataframes to get a master data frame
#from which we will draw train and test samples
df = mfgDfSample.append(nsfDfSample)
df["txt"] = df["title"]
#+ " " + df["abstract"]

In [16]:
df.head(3)

Unnamed: 0,id,title,abstract,event_name,event_year,sample_type,txt
3895,CIRP_20110101_2011-60-1-0207-Dn,Early design verification of complex assembly ...,"Design verification in the digital domain, usi...",CIRP,2011.0,MFG,Early design verification of complex assembly ...
3908,CIRP_20110101_2011-60-1-0259-E,Investigations on heat regulation of additive ...,By using additive manufacturing processes vari...,CIRP,2011.0,MFG,Investigations on heat regulation of additive ...
2573,NSF_20150101_1503177,CAREER: Cooperative Motion Planning for Human-...,This proposal outlines a research and educatio...,NSF,2015.0,MFG,CAREER: Cooperative Motion Planning for Human-...


In [17]:
#training and test hold out set
from sklearn.cross_validation import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(df['txt'].values.astype('U'), 
                                              (df["sample_type"]=="MFG").values,random_state=5)

In [18]:
#count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer =  CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
        vocabulary=None)

In [19]:
# vectorizing the training corpus
corpus = Xtrain
X = vectorizer.fit_transform(corpus)

In [20]:
X.shape

(2250, 23776)

In [21]:
#vectorizer.vocabulary_.get(u'finite')

In [22]:
#TFIDF
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer   

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [23]:
tfidf = transformer.fit_transform(X.toarray())

In [24]:
X_tfidf = tfidf.toarray()

In [25]:
X_tfidf

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [26]:
y_target = ytrain

In [27]:
#dimensionality reduction using Univariate statistics/selectKBest chi2
#http://scikit-learn.org/stable/modules/feature_selection.html
#??? IS THIS NECESSARY? Can the pipeline take care of this?
#??? IF I do this, how do I know the same features are selected in test or real life text?
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2
#X_new = SelectKBest(chi2, k=15).fit_transform(X_tfidf, y_target)
#X_new.shape

In [28]:
############################################################################
### trying the pipeline using Multinomial classifier
############################################################################

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])

In [30]:
text_clf = text_clf.fit(Xtrain, ytrain)

In [31]:
###evaluating prediction performance

In [32]:
import numpy as np

In [33]:
predicted = text_clf.predict(Xtest)

In [34]:
np.mean(predicted == ytest)

0.80133333333333334

In [35]:
############################################################################
### trying the pipeline using SGD classifier
############################################################################

In [36]:
from sklearn.linear_model import SGDClassifier
text_clf_sg = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
 ])

In [37]:
text_clf_sg = text_clf_sg.fit(Xtrain, ytrain)
predicted = text_clf_sg.predict(Xtest)
np.mean(predicted == ytest)    

0.80933333333333335

In [38]:
# reding the metrics
from sklearn import metrics
print(metrics.classification_report(ytest, predicted))

             precision    recall  f1-score   support

      False       0.81      0.80      0.80       367
       True       0.81      0.81      0.81       383

avg / total       0.81      0.81      0.81       750



In [39]:
## reading the confusion matrix
metrics.confusion_matrix(ytest, predicted)

array([[295,  72],
       [ 71, 312]])

In [40]:
##################################################
# Parameter tuning using grid search             #
##################################################

In [41]:
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
}

In [42]:
gs_clf = GridSearchCV(text_clf_sg, parameters, n_jobs=-1)

In [43]:
gs_clf = gs_clf.fit(Xtrain, ytrain)

In [55]:
gs_clf.predict(["he police chief said the video did not definitively show the victim pointing a gun, but said it supported the police’s account of events."])

array([False], dtype=bool)

In [56]:
###This doenst look right
gs_clf.predict(["does googling help you find more things"])

array([False], dtype=bool)

In [57]:
gs_clf.predict(["Trajectory generation and control of a 9 axis CNC micromachining center"])

array([ True], dtype=bool)

In [58]:
gs_clf.predict(["Process simulation integrated tool axis selection for 5-axis tool path generation"])

array([ True], dtype=bool)

In [59]:
gs_clf.predict(["The NFL joins the data revolution in sports"])

array([False], dtype=bool)

In [60]:
gs_clf.predict(["Better person-machine communication designed to help prevent accidents"])

array([ True], dtype=bool)

In [46]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [64]:
gs_clf.predict(["Holographic imaging and deep learning diagnose malaria"])

array([False], dtype=bool)

In [65]:
gs_clf.predict(["Doctor: Hillary Clinton 'Healthy and Fit to Serve as President'"])

array([False], dtype=bool)

In [69]:
gs_clf.predict(["Multi-objective allocation of customized orders to production-line networks"])

array([ True], dtype=bool)

In [47]:
score              

0.81999999999999995