In [490]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.core.display import HTML
import matplotlib.pyplot as plt

In [491]:
#reading the abstracts in to a Dataframes
mfgDf = pd.DataFrame.from_csv('../../data/positive_data.csv', index_col=None)
nsfDf = pd.DataFrame.from_csv('../../data/negative_data.csv', index_col=None)

In [492]:
#draw samples of 1000 from MFG and NON-MFG and add the flags
mfgDfSample = mfgDf.sample(1000)
mfgDfSample["sample_type"] = "MFG"

nsfDfSample = nsfDf.sample(1000)
nsfDfSample["sample_type"] = "NON-MFG"

In [493]:
#merging two dataframes to get a master data frame
#from which we will draw train and test samples
df = mfgDfSample.append(nsfDfSample)
df["txt"] = df["title"] + " " + df["abstract"]

In [494]:
#creating training and test hold out set
from sklearn.cross_validation import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(df['txt'].values.astype('U'), 
                                              (df["sample_type"]=="MFG").values,random_state=5)

In [495]:
#count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer =  CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)

In [496]:
#creating a vect tranformer and vectorizing the training corpus
corpus = Xtrain
vectorizer = vectorizer.fit(corpus)
X = vectorizer.transform(corpus)
feature_names = np.asarray(vectorizer.get_feature_names())

In [497]:
X.shape

(1500, 230406)

In [498]:
#fitting a tfidf tranformer and tranforming the vector
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf_transformer = transformer.fit(X.toarray())
tfidf = tfidf_transformer.transform(X.toarray())

In [499]:
#training X and y
X_tfidf = tfidf.toarray()
y_target = ytrain

#hold out/test X and y
X_test_tfidf = tfidf_transformer.transform(vectorizer.transform(Xtest).toarray()).toarray()
y_test_target = ytest

In [500]:
## Dimensionality Reduction ### Is this the right way for dimensionality reduction?

In [501]:
#dimensionality reduction using Univariate statistics/selectKBest chi2
#http://scikit-learn.org/stable/modules/feature_selection.html
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
skb = SelectKBest(chi2, k=4000)

#creating training and test X for dimensionaly reduced data
X_dr_train = skb.fit_transform(X_tfidf, y_target)
X_dr_test = skb.transform(X_test_tfidf)

In [502]:
feature_names_dr = []
feature_names_dr = [ feature_names[i] for i in skb.get_support(indices=True)] 

In [503]:
#performing logistic regression on the data
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_tfidf,ytrain)
from sklearn.metrics import accuracy_score
#calculating accuracy of the classifier
print(accuracy_score(clf.predict(X_test_tfidf),y_test_target))

0.862


In [504]:
#performing logistic regression on the dimensionally reduced data
from sklearn.linear_model import LogisticRegression
clf_dr = LogisticRegression()
clf_dr.fit(X_dr_train,y_target)
#calculating accuracy of the classifier
print(accuracy_score(clf_dr.predict(X_dr_test),y_test_target))

0.854


In [505]:
############################################################################
### Using Multinomial classifier
############################################################################

In [506]:
#performing multinomial NB on data
from sklearn.naive_bayes import MultinomialNB
clf_mn = MultinomialNB()
clf_mn.fit(X_tfidf, y_target)
accuracy_score(clf_mn.predict(X_test_tfidf), y_test_target)

0.85399999999999998

In [507]:
#performing multinomial NB on reduced data
from sklearn.naive_bayes import MultinomialNB
clf_mn_dr = MultinomialNB()
clf_mn_dr.fit(X_dr_train, y_target)
print(accuracy_score(clf_mn_dr.predict(X_dr_test),y_test_target))

0.874


In [508]:
############################################################################
### Using SGD classifier
############################################################################

In [509]:
#performing SGDClassifier on data
from sklearn.linear_model import SGDClassifier
clf_sgd = SGDClassifier()
clf_sgd.fit(X_tfidf, y_target)
accuracy_score(clf_sgd.predict(X_test_tfidf), y_test_target)

0.874

In [510]:
#performing SGDClassifier on reduced data
from sklearn.linear_model import SGDClassifier
clf_sgd_dr = SGDClassifier()
clf_sgd_dr.fit(X_dr_train, y_target)
accuracy_score(clf_sgd_dr.predict(X_dr_test), y_test_target)

0.82199999999999995

In [511]:
############################################################################
### trying the pipeline using Multinomial classifier
############################################################################

In [512]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
clf_mn_pl = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])
clf_mn_pl = clf_mn_pl.fit(Xtrain, ytrain)
print(accuracy_score(clf_mn_pl.predict(Xtest),ytest))

0.846


In [513]:
#with dimensionality reduction????????? Not sure if this is the right way to pass parameters.
clf_mn_pl_dr = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])

'''ps = {
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1))  # unigrams or bigrams    
    }
    '''
#clf_mn_pl_dr.set_params(vect__max_features=10000)
clf_mn_pl_dr = clf_mn_pl_dr.set_params(vect__max_features=10000).fit(Xtrain, ytrain)
print(accuracy_score(clf_mn_pl_dr.predict(Xtest),ytest))

0.844


In [514]:
############################################################################
### trying the pipeline using SGD classifier
############################################################################

In [515]:
from sklearn.linear_model import SGDClassifier
text_clf_sg = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
 ])
text_clf_sg = text_clf_sg.fit(Xtrain, ytrain)
predicted = text_clf_sg.predict(Xtest)
np.mean(predicted == ytest)    

0.878

In [516]:
# reading the metrics
from sklearn import metrics
print(metrics.classification_report(ytest, predicted))

             precision    recall  f1-score   support

      False       0.85      0.90      0.88       242
       True       0.90      0.86      0.88       258

avg / total       0.88      0.88      0.88       500



In [517]:
## reading the confusion matrix
metrics.confusion_matrix(ytest, predicted)

array([[218,  24],
       [ 37, 221]])

In [518]:
##################################################
# Parameter tuning using grid search             #
##################################################

In [531]:
from sklearn.grid_search import GridSearchCV
'''
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
}
'''

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
}

In [532]:
gs_clf = GridSearchCV(text_clf_sg, parameters, n_jobs=-1)

In [533]:
gs_clf = gs_clf.fit(Xtrain, ytrain)

In [534]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 1e-05
clf__n_iter: 50
clf__penalty: 'elasticnet'
vect__max_df: 0.75
vect__max_features: None
vect__ngram_range: (1, 2)


In [535]:
score              

0.87133333333333329

In [539]:
gspredicted = gs_clf.predict(Xtest)
np.mean(gspredicted == ytest)  

0.88800000000000001