In [113]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from util import DenseTransformer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Load the dataset: the first CSV column becomes the index and is parsed as dates.
df = pd.read_csv(
    './data/dataset.csv',
    parse_dates=True,
    index_col=0,
)

In [13]:
# Preview the first rows to sanity-check the parsed index and the available columns.
df.head()

Unnamed: 0,close,d,date,l,pct_change,pt,t
2006-01-03,1268.8,1/3/2006,2006-01-03 00:00:00,/newsevents/pressreleases/orders20060103a.htm,-0.003659,Orders on Banking Applications,Approval of proposal by Kreditanstalt f&uuml;r...
2006-01-04,1273.46,1/3/2006 2:00:00 PM,2006-01-03 14:00:00,/newsevents/pressreleases/monetary20060103a.htm,-1.6e-05,Monetary Policy,"Minutes of Federal Open Market Committee, Dece..."
2006-01-10,1289.6899,1/9/2006 11:30:00 AM,2006-01-09 11:30:00,/newsevents/pressreleases/other20060109a.htm,-0.00347,Other Announcements,Consumer Advisory Council appointments for 2006
2006-01-10,1289.6899,1/10/2006,2006-01-10 00:00:00,/newsevents/pressreleases/other20060110a.htm,-0.00347,Other Announcements,Reserve Bank income and expense data and trans...
2006-01-11,1294.1801,1/10/2006 2:00:00 PM,2006-01-10 14:00:00,/newsevents/pressreleases/monetary20060110a.htm,0.006314,Monetary Policy,"Minutes of Board discount rate meetings, Decem..."


In [14]:
# Bag-of-words encoding of the text column `t`: learn the vocabulary and
# produce the sparse document-term count matrix in a single pass.
documents = df['t']
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(documents)

In [15]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
X_train_counts

<3373x3166 sparse matrix of type '<class 'numpy.int64'>'
	with 40256 stored elements in Compressed Sparse Row format>

In [33]:
# Learn IDF weights from the raw counts, then re-weight the counts and convert
# the result to a dense array.
tf_transformer = TfidfTransformer()
tf_transformer.fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts).toarray()
X_train_tfidf.shape

(3373, 3166)

In [34]:
# Binary target: 1 when pct_change is strictly positive, 0 otherwise.
y = df['pct_change'].gt(0).astype(int)

In [35]:
from sklearn.naive_bayes import GaussianNB

In [36]:
# Fit a Gaussian naive Bayes classifier on the dense TF-IDF features.
clf = GaussianNB()
clf.fit(X_train_tfidf, y)

In [38]:
# NOTE: predictions are made on X_train_tfidf, the same matrix clf was fitted on,
# so every metric derived from y_pred is an in-sample (training) score.
y_pred = clf.predict(X_train_tfidf)

In [52]:
# Accuracy: fraction of predictions that equal the label.
(y_pred == y).mean()

0.7130151200711533

In [53]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve

In [56]:
# Precision for the positive class (label 1).
precision_score(y_true=y, y_pred=y_pred)

0.6148566814695196

In [57]:
# Recall for the positive class (label 1).
recall_score(y_true=y, y_pred=y_pred)

0.9908913467794405

In [58]:
# The classifier over-predicts class 1: recall is ~0.99, so it captures almost every true 1,
# but precision is only ~0.61, so nearly 40% of its positive predictions are misclassified.

In [61]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [62]:
# Linear SVM trained with SGD (hinge loss, L2 penalty). With tol=None there is
# no tolerance-based stopping, so training runs for exactly max_iter=5 epochs.
sgd_svm = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    tol=None,
    random_state=42,
)
text_clf = make_pipeline(CountVectorizer(), TfidfTransformer(), sgd_svm)

In [78]:
# Raw text column; vectorisation happens inside the pipelines.
X = df['t']

In [79]:
# Seeded random train/test split (test size left at the library default).
# NOTE(review): rows that share an index date also share pct_change (see the two
# 2006-01-10 rows in the preview), so a random split can place same-day rows on
# both sides — consider a date-based or grouped split; confirm before relying on
# the held-out scores.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [80]:
# 5-fold cross-validated scores of the SGD pipeline on the training split.
cross_val_score(estimator=text_clf, X=X_train, y=y_train, cv=5)

array([0.50592885, 0.53754941, 0.53557312, 0.5513834 , 0.52277228])

In [91]:


# Naive Bayes pipeline: counts -> TF-IDF -> DenseTransformer -> GaussianNB.
# DenseTransformer comes from the local `util` module; presumably it converts
# the sparse TF-IDF matrix to a dense array for GaussianNB — confirm in util.
nb_steps = [
    CountVectorizer(),
    TfidfTransformer(),
    DenseTransformer(),
    GaussianNB(),
]
text_nb_clf = make_pipeline(*nb_steps)

In [102]:
# 5-fold cross-validated scores of the naive Bayes pipeline on the training split.
cross_val_score(estimator=text_nb_clf, X=X_train, y=y_train, cv=5)

array([0.47035573, 0.46640316, 0.46837945, 0.48418972, 0.52079208])

In [104]:
from sklearn.svm import SVC

# SVC pipeline with default kernel settings: counts -> TF-IDF -> SVC.
svc_steps = (CountVectorizer(), TfidfTransformer(), SVC(random_state=42))
text_svm = make_pipeline(*svc_steps)

In [105]:
# 5-fold cross-validated scores of the SVC pipeline on the training split.
cross_val_score(estimator=text_svm, X=X_train, y=y_train, cv=5)



array([0.54743083, 0.54743083, 0.54743083, 0.54743083, 0.54851485])

In [111]:
from sklearn.metrics import classification_report

def evaluate_model(model, X=None, y=None, param_grid=None, dense=True):
    """Cross-validate ``model`` on top of the notebook's text features.

    Builds the pipeline CountVectorizer -> TfidfTransformer ->
    [DenseTransformer] -> ``model`` and fits it through ``GridSearchCV``
    with 5-fold cross-validation on all available cores.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline (e.g. ``SVC(random_state=42)``).
    X, y : optional
        Raw texts and labels. When omitted, the notebook-level ``X_train`` /
        ``y_train`` are looked up at *call* time, so the function always sees
        the current split instead of whatever existed when this cell was run.
    param_grid : dict or list of dict, optional
        Grid forwarded to ``GridSearchCV``. Defaults to ``{}``, i.e. a single
        candidate using the pipeline's current parameters (plain CV).
    dense : bool, default True
        Keep the ``DenseTransformer`` step. It is imported from the local
        ``util`` module; presumably it densifies the sparse TF-IDF matrix —
        confirm in util. Pass ``False`` for estimators that accept sparse input.

    Returns
    -------
    GridSearchCV
        The fitted search object; ``cv_results_`` includes train scores.
    """
    # Resolve defaults lazily: binding X_train/y_train in the signature would
    # freeze the objects that existed when the function was defined.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    steps = [CountVectorizer(), TfidfTransformer()]
    if dense:
        steps.append(DenseTransformer())
    steps.append(model)
    pipeline = make_pipeline(*steps)

    grid = GridSearchCV(
        pipeline,
        {} if param_grid is None else param_grid,
        cv=5,
        n_jobs=-1,
        # Requested explicitly: recent scikit-learn releases default to False,
        # which would drop the *_train_score entries from cv_results_.
        return_train_score=True,
    )
    grid.fit(X, y)
    return grid

In [115]:
# Cross-validate a default-kernel SVC through the shared evaluation helper.
grid = evaluate_model(model=SVC(random_state=42))



In [117]:
# Per-split timings and scores from the cross-validation run.
# NOTE(review): in the recorded output every split scores ~0.547 on both train
# and test; this looks like the SVC predicting a single class — confirm with a
# confusion matrix on the CV predictions.
grid.cv_results_



{'mean_fit_time': array([16.94346161]),
 'std_fit_time': array([1.56041995]),
 'mean_score_time': array([4.33206258]),
 'std_score_time': array([0.06780955]),
 'params': [{}],
 'split0_test_score': array([0.54743083]),
 'split1_test_score': array([0.54743083]),
 'split2_test_score': array([0.54743083]),
 'split3_test_score': array([0.54743083]),
 'split4_test_score': array([0.54851485]),
 'mean_test_score': array([0.54764729]),
 'std_test_score': array([0.00043335]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.54770143]),
 'split1_train_score': array([0.54770143]),
 'split2_train_score': array([0.54770143]),
 'split3_train_score': array([0.54770143]),
 'split4_train_score': array([0.54743083]),
 'mean_train_score': array([0.54764731]),
 'std_train_score': array([0.00010824])}