In [80]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline



from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.externals import joblib


In [63]:
os.chdir(os.path.expanduser("~/Documents/bigforecast/FeatureEngineering"))

In [64]:
data_dir = os.path.expanduser("~/Documents/financial-news-dataset/ReutersNews106521")

def parse_article(file_name):
    with open(file_name, 'r') as f:
        try:
            news = f.readlines()
        except:
            print(file_name)
            raise NotImplementedError()

    news = news[4:]  # removes the dataset header
    news = " ".join(news).replace("\n", "")
    return news
    
#print(parse_article(data_dir + "/20090106/" + "us-alcoa-idUSTRE5056SB20090106"))

In [65]:

limit = 1000
def load_articles(limit):
    news_data = []
    counter = 0
    starting_dir = os.getcwd()
    for directory in os.listdir(data_dir):
        os.chdir(data_dir + "/" + directory)
        for news_article in os.listdir(data_dir + "/" + directory):
            counter += 1
            news_data.append(parse_article(news_article))
            if counter >= limit:
                break
        else:
           continue
        break #allows the inner break to break the outer loop
    os.chdir(starting_dir)
    return news_data


In [66]:
keywords = ["oil", "exxon", "WTI", "crude", "OPEC", "BP", "Rosneft", "drill", "barrel", "price", "venezuela", "Arabia", "sanctions", "rise", "gain", "surge", "posts", "Production", "gallon", "improves", "inventory", "barrels", "mobil", "gallons", "drilling", "fracking", "gas", "energy", "gasoline", "rig"]

keywords = [keyword.lower() for keyword in keywords]

def check_title(news_article, keywords, min_matches=2):
    """
    Checks to see if the article title has atleast as many
    keyword matches as min_matches to determine if it's
    relevant to the topic of oil
    """
    with open(news_article, 'r') as f:
        title = f.readline()
    title = title.lower()
    return len(list(set(title.split(" ")) & set(keywords))) >= min_matches
    

print(check_title(data_dir + "/20090106/" + "us-alcoa-idUSTRE5056SB20090106",
                  keywords))


False


In [67]:
def make_labels(keywords, limit=1000, min_matches=1):
    labels = []
    counter = 0
    starting_dir = os.getcwd()
    for directory in os.listdir(data_dir):
        os.chdir(data_dir + "/" + directory)
        for news_article in os.listdir(data_dir + "/" + directory):
            counter += 1
            labels.append(check_title(news_article, keywords, min_matches))
            if counter >= limit:
                break
        else:
           continue
        break #allows the inner break to break the outer loop
    os.chdir(starting_dir)
    return labels

so about 3.7% of the articles are relevant to us out of the first 100,000.  We'll play with this a little more, but let's see about fitting some classification models.

In [68]:
lim = 50000
news_data = load_articles(lim)
labels = make_labels(keywords, limit = lim)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(news_data,
                                                    labels,
                                                    test_size = 0.33,
                                                    random_state = 322)

In [70]:
cv = CountVectorizer()

model = LogisticRegression()

pipe = Pipeline([("vect", cv), ("model", model)])



In [71]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [72]:
preds = pipe.predict(X_test)
acc = np.mean(preds == y_test)
print("Percent accuracy:", str(acc))

confusion_matrix(y_test, preds)

array([[30331,   675],
       [ 1064,   930]])

Percent accuracy: 0.947303030303


In [73]:
print(classification_report(y_test, preds))

             precision    recall  f1-score   support

      False       0.97      0.98      0.97     31006
       True       0.58      0.47      0.52      1994

avg / total       0.94      0.95      0.94     33000



0: Not relevant
1: Relevant
Not great.  We have almost as many false negatives as we do true negatives.
Pretty bad percision.

In [74]:
model_dir = "models/"

In [77]:
model_dir = "models/"
model_name = "article_relevance_classification.pkl"
joblib.dump(pipe, model_dir + model_name)

['models/article_relevance_classification.pkl']

In [78]:
loaded_model = joblib.load(model_dir + model_name)

In [79]:
new_preds = loaded_model.predict(X_test)
print(classification_report(y_test, new_preds))

             precision    recall  f1-score   support

      False       0.97      0.98      0.97     31006
       True       0.58      0.47      0.52      1994

avg / total       0.94      0.95      0.94     33000



In [82]:
model = GradientBoostingClassifier()
pipe = Pipeline([("vect", cv), ("model", model)])
pipe.fit(X_train, y_train)
new_preds = loaded_model.predict(X_test)
print(classification_report(y_test, new_preds))

             precision    recall  f1-score   support

      False       0.97      0.98      0.97     31006
       True       0.58      0.47      0.52      1994

avg / total       0.94      0.95      0.94     33000

