In [1]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import spacy as sp
import en_core_web_sm
import string

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Read the relevant and irrelevant news corpora into two separate DataFrames.
rdata_json, irdata_json = (
    pd.read_json(json_path)
    for json_path in ('relevant_news_10K.json', 'irrelevant_news_10K.json')
)

In [3]:
# Attach the binary target: 1 marks relevant articles, 0 marks irrelevant ones.
rdata_json['relevance'] = 1
irdata_json['relevance'] = 0

In [4]:
# Stack both labelled frames into one dataset with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and yields the same result here.
data = pd.concat([rdata_json, irdata_json], ignore_index=True)
data.head()

Unnamed: 0,content,headline,source,summary,uid,relevance
0,The lowly milkshake has turned into an unlikel...,Milkshakes become weapon of choice in UK Europ...,Agence France Presse,Former UK Independence Party leader Nigel Fara...,a437ff48-104a-54bb-bff7-c7a736158524,1
1,ANZ has moved to the front of the race that’s ...,ANZ's first assault in the looming job armageddon,News Ltd.,ANZ has moved to the front of the race that’ s...,366c92af-8143-5ffa-8702-4f26bd22c8b6,1
2,Jul. 10--Carnival Cruise Line stateroom attend...,Carnival Cruise Line to collect your used soap...,Tribune Content Agency,Jul. 10-- Carnival Cruise Line stateroom atten...,863096d4-48f0-5a7c-bee6-384a76d575ee,1
3,CHENNAI: R Rohit (5/43) and P Saravanan (5/46)...,Standard CC bags fourth title in a row [New In...,SyndiGate Media Inc.,CHENNAI: R Rohit and P Saravanan took five wic...,3e4d6490-4224-595e-be26-4cb249209b8f,1
4,Donald Trump’s nominee to lead the US Fish and...,Revealed: Trump's Wildlife Service pick has ti...,Guardian,New revelations show she also has ties to the ...,9f3e248d-b040-5058-bdc9-61c4de59f02a,1


In [5]:
# Punctuation characters that the tokenizer filters out (kept as a plain string)
punctuations = string.punctuation

# Load the small English model.
# NOTE(review): `nlp` is not referenced by any other visible cell - confirm
# whether it is still needed before removing it.
nlp = en_core_web_sm.load()

# spaCy's English stop-word collection (same object as the one imported above)
stop_words = STOP_WORDS

# English language object used by spacy_tokenizer to split text into tokens.
# NOTE(review): English() is presumably a blank pipeline (tokenizer only), not
# a tagger/parser/NER model - confirm against the installed spaCy version.
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Split ``sentence`` into normalised tokens for the vectorizers.

    Each token produced by the module-level ``parser`` is replaced by its
    lowercased, whitespace-stripped lemma. When the lemma is the ``-PRON-``
    placeholder, or when no lemma is available at all, the lowercased surface
    form of the token is used instead.

    Args:
        sentence: Text to tokenize; passed unchanged to ``parser``.

    Returns:
        list[str]: Tokens that are non-empty, not in ``stop_words`` and not
        matched by ``punctuations``.
    """
    normalised = []
    for token in parser(sentence):
        lemma = token.lemma_
        if not lemma or lemma == "-PRON-":
            # An empty lemma would turn every token into "" and silently drop
            # the whole document. NOTE(review): this presumably happens with a
            # blank pipeline on spaCy v3, which has no lemmatizer - confirm.
            normalised.append(token.lower_.strip())
        else:
            normalised.append(lemma.lower().strip())

    # Drop empty strings explicitly rather than relying on `"" in punctuations`
    # being true for a str; then remove stop words and punctuation.
    return [
        word
        for word in normalised
        if word and word not in stop_words and word not in punctuations
    ]

In [6]:
# Custom transformer that applies clean_text to every document
class predictors(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that normalises each raw document.

    Inheriting from ``BaseEstimator`` provides ``get_params`` and
    ``set_params``, so the step can be cloned and configured like any other
    scikit-learn estimator. The class takes no constructor arguments, so
    ``get_params()`` still returns an empty dict.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with ``clean_text`` applied to every item of ``X``."""
        return [clean_text(text) for text in X]

def clean_text(text):
    """Trim leading/trailing whitespace from ``text`` and lowercase it."""
    trimmed = text.strip()
    return trimmed.lower()

In [7]:
# TF-IDF vectorizer that tokenizes documents with spacy_tokenizer
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
)

In [8]:
# Count-based (bag-of-words) vectorizer over unigrams, using the same tokenizer
bow_vector = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
)

In [9]:
X = data['content']  # the features we want to analyze
ylabels = data['relevance']  # the labels, or answers, we want to test against

# X_train and y_train are the entire dataset (for now), in shuffled order.
# train_test_split(test_size=0) is rejected by current scikit-learn releases,
# so the "everything is training data" split is built explicitly instead.
# A fixed seed keeps the row order reproducible across kernel restarts.
RANDOM_STATE = 42
order = np.random.RandomState(RANDOM_STATE).permutation(len(X))
X_train, y_train = X.iloc[order], ylabels.iloc[order]

# Empty hold-out sets keep the four usual names defined for later cells.
X_test, y_test = X.iloc[:0], ylabels.iloc[:0]

In [10]:
# Logistic-regression model (a placeholder classifier for now)
classifier = LogisticRegression()

# Assemble the pipeline: text cleaning -> TF-IDF features -> classifier
pipe = Pipeline(steps=[
    ('cleaner', predictors()),
    ('vectorizer', tfidf_vector),
    ('classifier', classifier),
])

# Fit every step on the training data
pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x11d640eb8>), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngr...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [None]:
# Predicting with a test dataset
# With an empty test set there is nothing to predict or score, so the cell
# reports that and moves on instead of raising and breaking "Run All".
if len(X_test) == 0:
    print("Skipping evaluation: the test set is empty.")
else:
    predicted = pipe.predict(X_test)

    # Model Accuracy
    print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
    print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted))
    print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted))

In [17]:
# Warning: the cleaned copy of the corpus takes up a lot of space. If nothing
# needs to be saved, the vectors can be used directly within the notebook.
cleaned_X_train = (
    pipe.named_steps["cleaner"]
    .transform(X_train)
)

In [32]:
# Tokenize every cleaned document, keeping the same order as cleaned_X_train
cleaned_X_train_tokens = [spacy_tokenizer(document) for document in cleaned_X_train]

In [None]:
# Persist the intermediate artefacts next to the notebook (np.save adds ".npy").
# The original cell referenced `cleaned_X_test`, which is never defined; the
# cleaned training documents are what the file names describe.
np.save('cleaned_X_train', cleaned_X_train)

# Token lists differ in length, so they are stored as an object array; recent
# NumPy refuses to build a regular array from ragged input. Load these files
# with np.load(..., allow_pickle=True).
np.save('cleaned_X_train_tokens', np.array(cleaned_X_train_tokens, dtype=object))

# Vectorize the cleaned text, i.e. the same input the fitted pipeline passed
# to its vectorizer step.
# NOTE(review): the vectorizer outputs are presumably SciPy sparse matrices,
# which np.save pickles as object arrays - consider scipy.sparse.save_npz if
# the on-disk format is allowed to change.
np.save('tfidf_vector', pipe.named_steps["vectorizer"].transform(cleaned_X_train))
np.save('bow_vector', bow_vector.fit_transform(cleaned_X_train))