In [5]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes, so edits to the imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# Re-loads the autoreload extension.
# NOTE(review): the previous cell already loads it, so this cell looks
# redundant — confirm before removing.
%reload_ext autoreload

In [1]:
from os import pipe
from detecting_fake_news.data import get_local_data, get_cloud_data
from detecting_fake_news.preprocessing import TextPreprocessor
from detecting_fake_news.params import BUCKET_NAME, BUCKET_TRAIN_DATA_PATH, LOCAL_TRAIN_DATA_PATH
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import joblib
from termcolor import colored
from detecting_fake_news.gcp import storage_upload

In [2]:
### TODO:

# add MLflow functionality?

In [3]:
class Trainer(object):
    '''
    The Trainer class trains, evaluates, and saves an NLP model.
    The main method is Trainer.run, which takes a dataframe as an argument.
    When instantiating an instance of the Trainer class, you provide two
    arguments, X_col and y_col, which correspond to the column names of your
    feature and label respectively.

    An optional third argument, random_state, is forwarded to
    train_test_split so that the train/test split (and therefore the reported
    accuracy) can be reproduced. The default, None, keeps the split random.

    Attributes:
        X_col: name of the feature (text) column.
        y_col: name of the label column.
        random_state: seed passed to train_test_split, or None.
        pipe: the sklearn Pipeline built by set_pipeline (None until then).
        model: the fitted pipeline produced by run (None until trained).
    '''
    def __init__(self, X_col, y_col, random_state=None):
        self.X_col = X_col
        self.y_col = y_col
        self.random_state = random_state
        self.pipe = None
        self.model = None

    def set_pipeline(self):
        '''resets self.pipe and self.model to None then sets self.pipe to a
           fresh, unfitted pipeline: TF-IDF over bigrams only
           (ngram_range=(2, 2)) followed by a multinomial naive Bayes
           classifier'''
        # Clear first so a failure while building leaves no stale pipeline.
        self.pipe = None
        self.model = None
        self.pipe = Pipeline([
            ('vectorizer', TfidfVectorizer(ngram_range=(2, 2))),
            ('nbmodel', MultinomialNB())])

    def run(self, df):
        '''accepts a dataframe; preprocesses and splits data into train/test;
           fits a pipeline to X_train, y_train; evaluates on X_test, y_test;
           prints an accuracy score

           Args:
               df: dataframe containing the self.X_col and self.y_col columns.
                   It is not modified; rows whose feature is missing are
                   dropped from a copy.

           Returns:
               the accuracy score computed on the held-out 25% test split.
        '''
        print("dropping rows of empty text")
        df = df.dropna(subset=[self.X_col])
        # Double brackets: the preprocessor is handed a one-column dataframe.
        X = df[[self.X_col]]
        y = df[self.y_col]
        print("preprocessing data with following parameters:")
        preproc = TextPreprocessor()
        # Report only the options that are switched on. Identity comparison
        # is used so that only real boolean flags are listed.
        for option, enabled in vars(preproc).items():
            if enabled is True:
                print(f"{option}, ", end='')
        print('')
        X_clean = preproc.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X_clean, y, test_size=0.25, random_state=self.random_state)
        print("setting pipeline")
        self.set_pipeline()
        print("vectorizing data and fitting model")
        # Pipeline.fit returns the fitted pipeline itself.
        self.model = self.pipe.fit(X_train, y_train)
        print("evaluating on test data")
        return self.evaluate(X_test, y_test)

    def save_model_locally(self, model, path='model.joblib'):
        '''save the model into a .joblib format

           Args:
               model: the object to serialise (typically Trainer.model).
               path: destination file; defaults to 'model.joblib' in the
                     current working directory.
        '''
        joblib.dump(model, path)
        print(colored(f"{path} saved locally", "green"))

    def evaluate(self, X_test, y_test):
        '''predicts y_pred based on X_test and scores accuracy on y_test

           Returns:
               the accuracy score, or None (after printing a hint) when no
               model has been trained yet.
        '''
        # Compare against None explicitly: a Pipeline defines __len__, so a
        # plain truthiness test would depend on its number of steps.
        if self.model is None:
            print("please train a model first using Trainer.run")
            return None
        y_pred = self.model.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(colored(f"model accuracy: {score}", "green"))
        return score

In [5]:
# End-to-end run: fetch 3000 rows of cloud data, train and evaluate a Trainer
# using the 'text' column as the feature and 'label' as the target, then save
# and upload the fitted model.
# Trainer.run calls train_test_split without a seed, so the printed accuracy
# varies from run to run.
df = get_cloud_data(nrows=3000)
trainer = Trainer('text', 'label')
trainer.run(df)
# Writes 'model.joblib' relative to the current working directory.
trainer.save_model_locally(trainer.model)
# Per the logged output, the first argument is the destination path inside the
# bucket and the second is the local file to upload.
storage_upload('models/MultinomialNB/model.joblib', 'model.joblib')

getting 3000 rows of cloud data
dropping rows of empty text
preprocessing data with following parameters:
new_line, punct, lower, accent, numbers, lemm, stop_words, 
setting pipeline
vectorizing data and fitting model
evaluating on test data
[32mmodel accuracy: 0.8664886515353805[0m
[32mmodel.joblib saved locally[0m
[32m=> model.joblib uploaded to bucket wagon-data-745-fake-news-data inside models/MultinomialNB/model.joblib[0m


In [11]:
# Scratch check: build a TextPreprocessor with its default arguments so its
# option attributes can be inspected in the cells below.
testpre = TextPreprocessor()

In [22]:
# Print the names of the preprocessing options that are switched on, as a
# comma-separated list on one line.
# `is True` replaces `== True`: booleans are compared by identity (PEP 8),
# which also avoids listing non-boolean attributes that merely equal 1.
for k,v in vars(testpre).items():
    if v is True:
        print(f"{k}, ", end='')

new_line, punct, lower, accent, numbers, lemm, stop_words, 

In [20]:
# Dump every instance attribute of the preprocessor, including the options
# that are switched off.
print(vars(testpre))

{'new_line': True, 'punct': True, 'lower': True, 'accent': True, 'numbers': True, 'stemm': False, 'lemm': True, 'stop_words': True}
