In [8]:
import mlflow
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import fbeta_score

In [3]:
input_data_path ="data/manual_labeled_data.csv"
test_size = 0.2
experiment_id = 3421272196413697
random_state = 42
betas = [1, 1/2, 1/8]

In [4]:
mlflow.set_tracking_uri("databricks")

In [13]:
mlflow.sklearn.autolog(log_input_examples=True)

In [6]:
input_data = pd.read_csv(input_data_path).fillna("")

In [14]:
# Improvements
# - hold out set
# - more data
# - more metrics
# - start and end characters for first name and last name
# - play with tfidf parameters
# - add shap or feature probabilities


In [15]:
with mlflow.start_run(experiment_id=experiment_id) as run:
    print("Reading data from {}".format(input_data_path))
    y = input_data.is_fake
    X = input_data.drop('is_fake', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    
    mlflow.log_param("input_data_path", input_data_path)
    mlflow.log_param("test_size", test_size)
    
    vectorizer_params = {
        "encoding":"utf-8",
        "strip_accents":None,
        "lowercase":False,
        "analyzer":"char",
        "stop_words":None,
        "max_df":1.0,
        "min_df":0.1,
        "max_features":50,
        "use_idf":True
    }
    
    ct = ColumnTransformer([("first_name", TfidfVectorizer(**vectorizer_params, ngram_range=(1,2)), "first_name"),
                            ("last_name", TfidfVectorizer(**vectorizer_params, ngram_range=(1,2)), "last_name"),
                            ("email", TfidfVectorizer(**vectorizer_params, ngram_range=(1,3)), "email")
                           ],
                         remainder="passthrough")
    print("Fitting pipeline...")
    pipe = Pipeline([('prep', ct), ('clf', MultinomialNB())])
        
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    for b in betas:
        mlflow.log_metric("f_beta", fbeta_score(y_test, y_pred, beta=b))
    
    
    # mlflow.shap.log_explanation(pipe.predict, X_train)
    X_test.loc[:,"y_true"] = y_test
    X_test.loc[:,"y_pred"] = y_pred
    X_test.to_csv("test_data.csv", index=False)
    mlflow.log_artifact("test_data.csv", "test_data.csv")
    print("Finished.")
    

Reading data from data/manual_labeled_data.csv


                  transformers=[('first_name',
                                 TfidfVectorizer(analyzer='char',
                                                 lowercase=False,
               ...`
                  transformers=[('first_name',
                                 TfidfVectorizer(analyzer='char',
                                                 lowercase=False,
                         ...`
                ngram_range=(1, 2)), 'first_name'), ('last_name', TfidfVectorizer(analyzer='char', lowercase=False, max_features=50, min_df=0.1,
       ...`


Fitting pipeline...



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Finished.
