In [1]:
import glob
import os

import mlflow
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# --- Data -------------------------------------------------------------------
# Glob patterns for the main CSVs and for the separate holdout CSVs.
input_data_path = "data/*.csv"
holdout_data_path = "data/holdout/*.csv"
# Columns read from every CSV; these are the model inputs.
input_columns = ["first_name", "last_name", "email"]

# --- Train / evaluate -------------------------------------------------------
# When True the holdout data is used as the training input and the separate
# holdout evaluation is skipped.
train_on_holdout = False
# Fraction of the training input reserved for the test split, and its seed.
test_size = 0.2
random_state = 42
# One F-beta metric is logged per value listed here.
betas = [1, 1 / 2, 1 / 8]

# --- Tracking ---------------------------------------------------------------
# MLflow experiment the run is created under.
experiment_id = 3421272196413697

In [3]:
# Send all MLflow runs, params, metrics and artifacts to the Databricks tracking
# server. NOTE(review): Databricks credentials are presumably configured outside
# this notebook (environment / CLI profile) - confirm before a fresh run.
mlflow.set_tracking_uri("databricks")

In [4]:
# Enable scikit-learn autologging so fitted estimators are recorded in the
# active run without explicit logging calls.
# log_input_examples=True additionally stores sample training rows with the
# logged model. NOTE(review): the inputs here are names and e-mail addresses -
# confirm it is acceptable to persist them on the tracking server.
mlflow.sklearn.autolog(log_input_examples=True)

In [5]:
def read_data(path, input_cols):
    """Load every CSV matching ``path`` into a single labelled DataFrame.

    Each file is reduced to ``input_cols`` (missing values replaced by empty
    strings, duplicate rows within the file dropped) and tagged with two
    extra columns derived from the file name only, never from its directory:

    * ``source``  - the file name up to its first dot;
    * ``is_fake`` - 0 if the file name contains "real", otherwise 1.

    Files are read in sorted path order so the row order (and therefore any
    seeded split performed on the result) does not depend on the order in
    which the filesystem lists them.

    Args:
        path: Glob pattern selecting the CSV files to read.
        input_cols: List of column names to keep from each file.

    Returns:
        pandas.DataFrame with ``input_cols`` followed by ``source`` and
        ``is_fake``, carrying a fresh 0..n-1 index.

    Raises:
        ValueError: If no file matches ``path``.
    """
    fpaths = sorted(glob.glob(path))
    if not fpaths:
        # pd.concat([]) would raise a ValueError anyway, but without naming
        # the pattern that matched nothing.
        raise ValueError("No CSV files match pattern: {!r}".format(path))

    dfs = []
    for fpath in fpaths:
        # basename handles both "/" and the platform separator, and keeps a
        # directory called e.g. "real_data" from mislabelling every file.
        fname = os.path.basename(fpath)
        temp_df = pd.read_csv(fpath).fillna("")[input_cols].drop_duplicates()
        dfs.append(
            temp_df.assign(
                source=fname.split(".")[0],
                is_fake=0 if "real" in fname else 1,
            )
        )
    # ignore_index: per-file indices overlap, which would otherwise leave
    # duplicate labels in the combined frame.
    return pd.concat(dfs, ignore_index=True)

In [6]:
# Main data set: every CSV matched by input_data_path, labelled by read_data
# from its file name.
input_df = read_data(input_data_path, input_columns)

In [8]:
# Sanity check: row count per source file and label. Each source should fall
# entirely under one label, and the table shows the class balance.
pd.crosstab(input_df["source"], input_df["is_fake"])

is_fake,0,1
source,Unnamed: 1_level_1,Unnamed: 2_level_1
innapropiate_spam_fake_users,0,250000
real_fake_users,500000,0
spam_fake_users,0,250000


In [9]:
# Holdout set, loaded with the same reader so it has the same columns and the
# same file-name-derived labels as input_df.
holdout_df = read_data(holdout_data_path, input_columns)

In [11]:
# Same per-source / per-label check for the holdout set.
pd.crosstab(holdout_df["source"], holdout_df["is_fake"])

is_fake,0,1
source,Unnamed: 1_level_1,Unnamed: 2_level_1
generated_real_like,270,0
manual_labeled_data_ch,0,270


In [12]:
def _log_classification_metrics(split, y_true, y_hat, beta_values):
    """Log accuracy, precision, recall and F-beta scores to the active MLflow run.

    Args:
        split: Name of the evaluated split; used in the metric names
            (``accuracy_<split>``, ``precision_<split>``, ``recall_<split>``,
            ``f_beta_<split>_<beta>``).
        y_true: True labels.
        y_hat: Predicted labels, in the same order as ``y_true``.
        beta_values: Iterable of beta values; one F-beta metric is logged per value.
    """
    mlflow.log_metric("accuracy_{}".format(split), accuracy_score(y_true, y_hat))
    mlflow.log_metric("precision_{}".format(split), precision_score(y_true, y_hat))
    mlflow.log_metric("recall_{}".format(split), recall_score(y_true, y_hat))
    for b in beta_values:
        mlflow.log_metric("f_beta_{}_{}".format(split, b), fbeta_score(y_true, y_hat, beta=b))


# Train a character n-gram TF-IDF + multinomial Naive Bayes pipeline, evaluate it
# on a stratified test split (and on the holdout set unless that set is the
# training input), and record params, metrics and prediction CSVs in one MLflow run.
with mlflow.start_run(experiment_id=experiment_id) as run:
    # Pick the training frame under a local name. Rebinding the global
    # `input_df` here would make a later run with train_on_holdout=False
    # silently train on the holdout data unless the load cell is re-run first.
    train_df = holdout_df if train_on_holdout else input_df

    mlflow.log_param("train_on_holdout", train_on_holdout)
    mlflow.log_param("input_data_path", input_data_path)
    mlflow.log_param("holdout_data_path", holdout_data_path)
    mlflow.log_param("test_size", test_size)
    # The split below is seeded; log the seed so the run can be reproduced.
    mlflow.log_param("random_state", random_state)

    print("Preparing traing and test set...")
    y = train_df.is_fake
    X = train_df.drop(['is_fake'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Keep the originating file of each row aside, then restrict the feature
    # frames to the declared input columns. Selecting (rather than dropping
    # known extras) keeps stray columns - e.g. an `is_fake_pred` left on
    # holdout_df by a previous run of this cell - out of the model.
    train_source = X_train.source
    X_train = X_train[input_columns]

    test_source = X_test.source
    X_test = X_test[input_columns]

    # Shared TF-IDF settings: case-sensitive character n-grams, keeping at most
    # 50 features per column among those present in at least 10% of the rows.
    vectorizer_params = {
        "encoding": "utf-8",
        "strip_accents": None,
        "lowercase": False,
        "analyzer": "char",
        "stop_words": None,
        "max_df": 1.0,
        "min_df": 0.1,
        "max_features": 50,
        "use_idf": True
    }

    # One vectorizer per text column; e-mail additionally gets 3-grams.
    ct = ColumnTransformer([("first_name", TfidfVectorizer(**vectorizer_params, ngram_range=(1, 2)), "first_name"),
                            ("last_name", TfidfVectorizer(**vectorizer_params, ngram_range=(1, 2)), "last_name"),
                            ("email", TfidfVectorizer(**vectorizer_params, ngram_range=(1, 3)), "email")
                            ],
                           remainder="passthrough")
    print("Fitting pipeline...")
    pipe = Pipeline([('prep', ct), ('clf', MultinomialNB())])

    pipe.fit(X_train, y_train)

    print("Evaluating pipeline...")
    y_pred = pipe.predict(X_test)
    _log_classification_metrics("test", y_test, y_pred, betas)

    if not train_on_holdout:
        holdout_y_pred = pipe.predict(holdout_df[input_columns])
        _log_classification_metrics("holdout", holdout_df.is_fake, holdout_y_pred, betas)

    # TODO: SHAP explanations, e.g. mlflow.shap.log_explanation(pipe.predict, X_train)

    # Attach labels, predictions and source to the test rows for inspection.
    # Values are attached positionally (.values) - they are already in X_test's
    # row order - so the result does not depend on index alignment.
    # NOTE(review): the CSV artifacts below contain the raw name/e-mail columns.
    X_test = X_test.assign(
        is_fake=y_test.values,
        is_fake_pred=y_pred,
        source=test_source.values,
    )
    X_test.to_csv("test_data.csv", index=False)
    mlflow.log_artifact("test_data.csv")

    if not train_on_holdout:
        # Stored on the shared holdout frame so the predictions stay available
        # after this cell; re-running simply overwrites the column.
        holdout_df["is_fake_pred"] = holdout_y_pred
        holdout_df.to_csv("holdout_data.csv", index=False)
        mlflow.log_artifact("holdout_data.csv")
    print("Finished.")
    

Preparing traing and test set...


                  transformers=[('first_name',
                                 TfidfVectorizer(analyzer='char',
                                                 lowercase=False,
               ...`
                  transformers=[('first_name',
                                 TfidfVectorizer(analyzer='char',
                                                 lowercase=False,
                         ...`
                ngram_range=(1, 2)), 'first_name'), ('last_name', TfidfVectorizer(analyzer='char', lowercase=False, max_features=50, min_df=0.1,
       ...`


Fitting pipeline...
Evaluating pipeline...
Finished.


In [13]:
# Frame used for demonstration: the annotated test split when the model was
# trained on the holdout data, otherwise the holdout set itself.
demo_df = X_test if train_on_holdout else holdout_df

In [16]:
# Improvements
# - log more metrics
# - start and end characters for first name and last name
# - play with tfidf parameters
# - add shap or feature probabilities
