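"""Build bag-of-words and TF-IDF feature matrices for the train and test splits.

Reads train.tsv and test.tsv from a data directory and writes pickled
(matrix, feature_names) tuples to a features directory.

Usage:
    python featurization.py data-dir-path features-dir-path
"""
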
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_df(data):
    """Read the input data file and return a data frame."""
    df = pd.read_csv(
        data,
        encoding="utf-8",
        header=None,
        delimiter="\t",
        names=["id", "label", "text"],
    )
    sys.stderr.write(f"The input data frame {data} size is {df.shape}\n")
    return df


def save_matrix(df, matrix, names, output):
    """
    Save the feature matrix to a pickle file.

    The id and label columns are prepended to the matrix, and the result is
    pickled together with the feature names.

    Args:
        df (pandas.DataFrame): Input data frame.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        names (array-like): Feature names.
        output (str): Output file name.
    """
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
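
# Illustrative sketch (not part of this stage): a downstream consumer could load the
# pickle back as a (matrix, feature_names) tuple; the first two columns of the matrix
# hold the ids and labels, the remaining columns the TF-IDF features.
#
#     with open("train.pkl", "rb") as fd:
#         matrix, feature_names = pickle.load(fd)
#     labels = matrix[:, 1].toarray().ravel()
#     features = matrix[:, 2:]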


def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Generate and save the train feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag-of-words vectorizer.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    train_words = np.array(df_train.text.str.lower().values)

    bag_of_words.fit(train_words)

    train_words_binary_matrix = bag_of_words.transform(train_words)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(train_words_binary_matrix)
    train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)

    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)


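# Note: the CountVectorizer and TfidfTransformer are fitted on the training text only;
# the test features below are produced with transform() alone, so train and test share
# the same vocabulary and IDF weights.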
def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Generate and save the test feature matrix.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Fitted bag-of-words vectorizer.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): Fitted TF-IDF transformer.
    """
    df_test = get_df(test_input)
    test_words = np.array(df_test.text.str.lower().values)

    test_words_binary_matrix = bag_of_words.transform(test_words)
    test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
    feature_names = bag_of_words.get_feature_names_out()

    save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)


def main():
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["featurize"]

    np.set_printoptions(suppress=True)

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

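    # The "featurize" section of params.yaml is expected to provide these two keys;
    # an illustrative layout (the values here are examples only, not from this repo):
    #
    #   featurize:
    #     max_features: 100
    #     ngrams: 1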
    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

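    # CountVectorizer builds the vocabulary (English stop words removed, capped at
    # max_features, with n-grams from 1 up to `ngrams`); TfidfTransformer then rescales
    # the raw counts to TF-IDF weights (smooth_idf=False, i.e. unsmoothed IDF).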
    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )

    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()