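"""Build bag-of-words and TF-IDF feature matrices for the train and test splits.

Reads train.tsv and test.tsv from a data directory and writes pickled
(matrix, feature_names) tuples to a features directory.

Usage:
    python featurization.py data-dir-path features-dir-path
"""
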
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_df(data):
    """Read the input data file and return a data frame."""
    df = pd.read_csv(
        data,
        encoding="utf-8",
        header=None,
        delimiter="\t",
        names=["id", "label", "text"],
    )
    sys.stderr.write(f"The input data frame {data} size is {df.shape}\n")
    return df


def save_matrix(df, matrix, names, output):
    """
    Save the feature matrix to a pickle file.

    The id and label columns are prepended to the matrix, and the result is
    pickled together with the feature names.

    Args:
        df (pandas.DataFrame): Input data frame.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        names (array-like): Feature names.
        output (str): Output file name.
    """
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
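
# Illustrative sketch (not part of this stage): a downstream consumer could load the
# pickle back as a (matrix, feature_names) tuple; the first two columns of the matrix
# hold the ids and labels, the remaining columns the TF-IDF features.
#
#     with open("train.pkl", "rb") as fd:
#         matrix, feature_names = pickle.load(fd)
#     labels = matrix[:, 1].toarray().ravel()
#     features = matrix[:, 2:]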


def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Generate and save the train feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag-of-words vectorizer.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    train_words = np.array(df_train.text.str.lower().values)

    bag_of_words.fit(train_words)

    train_words_binary_matrix = bag_of_words.transform(train_words)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(train_words_binary_matrix)
    train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)

    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)


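# Note: the CountVectorizer and TfidfTransformer are fitted on the training text only;
# the test features below are produced with transform() alone, so train and test share
# the same vocabulary and IDF weights.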
def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Generate and save the test feature matrix.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Fitted bag-of-words vectorizer.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): Fitted TF-IDF transformer.
    """
    df_test = get_df(test_input)
    test_words = np.array(df_test.text.str.lower().values)

    test_words_binary_matrix = bag_of_words.transform(test_words)
    test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
    feature_names = bag_of_words.get_feature_names_out()

    save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)


def main():
    with open("params.yaml") as params_file:
        params = yaml.safe_load(params_file)["featurize"]

    np.set_printoptions(suppress=True)

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

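    # The "featurize" section of params.yaml is expected to provide these two keys;
    # an illustrative layout (the values here are examples only, not from this repo):
    #
    #   featurize:
    #     max_features: 100
    #     ngrams: 1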
    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

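    # CountVectorizer builds the vocabulary (English stop words removed, capped at
    # max_features, with n-grams from 1 up to `ngrams`); TfidfTransformer then rescales
    # the raw counts to TF-IDF weights (smooth_idf=False, i.e. unsmoothed IDF).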
    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )

    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()