Reproduce model using bigrams

iterative · Sep 4, 2019 · 72e0f12
1 parent dd2cc99
commit 72e0f12
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 7 deletions.
diff --git a/featurize.dvc b/featurize.dvc
@@ -1,13 +1,13 @@
-md5: f89c792aacc96be22aa7349f61b32506
+md5: 0262b3d2369f126f3bfe99c8b9424791
 cmd: python src/featurization.py data/prepared data/features
 wdir: .
 deps:
-- md5: e6d8262e922894e85a959816f9a77ae7
+- md5: ef70c6c0fbf4107fdab468af66333f48
   path: src/featurization.py
 - md5: 6836f797f3924fb46fcfd6b9f6aa6416.dir
   path: data/prepared
 outs:
-- md5: 3338d2c21bdb521cda0ba4add89e1cb0.dir
+- md5: 42c7025fc0edeb174069280d17add2d4.dir
   path: data/features
   cache: true
   metric: false

diff --git a/src/featurization.py b/src/featurization.py
@@ -75,7 +75,7 @@ def save_matrix(df, matrix, output):
 train_words = np.array(df_train.text.str.lower().values.astype('U'))
 
 bag_of_words = CountVectorizer(stop_words='english',
-                               max_features=5000)
+                               max_features=6000, ngram_range=(1, 2))
 bag_of_words.fit(train_words)
 train_words_binary_matrix = bag_of_words.transform(train_words)
 tfidf = TfidfTransformer(smooth_idf=False)

diff --git a/train.dvc b/train.dvc
@@ -1,13 +1,13 @@
-md5: 8277b40847044d2427217886e915bf33
+md5: bbd4e982ce8bc18d4fe17f33478207e2
 cmd: python src/train.py data/features model.pkl
 wdir: .
 deps:
 - md5: d05e0201a3fb47c878defea65bd85e4d
   path: src/train.py
-- md5: 3338d2c21bdb521cda0ba4add89e1cb0.dir
+- md5: 42c7025fc0edeb174069280d17add2d4.dir
   path: data/features
 outs:
-- md5: 43630cce66a2432dcecddc9dd006d0a7
+- md5: 662eb7f64216d9c2c1088d0a5e2c6951
   path: model.pkl
   cache: true
   metric: false