In [1]:
import pandas as pd

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
import string

In [3]:
%%time

# Build the training frame in one pass: read the parquet file, discard rows
# whose category_id is -1 or 121, then keep a single row per distinct
# item_name (drop_duplicates keeps the first occurrence by default).
train = (
    pd.read_parquet('../data/data_fusion_train.parquet')
    .loc[lambda frame: ~frame.category_id.isin({-1, 121})]
    .drop_duplicates('item_name')
)
print(train.shape)

(48225, 9)
CPU times: user 23.9 s, sys: 3.49 s, total: 27.4 s
Wall time: 19.9 s


In [4]:
def preprocess(text):
    """Lower-case ``text`` and turn each ASCII punctuation mark into its own token.

    Every character found in ``string.punctuation`` is padded with a space on
    both sides, after which all whitespace runs are collapsed to single spaces
    and leading/trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string, e.g. ``"Hello, World!"`` -> ``"hello , world !"``.
    """
    # Map each punctuation character to " <char> " so it is split off from
    # neighbouring word characters.
    spaced_punct = str.maketrans({mark: f" {mark} " for mark in string.punctuation})
    padded = text.lower().translate(spaced_punct)
    # split() with no argument discards empty pieces and surrounding
    # whitespace, so re-joining yields exactly one space between tokens.
    return " ".join(padded.split())

In [5]:
%%time

# Overwrite the item_name column with its normalised form (see preprocess).
# NOTE(review): duplicates were dropped on the raw names before this step, so
# names differing only in letter case may now be identical - confirm whether a
# second de-duplication is wanted.
train['item_name'] = train['item_name'].apply(preprocess)

CPU times: user 419 ms, sys: 109 ms, total: 527 ms
Wall time: 474 ms


In [6]:
# Character n-gram counts (3 to 5 characters, taken within word boundaries via
# the "char_wb" analyzer) fitted on the normalised item names.
# NOTE(review): the variable is called `tfidf` but holds a CountVectorizer.
tfidf = CountVectorizer(analyzer="char_wb", ngram_range=(3, 5))
X_train = tfidf.fit_transform(train["item_name"])

In [7]:
%%time

# Estimate out-of-sample quality with 3-fold cross-validation, scored by
# weighted F1 over the category labels.
#
# The folds are independent fits, so they are dispatched in parallel through
# cross_val_score's own n_jobs (one worker per fold) instead of running one
# after another; the returned scores are unchanged.
# NOTE(review): the estimator's n_jobs=4 is kept as-is so the object fitted
# and pickled later is unchanged. Per the scikit-learn docs it applies when
# parallelising over classes in one-vs-rest mode, so it presumably does not
# speed up a multinomial lbfgs fit - confirm against the installed version.
# Each worker holds its own fit, so peak memory grows with the worker count.
clf = LogisticRegression(n_jobs=4)
cv_scores = cross_val_score(
    clf,
    X_train,
    train.category_id,
    cv=3,
    scoring='f1_weighted',
    n_jobs=3,
)
print(cv_scores)

[0.83075565 0.8140802  0.78090472]
CPU times: user 1.26 s, sys: 1.03 s, total: 2.29 s
Wall time: 15min


In [8]:
%%time

# Refit the classifier on the complete training matrix; left as the cell's
# last expression so the fitted estimator's repr is displayed.
clf.fit(X_train, train["category_id"])

CPU times: user 284 ms, sys: 230 ms, total: 514 ms
Wall time: 5min 50s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=4, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
import pickle

# Persist the fitted vectorizer and classifier to the working directory.
# Context managers close (and therefore flush) each file deterministically,
# rather than relying on garbage collection of an anonymous file object
# before the files are read by anything else.
with open('count_vec_baseline', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('clf_baseline', 'wb') as classifier_file:
    pickle.dump(clf, classifier_file)

In [10]:
# Shell out to `zip` to add the two pickled artefacts and script.py to sumb.zip.
# NOTE(review): script.py is not created in any visible cell - presumably it
# sits next to this notebook; confirm it is the intended version.
!zip sumb.zip count_vec_baseline clf_baseline script.py

updating: count_vec_baseline (deflated 65%)
updating: clf_baseline (deflated 29%)
updating: script.py (deflated 45%)
