In [None]:
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
def identity_tokenizer(doc):
    """Return ``doc`` unchanged so the vectorizer counts its items as tokens.

    A named module-level function is used instead of a lambda so the
    vectorizer can be pickled (lambdas cannot be).
    """
    return doc


# Bag-of-words vectorizer whose tokenizer passes each document through as-is.
# Presumably each document is already a sequence of tokens -- TODO confirm
# against the preprocessing that produced the dataset.
# token_pattern=None: the pattern is unused once a custom tokenizer is given,
# and leaving the default in place makes scikit-learn emit a warning.
count_vectorizer = CountVectorizer(
    tokenizer=identity_tokenizer,
    token_pattern=None,
    lowercase=False,
)

In [None]:
# Load the pickled training frame, then pull the feature and label columns
# out as arrays.
data = pd.read_pickle(
    '/content/gdrive/MyDrive/Data/dataset/data_processed_lite.pkl'
)
data_cleaned, label = (data[column].values for column in ('FEATURE', 'LABEL'))

In [None]:
# Fit the vocabulary and build the document-term matrix.
# Reads straight from the raw 'FEATURE' column rather than from
# `data_cleaned`, so re-running this cell never feeds an already-vectorized
# matrix back into the vectorizer.
data_cleaned = count_vectorizer.fit_transform(data['FEATURE'].values)

In [None]:
# Fraction of the samples passed as `test_size` when splitting.
test_ratio = 0.1
# Seed passed as `random_state` to the split and to the classifier.
random_st = 40

In [None]:
# Split the vectorized features and labels into train and held-out parts,
# using the configured ratio and seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_cleaned,
    label,
    test_size=test_ratio,
    random_state=random_st,
)

In [None]:
# Hyper-parameters for the logistic-regression classifier.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version.
logreg_params = dict(
    C=30.0,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    n_jobs=-1,
    random_state=random_st,
)

# fit() returns the estimator itself, so construction and training chain.
clf = LogisticRegression(**logreg_params).fit(X_train, Y_train)

# Predictions on the held-out split.
y_predicted_counts = clf.predict(X_test)

In [None]:
def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Precision, recall and F1 are computed with ``average='weighted'``
    (and ``pos_label=None``); accuracy is the plain accuracy score.

    Args:
        y_test: True labels.
        y_predicted: Predicted labels, aligned with ``y_test``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    # Keyword arguments shared by the three averaged metrics.
    weighted = {'pos_label': None, 'average': 'weighted'}
    scores = (
        accuracy_score(y_test, y_predicted),
        precision_score(y_test, y_predicted, **weighted),
        recall_score(y_test, y_predicted, **weighted),
        f1_score(y_test, y_predicted, **weighted),
    )
    return scores

In [None]:
# Score the held-out predictions; keep the individual values as names too.
validation_scores = get_metrics(Y_test, y_predicted_counts)
accuracy, precision, recall, f1 = validation_scores
# The 4-tuple feeds the four %-placeholders in order.
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % tuple(validation_scores))

In [None]:
# Persist the trained classifier to disk.
filename = 'finalized_model.sav'
# The context manager closes (and flushes) the file even if dump() raises;
# the original left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the classifier from disk and score it on the held-out split.
# The context manager closes the file handle, which the original leaked.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# model files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, Y_test)

In [None]:
# Load the pickled test inputs and their identifiers (paths are relative to
# the working directory).
test_data, ID = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data_test_pickle', 'ID_test_pickle')
)

In [None]:
# Vectorize the test documents with the already-fitted vectorizer.
# NOTE(review): this rebinds `test_data` to the transformed output, so
# re-running this cell without re-running the load cell would call `.values`
# on that output instead of on the loaded object.
test_documents = test_data.values
test_data = count_vectorizer.transform(test_documents)

In [None]:
# Predict labels for the vectorized test set. This rebinds
# `y_predicted_counts`, which an earlier cell used for the held-out
# predictions.
y_predicted_counts = clf.predict(X=test_data)

In [None]:
# Build the submission table (identifier column first, prediction second)
# and write it without the index column.
submission_columns = ['PRODUCT_ID', 'BROWSE_NODE_ID']
submission = pd.DataFrame(
    {
        'PRODUCT_ID': ID,
        'BROWSE_NODE_ID': y_predicted_counts,
    }
)[submission_columns]
submission.to_csv("submission.csv", index=False)