In [2]:
# !pip3 install scikit-learn tqdm
# !wget https://f000.backblazeb2.com/file/malay-dataset/sentiment/semisupervised/semisupervised-bert-xlnet.csv

In [3]:
import pandas as pd

# Load the pseudo-labelled sentiment corpus from the working directory
# (fetched by the commented-out wget in the first cell) and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='semisupervised-bert-xlnet.csv')
df.head(n=5)

Unnamed: 0,text,label,prob
0,@LionelModric10 Sabah sabah yorma donkey,Neutral,0.999414
1,Continue to reach out . . SEKUT in aja laah Na...,Neutral,0.994295
2,ada suprise untuk #danishnaufal_14 tq pada pem...,Positive,0.999538
3,aku kerja keras gila jimat jimat nak beli apa ...,Positive,0.999405
4,@farhanalv ajak makan ah ke penang bistro wkwkw,Neutral,0.999462


In [4]:
import re

def cleaning(string):
    """Normalise a raw text into lowercase, space-separated alphanumeric tokens.

    Steps, applied in order:

    1. Drop every whitespace-separated token that contains ``@``.
    2. Rewrite the literal ``(dot)`` as ``.``.
    3. If the first ``<a ...>`` tag carries an ``href``, remove its attribute
       text from the string.
    4. Replace ``scheme://host/path`` style URLs with a space.
    5. Replace every run of characters outside ``[A-Za-z0-9 ]`` with a space.
    6. Collapse repeated spaces, strip the ends and lowercase.

    Parameters
    ----------
    string : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text; empty when nothing alphanumeric survives.
    """
    # Tokens containing '@' are discarded entirely.
    string = ' '.join(w for w in string.split() if '@' not in w)
    string = string.replace('(dot)', '.')

    # Look at the first anchor tag only. Its attribute text is data, not a
    # pattern, so it is removed literally: feeding it to re.sub would raise
    # re.error (or match the wrong text) whenever it contains regex
    # metacharacters such as '(' or '['.
    anchor = re.search(r'<a (.*?)>', string)
    if anchor is not None and 'href' in anchor.group(1):
        string = string.replace(anchor.group(1), '')

    # Strip URLs before punctuation removal so their pieces do not survive
    # as stray tokens.
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = re.sub('[^A-Za-z0-9 ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string.lower()

In [5]:
from tqdm import tqdm

# Clean every row and keep only the rows whose cleaned text is non-empty.
# `texts` and `labels` stay aligned: a label is appended only together with
# its surviving text.
texts, labels = [], []
# Walk the two columns in lockstep instead of doing two positional `.iloc`
# lookups per row; `total` lets tqdm still show a percentage because zip()
# has no length.
for raw_text, label in tqdm(zip(df['text'], df['label']), total=len(df)):
    t = cleaning(raw_text)
    if t:
        texts.append(t)
        labels.append(label)

100%|████████████████████████████████| 185787/185787 [00:10<00:00, 17403.03it/s]


In [6]:
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# TF-IDF features feeding a Complement Naive Bayes classifier, fitted on the
# cleaned texts and their labels.
# NOTE(review): stop_words="english" only filters English stop words, while
# the sample rows look mostly Malay - confirm this is intended.
model = Pipeline(
    steps=[
        ("tfidf_vectorizer", TfidfVectorizer(stop_words="english")),
        ("nb", ComplementNB()),
    ]
).fit(texts, labels)
model

Pipeline(steps=[('tfidf_vectorizer', TfidfVectorizer(stop_words='english')),
                ('nb', ComplementNB())])

In [8]:
# Spot check on the first ten cleaned texts (these are training rows, so this
# is not an evaluation): one row of class probabilities per text.
model.predict_proba(texts[:10])

array([[0.18927582, 0.61000556, 0.20071862],
       [0.04540901, 0.82776806, 0.12682293],
       [0.02807277, 0.02461707, 0.94731016],
       [0.35064422, 0.14682629, 0.50252949],
       [0.15815617, 0.70677826, 0.13506558],
       [0.25874791, 0.09318693, 0.64806516],
       [0.00867239, 0.96742275, 0.02390486],
       [0.6439133 , 0.19348827, 0.16259843],
       [0.24624322, 0.25934145, 0.49441533],
       [0.09975433, 0.77750769, 0.12273798]])

In [9]:
import pickle

# Persist the fitted pipeline next to the notebook; `filename` is reused by
# the HDFS upload/download cells below.
filename = 'tfidf-nb-malay-sentiment.pkl'
with open(filename, mode='wb') as model_file:
    pickle.dump(model, model_file)

In [16]:
import pydoop.hdfs
import os

# Handle to the HDFS instance reachable at host `hdfs`, port 9000.
hdfs = pydoop.hdfs.hdfs(host='hdfs', port=9000)

2022-03-03 13:55:23,127 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [17]:
# Copy the local pickle to /user/<filename> on HDFS. The remote handle is
# opened first and closed last, and the whole local file is read into memory
# before being written out.
with hdfs.open_file(os.path.join('/user', filename), 'wb') as remote_file, \
        open(filename, 'rb') as local_file:
    remote_file.write(local_file.read())

In [18]:
# Read the pickle back from HDFS and rebuild the pipeline in memory.
# NOTE: unpickling can execute arbitrary code, so only load objects from a
# location you trust.
with hdfs.open_file(os.path.join('/user', filename), 'rb') as remote_file:
    model_from_hdfs = pickle.loads(remote_file.read())

In [23]:
# Smoke test: the pipeline restored from HDFS can classify a raw string.
# predict() takes a list of texts; take the single result and show it as str.
str(model_from_hdfs.predict(['helo'])[0])

'Neutral'