In [None]:
from elasticsearch import Elasticsearch
import os
from unidecode import unidecode
import string
import re
import torch
import numpy as np

# Elasticsearch client for the node at http://localhost:9200; reused by the
# index-management and indexing cells later in the notebook.
es = Elasticsearch("http://localhost:9200")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Create (or reuse) the Spark session for this notebook, with 2 GB configured
# for both the driver and the executors.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence classifier are loaded from the same
# Hugging Face checkpoint so their vocabularies match.
tokenizer, model = (
    AutoTokenizer.from_pretrained('vietdata/vietnamese-content-cls'),
    AutoModelForSequenceClassification.from_pretrained('vietdata/vietnamese-content-cls'),
)

In [None]:
# Category names, looked up by the position of the classifier's output score.
# Kept at module level so the tuple is built once instead of on every call.
_CONTENT_CLASSES = (
    'Agriculture, food, and drink',
    'Albums',
    'Architecture',
    'Art',
    'Biology and medicine',
    'Chemistry and materials science',
    'Classical compositions',
    'Computing and engineering',
    'Earth science',
    'Film',
    'Geography',
    'Language and literature',
    'Mathematics and mathematicians',
    'Media and drama',
    'Other music articles',
    'Philosophy',
    'Physics and astronomy',
    'Places',
    'Religion',
    'Royalty, nobility, and heraldry',
    'Songs',
    'Television',
    'Transport',
    'World history',
    'Armies and military units',
    'Baseball',
    'Basketball',
    'Battles, exercises, and conflicts',
    'Culture, sociology, and psychology',
    'Economics and business',
    'Education',
    'Football',
    'Hockey',
    'Law',
    'Magazines and print journalism',
    'Military aircraft',
    'Military decorations and memorials',
    'Military people',
    'Motorsport',
    'Multi-sport event',
    'Other sports',
    'Politics and government',
    'Pro wrestling',
    'Recreation',
    'Video games',
    'Warships and naval units',
    'Weapons, equipment, and buildings',
)


def predict_class(sentence, model, tokenizer):
    """Return the two most likely content categories for ``sentence``.

    Args:
        sentence: Text to classify.
        model: Callable taking a ``(1, n_tokens)`` tensor of token ids and
            returning an object with a ``logits`` attribute.
        tokenizer: Object whose ``encode`` method turns ``sentence`` into a
            list of token ids.

    Returns:
        A ``(runner_up, best)`` tuple of category names. Note the order: the
        second most probable label comes first and the most probable label
        comes last.
    """
    # Ask the tokenizer to truncate so that very long texts are cut to its
    # configured maximum length instead of being fed to the model in full.
    token_ids = tokenizer.encode(sentence, truncation=True)
    input_ids = torch.tensor([token_ids])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_ids).logits
    probs = logits.softmax(dim=-1).tolist()[0]

    # argsort is ascending, so the last two positions are runner-up and best.
    runner_up, best = np.argsort(probs)[-2:]
    return _CONTENT_CLASSES[runner_up], _CONTENT_CLASSES[best]

In [None]:
# Quick check on a sample Vietnamese sentence. The displayed pair is ordered
# (second most likely label, most likely label).
predict_class("Thủ tướng đi ngoại giao", model, tokenizer)

In [None]:
def preprocess(txt):
    """Classify ``txt`` using the notebook-level ``model`` and ``tokenizer``.

    This is a placeholder hook: no text cleaning is applied yet, so the input
    is handed to ``predict_class`` unchanged and its result is returned as-is.
    """
    labels = predict_class(txt, model, tokenizer)
    return labels

def normalize_index(txt: str)->str:
    """Turn a label into a lowercase, underscore-separated ASCII name.

    Steps, in order: transliterate to ASCII with ``unidecode``, drop every
    character in ``string.punctuation``, trim surrounding whitespace, replace
    each space with ``_`` and lowercase the result.

    Transliteration runs *before* punctuation removal so that non-ASCII
    punctuation which ``unidecode`` maps to ASCII punctuation (for example
    typographic quotes) is removed too instead of leaking into the name.
    For pure-ASCII input the result is the same as stripping punctuation first.

    Args:
        txt: Human-readable label, e.g. ``'Agriculture, food, and drink'``.

    Returns:
        The normalized name, e.g. ``'agriculture_food_and_drink'``.
    """
    ascii_text = unidecode(txt)
    without_punct = ascii_text.translate(str.maketrans('', '', string.punctuation))
    return without_punct.strip().replace(' ', '_').lower()

In [None]:
# Directory whose entries are read as JSON input files by the loading cell.
data_dir = './data/'

In [None]:
# Collect every entry of data_dir (sorted for a reproducible order) and read
# them in a single call. Passing the whole list lets Spark infer one schema
# for all files, whereas chaining DataFrame.union() matches columns purely by
# position and breaks or misaligns when files do not share the same columns.
json_paths = sorted(
    os.path.join(data_dir, file_name) for file_name in os.listdir(data_dir)
)
if not json_paths:
    # Fail with a clear message instead of an AttributeError on `None.rdd`.
    raise FileNotFoundError(f"No input files found in {data_dir!r}")

df = spark.read.option("encoding", "utf-8").json(json_paths)

# Row-level view of the data, consumed by the indexing loop.
rdd = df.rdd

# processed_rdd = rdd.map(preprocess)

# processed_rdd

In [None]:
# Preview a handful of rows only; collecting the full RDD here would pull the
# whole dataset into driver memory and flood the notebook output.
rdd.take(5)

In [None]:
# Show the indices currently present in the cluster. Querying directly avoids
# relying on a variable that is only assigned in a later cell, which would
# raise NameError on a fresh kernel.
es.cat.indices(format="json")

In [23]:
# Start from a clean slate: list the indices and delete each one.
index_list = es.cat.indices(format="json")

# Delete each index in the list, leaving dot-prefixed (system/hidden) indices
# alone so that cluster-internal state is not wiped along with the data.
for index in index_list:
    index_name = index['index']
    if index_name.startswith('.'):
        continue
    es.indices.delete(index=index_name)

print("All non-system indices have been deleted.")

All indices have been deleted.


In [24]:
# Classify every post and store it in the indices named after its two most
# likely categories. Processing is best-effort: a record that fails is skipped,
# but the failure is reported instead of being silently discarded.
skipped = 0
for x in rdd.collect():
    try:
        # predict_class returns (runner-up, best), hence cate1 before cate0.
        cate1, cate0 = predict_class(x['text'], model, tokenizer)
        data = {
            'page': x['page'],
            'post_id': x['post_id'],
            'text': x['text'],
            'timestamp': x['timestamp'],
            'likes': int(x['likes']),
            'comments': int(x['comments']),
        }
        print(cate1, cate0)
        es.index(index=normalize_index(cate1), document=data)
        es.index(index=normalize_index(cate0), document=data)
    except Exception as exc:
        # `Exception` (not a bare except) keeps KeyboardInterrupt working.
        skipped += 1
        print(f"Skipped record: {exc!r}")

if skipped:
    print(f"Skipped {skipped} record(s) due to errors.")

Film Video games
Geography Earth science
Film Culture, sociology, and psychology
Philosophy Film
Film Agriculture, food, and drink
Film Television
Video games Film
Film Video games
Television Video games
Film Television
Film Agriculture, food, and drink
Film Agriculture, food, and drink
Film Video games
Video games Film
Film Albums
Philosophy Film
Video games Film
Video games Film
Film Culture, sociology, and psychology
Video games Film
Video games Film
Video games Film
Video games Film
Film Philosophy
Video games Film
Video games Film
Video games Film
Computing and engineering Culture, sociology, and psychology
Recreation Football
Film Video games
Television Film
Television Media and drama
Film Other music articles
Education Culture, sociology, and psychology
Culture, sociology, and psychology Art
Video games Film
Hockey Football
Video games Film
Film Culture, sociology, and psychology
Film Pro wrestling
Transport Football
Video games Film
Video games Film
Film Video games
Film Cultur