In [1]:
import torch
from torch import nn, optim
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import jieba.posseg as pseg
import re

In [None]:
# Tokenizer + encoder pair for the `bert-base-cased` checkpoint (bound to the `en_` names).
en_tokenizer, en_model = (
    AutoTokenizer.from_pretrained("bert-base-cased"),
    AutoModel.from_pretrained("bert-base-cased"),
)
# Tokenizer + encoder pair for the `bert-base-chinese` checkpoint (bound to the `zh_` names).
zh_tokenizer, zh_model = (
    AutoTokenizer.from_pretrained("bert-base-chinese"),
    AutoModel.from_pretrained("bert-base-chinese"),
)

In [None]:
# Raw notification table; the cells below read its `appName`, `category`,
# `title`, `content` and `display_order` columns.
df1 = pd.read_csv(filepath_or_buffer='./notification_sorted.csv')

# Regex patterns used to scrub notification text before word segmentation.
# All pattern *values* are unchanged; only invalid string escapes (which raise
# SyntaxWarning on Python 3.12+) were rewritten as explicit backslashes.

# r1: runs of ASCII letters/digits and ASCII / full-width punctuation.
r1 = u'[a-zA-Z0-9’!"#$%&\'()*+「」,-./:;<=>?°·@，。?★、…【】《》？“”‘’［］！[\\]^_`{|}~]+'
# r2: whitespace/punctuation runs. Not referenced by the cleaning loop in this cell.
r2 = "[\\s+\\.\\!\\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+"
# r3: not referenced by the cleaning loop in this cell.
# NOTE(review): inside the first class `#-|` is parsed as a character *range*
# ('#'..'|'), which also covers ASCII digits and letters -- confirm before reuse.
r3 = "[.!//_,$&%^*()<>+\"'?@#-|:~{}]+|[——！\\\\，。=？、：“”‘’《》【】￥……（）]+"
# r4: 【...】, 《...》 and #...# spans (brackets and contents), then punctuation.
# NOTE(review): the original literal contained `+""'`, which is implicit string
# concatenation, so the double-quote character was never part of this class.
# That value is preserved here; `"` is still stripped later by r1.
r4 = "\\【.*?】+|\\《.*?》+|\\#.*?#+|[.!/_,$&%^*()<>+'?@|:~{}#]+|[——！\\\\，。=？、：“”‘’￥……（）《》【】]"

# Runs of emoji code points that should be removed from message text.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U0001F9D0"
    "]+",
    flags=re.UNICODE,
)
        
def _is_blank(value):
    """Return True when a cell is missing (NaN/None) or an empty string.

    `str(float('nan'))` is the 3-character string 'nan', so a bare
    `len(str(value)) == 0` test never detects missing CSV cells.
    """
    return pd.isna(value) or len(str(value)) == 0


def _clean_content(raw):
    """Scrub one message body with the notebook's regexes and return the result.

    Steps, in order: r4, r1, spaces, newlines, zero-width space/joiner, emoji,
    URL-like tokens.
    """
    sentence = re.sub(r4, '', str(raw))
    sentence = re.sub(r1, '', sentence)
    sentence = re.sub(r" ", "", sentence)
    sentence = re.sub("\n", "", sentence)
    sentence = re.sub("\u200b", "", sentence)
    sentence = re.sub("\u200d", "", sentence)
    sentence = emoji_pattern.sub(r'', sentence)
    # NOTE(review): r1 has already removed every ASCII letter, so this can no
    # longer match. Kept in place because moving it earlier would change
    # results (`\S+` would also swallow text glued to the URL) -- decide explicitly.
    sentence = re.sub(r"http\S+", "", sentence)
    return sentence


# Index labels of rows to discard: a required text field is blank, or the
# content is empty after scrubbing / segmentation.
droplist = list()
text_columns = ['appName', 'category', 'title', 'content']

# Iterate by index *label* so that reads, the write-back and the final drop all
# address the same row even when the index is not 0..n-1 (e.g. on a re-run).
for label, fields in zip(df1.index, df1[text_columns].itertuples(index=False, name=None)):

    if any(_is_blank(field) for field in fields):
        droplist.append(label)
        continue

    sentence = _clean_content(fields[-1])  # last entry of text_columns is 'content'

    if len(sentence) == 0:
        droplist.append(label)
        continue

    # Segment with jieba and drop tokens tagged 'x'; join the rest with spaces.
    words = pseg.cut(sentence)
    sentence = ' '.join([word for word, flag in words if flag != 'x'])

    if len(sentence) == 0:
        droplist.append(label)
        continue

    df1.loc[label, 'content'] = sentence

df1.drop(droplist, axis=0, inplace=True)

In [6]:
def _first_token_last4(model, tokenizer, row):
    """Return first-token vectors from the last four hidden layers for `row`.

    `row` is a list of strings encoded as one padded batch. The last four
    hidden-state tensors are concatenated along dim 0 and sequence position 0
    (`[CLS]` for BERT tokenizers) is kept, giving one vector per (layer, string).
    """
    encoded = tokenizer(row, return_tensors='pt', padding=True, truncation=True)
    # `.hidden_states` is the same tuple the original reached via positional `[2]`.
    hidden_states = model(**encoded, output_hidden_states=True).hidden_states
    return torch.cat(hidden_states[-4:])[:, 0]


# Label assigned to rows whose `display_order` is NaN.
MISSING_DISPLAY_ORDER = 6

text = df1[['appName', 'category', 'title', 'content']].values.astype(str).tolist()

# Pure feature extraction: disable autograd so no graph is built (and kept
# alive) for each forward pass. Output values are unchanged.
with torch.no_grad():
    X = torch.stack([
        torch.cat((
            _first_token_last4(en_model, en_tokenizer, row),
            _first_token_last4(zh_model, zh_tokenizer, row),
        ))
        for row in tqdm(text)
    ])

y = torch.LongTensor([
    MISSING_DISPLAY_ORDER if np.isnan(label) else int(label)
    for label in df1['display_order'].values.tolist()
])

100%|██████████| 15483/15483 [1:57:03<00:00,  2.20it/s]  


In [7]:
# Show both tensor shapes (one per line), then persist features and labels.
print(X.shape, y.shape, sep='\n')
torch.save(obj=X, f='X.pt')
torch.save(obj=y, f='y.pt')

torch.Size([15483, 32, 768])
torch.Size([15483])
