In [None]:
!pip install transformers[sentencepiece]

In [None]:
!pip install datasets

In [72]:
import numpy as np
import pandas as pd 
import re
import os
import json

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

from transformers import AdamW, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, get_scheduler, TrainingArguments, Trainer
from datasets import Dataset

from sklearn.model_selection import KFold

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.utils import shuffle


In [4]:
data_pre = np.load("clf_train_full.npy", allow_pickle=True)
label_pre =  data_pre[:,1]

In [6]:
df = pd.DataFrame(data_pre)

In [10]:
df = df.rename(columns = {0:'text', 1:'labels'})
df['labels'] = df['labels'].astype('int64')

In [14]:
df.head()

Unnamed: 0,text,labels
0,可惜力量过大，直接传出了底线,0
1,女王公园还在控球，朴智星过顶球找西塞,0
2,补时时间已经到了，裁判随时可以吹停比赛啦,0
3,伊万诺维奇伸腿将球挡出边线,0
4,奥努哈左路拿球，面对伊万诺维奇的防守，脚跟后传球找前插的朴智星,0


In [16]:
np.sum(df['labels'])/len(df) * 100

7.501019565714741

In [17]:
print(len(df))

100533


In [20]:
data = shuffle(df, random_state=1)
train = data[:90000]
test = data[90000:]

In [21]:
train.to_csv('train_clf.csv', index=False)
test.to_csv('test_clf.csv', index=False)

In [22]:
train.head()

Unnamed: 0,text,labels
80839,切尔西边路再给,0
79345,库蒂尼奥左路拿球，禁区外围远射打在了防守球员身上,1
97333,马塔中场接球给到边路,0
14316,断球之后快速反击~,0
80036,奥斯卡中路长传球找左路插上的科尔,0


## use Bert base chinese to do classification

In [23]:

trainset = Dataset.from_pandas(train)
testset = Dataset.from_pandas(test)

In [24]:
trainset

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 90000
})

In [25]:
checkpoint = "bert-base-chinese"  # the model has no maximum length parameter to pad with

tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


tokenized_trainset = trainset.map(tokenize_function, batched=True)
tokenized_devset = testset.map(tokenize_function, batched=True)


tokenized_trainset = tokenized_trainset.remove_columns(["text"])
tokenized_devset = tokenized_devset.remove_columns(["text"])
tokenized_trainset = tokenized_trainset.remove_columns(["__index_level_0__"])
tokenized_devset = tokenized_devset.remove_columns(["__index_level_0__"])

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

  0%|          | 0/90 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [26]:
tokenized_trainset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 90000
})

In [27]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [60]:
args = TrainingArguments(
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 3e-6,
    num_train_epochs=5,
    report_to="none",
    output_dir='bert_1',
    logging_steps = 200
)


trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_devset,
    tokenizer=tokenizer
)

PyTorch: setting up devices


In [61]:
trainer.train()

***** Running training *****
  Num examples = 90000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56250


Epoch,Training Loss,Validation Loss
1,0.1975,0.263905
2,0.1883,0.236975
3,0.2224,0.238096
4,0.2229,0.247324
5,0.1751,0.245102


***** Running Evaluation *****
  Num examples = 10533
  Batch size = 8
Saving model checkpoint to bert_1/checkpoint-11250
Configuration saved in bert_1/checkpoint-11250/config.json
Model weights saved in bert_1/checkpoint-11250/pytorch_model.bin
tokenizer config file saved in bert_1/checkpoint-11250/tokenizer_config.json
Special tokens file saved in bert_1/checkpoint-11250/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10533
  Batch size = 8
Saving model checkpoint to bert_1/checkpoint-22500
Configuration saved in bert_1/checkpoint-22500/config.json
Model weights saved in bert_1/checkpoint-22500/pytorch_model.bin
tokenizer config file saved in bert_1/checkpoint-22500/tokenizer_config.json
Special tokens file saved in bert_1/checkpoint-22500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10533
  Batch size = 8
Saving model checkpoint to bert_1/checkpoint-33750
Configuration saved in bert_1/checkpoint-33750/config.json
Model weights saved

TrainOutput(global_step=56250, training_loss=0.2204602192518446, metrics={'train_runtime': 3853.2476, 'train_samples_per_second': 116.785, 'train_steps_per_second': 14.598, 'total_flos': 8961924323262720.0, 'train_loss': 0.2204602192518446, 'epoch': 5.0})

In [62]:
predictions = trainer.predict(tokenized_devset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids
print('acc:', np.sum(y_true == y_pred) / len(y_true))


***** Running Prediction *****
  Num examples = 10533
  Batch size = 8


acc: 0.9354409949681952


In [63]:
print("precision", precision_score(y_true, y_pred))

precision 0.6144366197183099


## Try with real data

In [65]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [118]:
file_path = '/content/drive/MyDrive/sports_lp'
comment_path = "live.json"
news_path = "news.txt"
match_path = os.listdir(file_path)

In [119]:
load_comment = json.load(open('/'.join([file_path, 'laliga_0158', comment_path]),'r'))
load_news = open('/'.join([file_path, 'laliga_0158', news_path]), "r").read()

In [120]:
timeline = []
comment = []
for c in load_comment['result']['data']:
    if c['t'] != '':     
        timeline.append(re.sub('\'', '', c['t']))   # delete the ' sign
        comment.append(c['m'])

In [122]:
# use the trained model to predict
match_real = pd.DataFrame([timeline, comment]).T.rename(columns = {0:'time', 1:'text'})
sampleset = Dataset.from_pandas(match_real[['text']])
tokenized_sampleset = sampleset.map(tokenize_function, batched=True)
tokenized_sampleset = tokenized_sampleset.remove_columns(["text"])

result = trainer.predict(tokenized_sampleset)
output = np.argmax(result.predictions, axis=1)
match_real['status'] = output
match_real[match_real['status'] == 1].to_csv('/content/drive/MyDrive/'+'laliga_0158'+'.csv', index=False)



  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running Prediction *****
  Num examples = 121
  Batch size = 8
