In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers[sentencepiece]
!pip install datasets

In [None]:
!pip install bert_score

In [None]:
import os
import json
import numpy as np
import pandas as pd
import re

import random

from tqdm.auto import tqdm, trange

import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "darkgrid" style (grey background) for every plot.
# On seaborn 0.11.0 or above, sns.set_theme() is the equivalent call.
sns.set(style="darkgrid")

import logging
import transformers
from bert_score import score
# Hide the model-loading messages by raising the level of these three
# transformers module loggers to ERROR.
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

In [None]:
# Root folder of the dataset on the mounted Drive; it is listed below to get
# one entry per match.
file_path = '/content/drive/MyDrive/sports_lp'
# File names that are joined onto each match folder when loading a match.
comment_path = "live.json"
news_path = "news.txt"
# os.listdir() returns entries in arbitrary order, so sort them: otherwise the
# seeded random.sample() further down can pick different matches on each run.
match_path = sorted(os.listdir(file_path))

In [None]:
# Number of entries found in the dataset folder.
len(match_path)

1164

In [None]:
# Load one example match (live commentary JSON + news report text) to try the
# pipeline on. Context managers close both files as soon as they are read, and
# the explicit encoding keeps the Chinese text independent of the platform
# default.
sample_dir = os.path.join(file_path, 'laliga_0080')
with open(os.path.join(sample_dir, comment_path), 'r', encoding='utf-8') as comment_file:
    load_comment = json.load(comment_file)
with open(os.path.join(sample_dir, news_path), 'r', encoding='utf-8') as news_file:
    load_news = news_file.read()
len(load_news)

1636

In [None]:
def convert_comment(comment_data):
    """Flatten raw live-commentary data into (minute, score, comment) tuples.

    Args:
        comment_data: Parsed commentary JSON. The entries are read from
            ``comment_data['result']['data']``; each entry is a mapping with
            the keys ``'t'`` (time text), ``'s'`` (half marker), ``'s1'`` and
            ``'s2'`` (the two scores) and ``'m'`` (comment text).

    Returns:
        A list of ``(minute, score, comment)`` tuples, in input order, where
        ``minute`` is an int and ``score`` is ``'<s1>-<s2>'``. Entries whose
        time text is empty or does not start with digits are skipped.

    Raises:
        KeyError: If a non-empty half marker is neither '上' nor '下'.
    """
    # Minutes to add depending on the half: '上' is the first half, '下' the second.
    half_offset = {'上': 0, '下': 45}
    converted = []
    for entry in comment_data['result']['data']:
        if len(entry['t']) == 0:
            continue
        leading_digits = re.match(r'\d+', entry['t'])  # digit part of the time
        if not leading_digits:
            continue
        # An empty half marker means the time is used as-is.
        offset = half_offset[entry['s']] if len(entry['s']) != 0 else 0
        minute = int(leading_digits.group()) + offset
        score_text = '-'.join([entry['s1'], entry['s2']])
        converted.append((minute, score_text, entry['m']))
    return converted


_ZH_BERT_SCORER = None  # created on first use by _get_zh_scorer(), then reused


def _get_zh_scorer():
    """Return a shared Chinese BERTScorer, building it on first use.

    The module-level ``bert_score.score`` function loads the model again on
    every call; keeping one ``BERTScorer`` instance avoids that repeated work
    while using the same settings (lang="zh", rescale_with_baseline=True).
    """
    global _ZH_BERT_SCORER
    if _ZH_BERT_SCORER is None:
        # Imported here because it is only needed when a time window holds
        # more than one candidate comment.
        from bert_score import BERTScorer
        _ZH_BERT_SCORER = BERTScorer(lang="zh", rescale_with_baseline=True)
    return _ZH_BERT_SCORER


def _comments_in_window(comment_df, earliest, latest):
    """Return the comments whose minute lies in [earliest, latest] (inclusive)."""
    in_window = (comment_df["Time"] >= earliest) & (comment_df["Time"] <= latest)
    return comment_df.loc[in_window, 'Comment'].tolist()


def _closest_comment(sentence, candidates):
    """Return the candidate comment with the highest BERTScore F1 against sentence."""
    if len(candidates) == 1:  # a unique candidate needs no scoring
        return candidates[0]
    _, _, f1 = _get_zh_scorer().score([sentence] * len(candidates), candidates)
    return candidates[int(f1.argmax())]


def map_comment_news(comment_list, news_data):
    """Align news sentences with live comments and label the comments.

    Args:
        comment_list: Iterable of ``(time, score, comment)`` tuples, as
            produced by ``convert_comment``.
        news_data: Raw news report text; HTML tags and newlines are stripped
            before it is split into sentences on '。'.

    Returns:
        A tuple ``(labelled, mapped_lst)``:
        * ``labelled`` is a 2-column array with one row per comment:
          the comment and a label that is 1 when the comment text was matched
          to some news sentence, otherwise 0.
        * ``mapped_lst`` is a list of ``(comment, news_sentence)`` pairs, one
          per news sentence for which a comment was found.
    """
    comment_df = pd.DataFrame(comment_list, columns=['Time', 'Score', 'Comment'])

    news_text = re.sub(r'<[^>]+>|[\n]', '', news_data)  # drop HTML tags and newlines

    mapped_lst = []
    for t in news_text.split('。'):  # one iteration per sentence
        news_time = re.search(r'(\d+)分钟', t)  # sentence mentions "<N> minutes"
        if news_time:
            minute = int(news_time.group(1))
            # Accept comments from one minute before to two minutes after.
            earliest, latest = minute - 1, minute + 2
        elif re.search(r'一开场，|开场后|[\d]+秒', t):
            # Kick-off wording or a time in seconds: treat it as minute 1 with
            # a one-minute tolerance on each side.
            earliest, latest = 0, 2
        else:
            continue  # no timing information in this sentence

        candidates = _comments_in_window(comment_df, earliest, latest)
        if candidates:  # skip sentences with no comment in the window
            mapped_lst.append((_closest_comment(t, candidates), t))

    # Build the classification data: label 1 if the comment text was used for
    # some news sentence, otherwise 0.
    mapped_comm = [pair[0] for pair in mapped_lst]
    comm_arr = np.array(comment_df['Comment'])
    label_arr = np.array(
        [1.0 if comment in mapped_comm else 0.0 for comment in comm_arr],
        dtype=float,
    )
    labelled = np.hstack((comm_arr.reshape(-1, 1), label_arr.reshape(-1, 1)))
    return labelled, mapped_lst

In [None]:
# Convert the example match's raw commentary into (minute, score, comment) tuples.
com_list = convert_comment(load_comment)

In [None]:
# Preview the first few converted comments rather than printing the whole list.
com_list[:10]

In [None]:
# Seed the RNG, then draw 600 entries from match_path without replacement.
# The draw is only repeatable for a given ordering of match_path.
random.seed(10)
input_matches = random.sample(match_path, 600)
len(input_matches)

600

In [None]:
# Build the comment/news mapping and the classifier rows for every sampled match.
mapped_dict = {}
# Placeholder row kept as row 0 so the later cells, which skip it with
# clf_train[1:], keep working unchanged.
clf_blocks = [np.array(['comments', 0])]

for match in tqdm(input_matches):  # the sampled subset of the dataset
    if match == '.DS_Store':  # macOS folder metadata, not a match folder
        continue
    match_dir = os.path.join(file_path, match)
    # Context managers close each file right after reading instead of leaking
    # two handles per match.
    with open(os.path.join(match_dir, comment_path), 'r', encoding='utf-8') as comment_file:
        load_comment = json.load(comment_file)
    with open(os.path.join(match_dir, news_path), 'r', encoding='utf-8') as news_file:
        load_news = news_file.read()

    com_list = convert_comment(load_comment)
    clf_data, map_outcome = map_comment_news(com_list, load_news)

    mapped_dict[match] = map_outcome
    clf_blocks.append(clf_data)

# Stack once at the end: calling np.vstack inside the loop copies the whole
# accumulated array again on every iteration.
clf_train = np.vstack(clf_blocks)

  0%|          | 0/600 [00:00<?, ?it/s]



In [None]:
# Inspect the stacked rows; row 0 is the placeholder ['comments', 0] row.
clf_train

array([['comments', '0'],
       ['可惜力量过大，直接传出了底线', 0.0],
       ['女王公园还在控球，朴智星过顶球找西塞', 0.0],
       ...,
       ['夺冠路上这场比赛曼城不容有失！！', 0.0],
       ['萨巴莱塔斜长传找米尔纳，力量太大了，米尔纳没有追到', 0.0],
       ['富勒姆开场没有任何试探，直接大脚传入曼城禁区，可惜是传大了', 0.0]], dtype=object)

In [None]:
# Average number of comments labelled 1 per processed match
# (row 0 is the placeholder row, so it is skipped).
np.sum(clf_train[1:,1])/len(mapped_dict)

12.568333333333333

In [None]:
# Save the per-match (comment, news sentence) pairs as JSON in the working directory.
with open('mapped_data_full.json', 'w') as fp:
    json.dump(mapped_dict, fp)

In [None]:
# Save the training rows without the placeholder first row.
# NOTE(review): the array displayed above has dtype=object, so NumPy stores it
# via pickle and np.load will need allow_pickle=True to read it back.
np.save('clf_train_full.npy', clf_train[1:,:])

In [None]:
# Preview the mapping of a single match; displaying the whole dict would print
# every sentence pair of every processed match.
next(iter(mapped_dict.items()), None)