## Out.txt

In [8]:
import json
import pandas as pd
import numpy as np


CHANNEL_LIST_FILE = "data/channel_list_merged/finance_kw_10k_sub_+.csv"
CHANNEL_QUALITY_FILE = "data/channel_quality/finance_kw_10k_sub_+_filter_0-200.jsonl"

In [9]:
df = pd.read_csv(CHANNEL_LIST_FILE)

# # remove duplicates
df = df.drop_duplicates()
len(df[df['n_subs'] > 10000])

159

In [10]:
rs = []
with open(CHANNEL_QUALITY_FILE, "r") as f:
    r = f.readlines()
    for i in r:
        # rs.append(json.loads(i.replace("'", "\"").replace("False", "false").replace("True", "true")))
        rs.append(json.loads(i))

In [11]:

def clean_channel_cond(r):
    n_total = len(r['snrss'])
    n_snr_ok = (np.array(r['snrss']) > 20).sum()
    # n_ac_ok = (np.array(r['acss'])).sum()
    ac_speech_probs = []
    for ac_seg in r['acss']:
        for item in ac_seg:
            if item['label'] == 'Speech':
                ac_speech_probs.append(item['score'])
    n_ac_ok = (np.array(ac_speech_probs) > 0.9).sum()

    mean_snr = np.mean(r['snrss'])
    mean_ac = np.mean(ac_speech_probs)

    is_good = n_total > 0 and n_snr_ok / n_total > 0.1 and n_ac_ok / n_total > 0.1
    return is_good, {
        'n_total': n_total,
        'n_snr_ok': n_snr_ok,
        'n_ac_ok': n_ac_ok,
        'mean_snr': mean_snr,
        'mean_ac': mean_ac,
        'url': r['url'],
    }

r_ok = []
n_ok = []
r_not_ok = []
n_not_ok = []
for r in rs:
    is_good, info = clean_channel_cond(r)
    if is_good:
        r_ok.append(r)
        n_ok.append(info)
    else:
        r_not_ok.append(r)
        n_not_ok.append(info)

In [12]:
ok_df = df[df['url'].isin([r['url'] for r in r_ok])]
not_ok_df = df[df['url'].isin([r['url'] for r in r_not_ok])]

# drop duplicate title
ok_df = ok_df.drop_duplicates(subset=['title'])
not_ok_df = not_ok_df.drop_duplicates(subset=['title'])

pass

In [13]:
len(ok_df)

11

In [14]:
len(not_ok_df)

85

In [15]:
# join n_total, n_snr_ok, n_ac_ok
ok_df = pd.merge(ok_df, pd.DataFrame(n_ok), left_on='url', right_on='url')
not_ok_df = pd.merge(not_ok_df, pd.DataFrame(n_not_ok), left_on='url', right_on='url')

# move column 'url' to the last
cols = list(ok_df.columns)
cols.remove('url')
cols.append('url')
ok_df = ok_df[cols]
not_ok_df = not_ok_df[cols]

In [16]:
# not_ok_df order by mean snr 
not_ok_df.sort_values(by='mean_snr', ascending=False)

Unnamed: 0,title,id,n_videos,n_views,n_subs,custom_url,email,n_total,n_snr_ok,n_ac_ok,mean_snr,mean_ac,url
116,Kênh Tài Chính,UCCIAvhlD3u_sw-jQGJSOu9Q,122,4119174,100000,@kenhtaichinh,,479,474,9,62.666973,0.824798,https://www.youtube.com/channel/UCCIAvhlD3u_sw...
87,Blog Đầu Tư Coin,UC4SmVKbVy9YoH4n3h3Na11A,45,6116,19800,@blogdautucoin,,294,276,2,36.618277,0.797052,https://www.youtube.com/channel/UC4SmVKbVy9YoH...
86,Blog Đầu Tư Coin,UC4SmVKbVy9YoH4n3h3Na11A,45,6116,19800,@blogdautucoin,,294,276,2,36.618277,0.797052,https://www.youtube.com/channel/UC4SmVKbVy9YoH...
98,Ông già đầu tư - Một thế giới trading,UCfAoLSXBIc3TWq5M0gfFhBg,2607,2744906,23900,@thegioidautu,,317,237,1,35.220124,0.759960,https://www.youtube.com/channel/UCfAoLSXBIc3TW...
117,NHÀ ĐẦU TƯ TÀI CHÍNH TỰ DO,UC3wrR79zN6uXej9iFOmCLOA,832,3042198,22000,@nhadaututaichinhtudo,,256,189,0,32.133825,0.795293,https://www.youtube.com/channel/UC3wrR79zN6uXe...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22,Tài chính 24h,UCdz6hH0McH6LxVg4yeB0uvA,150,189833,11200,@taichinh24g,,47,0,0,5.017991,0.599375,https://www.youtube.com/channel/UCdz6hH0McH6Lx...
23,Tài chính 24h,UCdz6hH0McH6LxVg4yeB0uvA,150,189833,11200,@taichinh24g,,47,0,0,5.017991,0.599375,https://www.youtube.com/channel/UCdz6hH0McH6Lx...
49,Trường Đại học Kinh tế - Kỹ thuật Công nghiệp,UC41cAPLHMe5MxZyOI4wkESA,106,2700377,146000,@uneti_dkk,,178,3,0,2.804704,0.384192,https://www.youtube.com/channel/UC41cAPLHMe5Mx...
50,Trường Đại học Kinh tế - Kỹ thuật Công nghiệp,UC41cAPLHMe5MxZyOI4wkESA,106,2700377,146000,@uneti_dkk,,178,3,0,2.804704,0.384192,https://www.youtube.com/channel/UC41cAPLHMe5Mx...


In [17]:
ok_df.to_csv('tmp_16_channel_ok.csv', index=False)

In [18]:
# # move column 'url' to the last
# cols = list(ok_df.columns)
# cols.remove('url')
# cols.append('url')
# ok_df = ok_df[cols]
# not_ok_df = not_ok_df[cols]

# ok_df.to_csv('channels_10k_sub+_first_200_clean.csv', index=False)
# not_ok_df.to_csv('channels_10k_sub+_first_200_not_clean.csv', index=False)