In [4]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from cleantext import clean

import swifter

In [5]:
def preprocess_row(row, date, parent, headline):
    text = row['text']
    text = clean(text, lower=False, no_urls=True, no_emails=True, zero_digits=True)
    tokens = []
    
    if date:
        date = datetime.fromtimestamp(row['timestamp'])
        weekday = ' xxweekday_' + str(date.weekday())
        hour = ' xxhour_' + str(date.hour // 3) # reduce option space
        tokens += [weekday, hour]
    if parent:
        is_reply = ' xxreplysep xxreply_' + str(pd.isnull(row['parentid'])).lower()
        tokens.append(is_reply)
    if headline:
        headline = clean(row['headline'], lower=False, no_urls=True, no_emails=True, zero_digits=True)
        tokens.append(headline + '.')
        
    return ' '.join([text] + tokens)

In [6]:
def preprocess(IN_PATH, OUT_PATH, test=False, date=True, parent=True, headline=True):
    out_base = f"dat_{date}_par_{parent}_hea_{headline}".lower()
    files = ['train.csv', 'val.csv', 'test.csv'] if test else ['train.csv', 'val.csv']
    for f in files:
        df = pd.read_csv(IN_PATH/f)
        df['text_proc'] = df.swifter.apply(lambda x: preprocess_row(x, date=date, parent=parent,headline=headline), axis=1)
        ! mkdir -p {OUT_PATH/out_base}
        df.to_csv(OUT_PATH/out_base/f)

In [7]:
hea = False
dat = False
par = True

In [8]:
IN = Path('/mnt/data/group07/johannes/ynacc_proc/replicate/split/')
OUT = Path('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/cls/')
preprocess(IN, OUT, headline=hea, date=dat, parent=par, test=True)

Pandas Apply: 100%|██████████| 7910/7910 [00:06<00:00, 1167.65it/s]
Pandas Apply: 100%|██████████| 583/583 [00:00<00:00, 1197.19it/s]
Pandas Apply: 100%|██████████| 553/553 [00:00<00:00, 962.48it/s]


In [9]:
IN = Path('~/data/ynacc_proc/replicate/lmdata_complete')
OUT = Path('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/')
preprocess(IN, OUT, headline=hea, date=dat, parent=par)

Pandas Apply: 100%|██████████| 200000/200000 [03:06<00:00, 1072.77it/s]
Pandas Apply: 100%|██████████| 38512/38512 [00:35<00:00, 1092.52it/s]


In [3]:
df = []
for x in ['train.csv', 'val.csv']:
    df.append(pd.read_csv('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/dat_false_par_false_hea_false/' + x))
df = pd.concat(df)

In [4]:
res = df.groupby('sdid').apply(lambda x: x.sort_values(by=['commentindex']))

In [5]:
res

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,commentagreement,commentid,commentindex,constructiveclass,guid,headline,intendedaudience,parentid,persuasiveness,...,sdid,sentiment,text,thumbs-down,thumbs-up,timestamp,tone,topic,url,text_proc
sdid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
4,161346,161346,,1461615323534-9421e747-ccaa-4d2b-8819-c3c660d6...,0,,2xps6g04~E6Dy3uGsWXn,Is BlackBerry (BBRY) Stock a Solid Choice Righ...,,,,...,4,,Blackberry is lurking at such a puny low marke...,4.0,0.0,1.461615e+09,,,https://www.zacks.com/stock/news/214545/is-bla...,Blackberry is lurking at such a puny low marke...
4,161350,161350,,00002b000000000000000000000000-e944c5ab-1d67-4...,1,,2xps6g04~E6Dy3uGsWXn,Is BlackBerry (BBRY) Stock a Solid Choice Righ...,,1461615323534-9421e747-ccaa-4d2b-8819-c3c660d6...,,...,4,,This is why Watsa is propping up the Blackberr...,3.0,0.0,1.461615e+09,,,https://www.zacks.com/stock/news/214545/is-bla...,This is why Watsa is propping up the Blackberr...
4,161363,161363,,00003b000000000000000000000000-3d0d39e9-7397-4...,2,,2xps6g04~E6Dy3uGsWXn,Is BlackBerry (BBRY) Stock a Solid Choice Righ...,,1461615323534-9421e747-ccaa-4d2b-8819-c3c660d6...,,...,4,,Blackberry is a solid choice if you are a Indi...,3.0,0.0,1.461615e+09,,,https://www.zacks.com/stock/news/214545/is-bla...,Blackberry is a solid choice if you are a Indi...
4,168315,168315,,00004g000000000000000000000000-176a7f54-903f-4...,3,,sCU9pX5x7551cWnWT4ME,Is BlackBerry (BBRY) Stock a Solid Choice Righ...,,1461615323534-9421e747-ccaa-4d2b-8819-c3c660d6...,,...,4,,"Troll, its funny you have become so millennial...",1.0,2.0,1.461700e+09,,,https://www.zacks.com/stock/news/214545/is-bla...,"Troll, its funny you have become so millennial..."
4,172477,172477,,00005b000000000000000000000000-94753326-c5d7-4...,4,,2xps6g04~E6Dy3uGsWXn,Is BlackBerry (BBRY) Stock a Solid Choice Righ...,,1461615323534-9421e747-ccaa-4d2b-8819-c3c660d6...,,...,4,,LOL,2.0,0.0,1.461735e+09,,,https://www.zacks.com/stock/news/214545/is-bla...,LOL
4,178250,178250,,00006b000000000000000000000000-19beb717-075d-4...,5,,2xps6g04~E6Dy3uGsWXn,Is BlackBerry (BBRY) Stock a Solid Choice Righ...,,1461615323534-9421e747-ccaa-4d2b-8819-c3c660d6...,,...,4,,LOL at the gross Blackberry fanboys. Overgrown...,3.0,0.0,1.461788e+09,,,https://www.zacks.com/stock/news/214545/is-bla...,LOL at the gross Blackberry fanboys. Overgrown...
9,23366,23366,,1459911148793-88445c27-8f71-4dff-a2fb-a06a5519...,0,,0AbDEIZJRVmRDsdBrKjq,Altria (MO) is the Best Tobacco Stock to Buy N...,,,,...,9,,poorly written article. 100% of PM internation...,0.0,2.0,1.459911e+09,,,https://www.zacks.com/stock/news/212399/altria...,poorly written article. 000% of PM internation...
9,23883,23883,,00002n000000000000000000000000-1bc717f7-f205-4...,1,,jQZwgGp0W6qslVdkiX~8,Altria (MO) is the Best Tobacco Stock to Buy N...,,1459911148793-88445c27-8f71-4dff-a2fb-a06a5519...,,...,9,,Yes it's 1.27 a share currency. But in 2014 PM...,0.0,0.0,1.459917e+09,,,https://www.zacks.com/stock/news/212399/altria...,Yes it's 0.00 a share currency. But in 0000 PM...
9,24903,24903,,00003g000000000000000000000000-a4a0b46b-4dc3-4...,2,,0AbDEIZJRVmRDsdBrKjq,Altria (MO) is the Best Tobacco Stock to Buy N...,,1459911148793-88445c27-8f71-4dff-a2fb-a06a5519...,,...,9,,"currency will change that quickly, that and le...",0.0,0.0,1.459942e+09,,,https://www.zacks.com/stock/news/212399/altria...,"currency will change that quickly, that and le..."
21,45320,45320,,1460176786610-07bbcb0e-2284-48cb-9b88-87b33c0f...,0,,hbMAMfmw9nUIe3ECCQL1,Math Teacher Marries 18-Year-Old Former Studen...,,,,...,21,,What could a 40 year old have in common with a...,0.0,12.0,1.460177e+09,,,http://www.cosmopolitan.com/sex-love/news/a564...,What could a 00 year old have in common with a...


In [41]:
res2 = res.groupby('sdid').apply(lambda x: x['text_proc'].str.cat(sep=' '))

xx = pd.DataFrame(res2).reset_index()

xx['text_proc'] = xx[0]

xx = xx[['sdid', 'text_proc']]

! mkdir -p '/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/dat_false_par_false_hea_false_bysdid/'

xx[:30000].to_csv('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/dat_false_par_false_hea_false_bysdid/train.csv', index=False)

xx[30000:].to_csv('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/dat_false_par_false_hea_false_bysdid/val.csv', index=False)

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


In [7]:
res2 = res.groupby('sdid').apply(lambda x: ' xx_thread_start xx_comment_start ' + x['text_proc'].str.cat(sep=' xx_comment_end xx_comment_start ') + ' xx_comment_end xx_thread_end ')

xx = pd.DataFrame(res2).reset_index()

xx['text_proc'] = xx[0]

xx = xx[['sdid', 'text_proc']]

! mkdir -p '/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/dat_false_par_false_hea_false_bysdid_sep/'

xx[:30000].to_csv('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/dat_false_par_false_hea_false_bysdid_sep/train.csv', index=False)

xx[30000:].to_csv('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/dat_false_par_false_hea_false_bysdid_sep/val.csv', index=False)

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


In [6]:
res2 = res.groupby('sdid').apply(lambda x: x['text_proc'].str.cat(sep=' xxbos '))

xx = pd.DataFrame(res2).reset_index()

xx['text_proc'] = xx[0]

xx = xx[['sdid', 'text_proc']]

! mkdir -p '/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/sep_bos/'

xx[:30000].to_csv('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/sep_bos/train.csv', index=False)

xx[30000:].to_csv('/mnt/data/group07/johannes/ynacc_proc/proper_baseline/lm/sep_bos/val.csv', index=False)

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.
