In [1]:
import pandas as pd
from pathlib import Path

try:
    from flair.models import SequenceTagger
    from flair.data import Sentence
    import spacy

except:
    print('no flair')
    
from news_utils.clean.english import clean
import swifter

In [2]:
BASE_PATH = Path('/mnt/data/datasets/ydata-ynacc-v1_0')
UNL = BASE_PATH/'ydata-ynacc-v1_0_unlabeled_conversations.tsv'
OUT = Path('~/data/ynacc_proc/replicate/lmdata_art_match')

In [5]:
def process(articles, comments, already=False):
    nlp = spacy.load('en_core_web_lg', disable=['ner'])
    tagger = SequenceTagger.load('ner-ontonotes')

    def find_ent_article(t):
        t = clean(t, lower=False)
        sents = []
        for s in nlp(t).sents:
            sents.append(Sentence(' '.join([str(t) for t in s]), use_tokenizer=True)) # use_tokenizer important because the text is not whitespace tokenized
        tagger.predict(sents, mini_batch_size=64)

        res = []
        for sent in sents:
            dic = sent.to_dict(tag_type='ner')
            if 'entities' in dic:
                for ent in dic['entities']:
                    res.append({'text': ent['text'].lower(), 'type': 'xx' + ent['type'].lower()}) # prepare for further use
        return res
    
    if not already:
        articles['ners'] = articles['text'].swifter.apply(lambda x: find_ent_article(str(x)))
    
    print('done with articles')

    comments_fin = articles.merge(comments, how="left", on="url", suffixes=('_article', ''))

    tok = spacy.blank('en')
    def preprocess_comment(row):
        ners = row['ners']
        text = row['text']
        text = str(text)
        text = clean(text, lower=False)
        tokens = tok.tokenizer(text)
        
        # preserving the casing is important!
        done = set()
        for ner in ners:
            if ' ' in ner['text']:
                continue
            if ner['text'] in done:
                continue
            done.add(ner['text'])
            tokens = [str(t) if str(t).lower() != ner['text'] else str(t) + ' ' + ner['type'] + ' xx_article_match ' for t in tokens]
        first_stage = ' '.join([str(t) for t in tokens])
        
        for ner in ners:
            if ' ' not in ner['text']:
                continue
            if ner['text'] in first_stage.lower():
                start_idx = first_stage.lower().index(ner['text'])
                end_idx = start_idx + len(ner['text'])
                fist_stage = first_stage[:end_idx] + ' ' + ner['type'] + ' xx_article_match ' + first_stage[end_idx:]
            
        return first_stage

    comments_fin['text_proc'] = comments_fin.swifter.apply(preprocess_comment)
    return comments_fin

In [6]:
articles = pd.read_csv('articles.csv')
comments = pd.read_csv(UNL, engine='python', sep='\t', quoting=3, error_bad_lines=False, usecols=['url', 'text', 'timestamp'])
comments_out = process(articles, comments)

Pandas Apply: 100%|██████████| 1853/1853 [12:28<00:00,  1.27it/s]


done with articles


Pandas Apply: 100%|██████████| 132771/132771 [03:16<00:00, 677.20it/s] 


In [7]:
pd.options.display.max_colwidth = 500

In [8]:
comments_out[comments_out['text_proc'].str.contains('article_match')][['text', 'text_proc']].shape

(29298, 2)

In [9]:
comments_out.shape

(132771, 8)

In [10]:
df_out = comments_out.sort_values(['timestamp'])
df_out['whatever'] = 1
df_out = df_out[['whatever', 'text_proc']]
df_out[:100000].to_csv(str(OUT) + '/train.csv', index=False, header=False)
df_out[100000:].to_csv(str(OUT) + '/val.csv', index=False, header=False)

In [16]:
# Do it for the classification
PATH_ARTICLES = Path('/mnt/data/group07/johannes/ynacc_proc/articles/articles_fixed_5.csv')
PATH_CL_TR = Path('~/data/ynacc_proc/replicate/split/train.csv')
PATH_CL_VA = Path('~/data/ynacc_proc/replicate/split/val.csv')

In [17]:
df_art = pd.read_csv(PATH_ARTICLES)
df_cl_tr = pd.read_csv(PATH_CL_TR)
df_cl_va = pd.read_csv(PATH_CL_VA)

In [18]:
df_art

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,url,text,title,publish_date
0,0,1,http://www.cosmopolitan.com/lifestyle/news/a56215/disneyland-paris-haunted-mansion-death/,"A Disneyland Paris employee was found dead inside the park's Phantom Manor ride on Saturday, Le Parisien reports. Investigators believe he was a maintenance officer who was fixing a faulty light fixture and accidentally electrocuted.\n\nThough he's yet to be officially identified, a Disneyland union representative told Le Parisien the worker was a 45-year-old man who ""always had a smile on his face"" and was ""very popular with his colleagues."" The park remained open after his body was found, ...",Disneyland Worker Found Dead in Haunted Mansion,2016-04-04 03:03:48
1,1,2,http://www.cosmopolitan.com/health-fitness/news/a56117/chontel-duncan-post-baby-stomach/,"Having documented her baby bump in mirror selfies throughout her whole pregnancy, it's no surprise fitness model Chontel Duncan shared a full-length selfie just five days after welcoming her baby boy. And yeah, she looks as freakin' amazing as you would expect.\n\nAdvertisement - Continue Reading Below\n\n""My body is tender, it feels like someone did 12 rounds in the boxing ring using my core as the target,"" Duncan, who delivered her son Jeremiah by C-section, wrote on Instagram. ""So much sw...",See Fitness Model Chontel Duncan's Post-Baby Belly 5 Days After Giving Birth,2016-04-01 03:17:00
2,2,3,http://www.insideedition.com/headlines/15888-english-teacher-charged-with-sexual-assault-of-a-student-youre-my-baby-boo,"Headlines Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party Playing Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party\n\n\n\n\n\n\n\nA Wisconsin high school teacher is facing charges of sexual assault following an alleged sexual relationship with a student, including on the night of her husband's bachelor's party.\n\nRead: Daycare Center Shuttered After 3-Month-Old Died On Her Mother's First Day Back at Work\n\n\n\nSara Domres, a 29...",Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party,
3,3,4,http://www.ozy.com/good-sht/the-dirtiest-soccer-player-in-the-known-universe/68373,"Because never does a man stand so tall as when he stoops to stomp on the hand of a man in need.\n\nThere’s a twisted delight to be had in watching Real Madrid’s oft-injured central defender sensation, Képler Laveran Lima Ferreira, or Pepe, do what he does best. During the 2014 FIFA World Cup, playing for Portugal against Germany, Pepe walked away, leaving Thomas Müller on the ground, a victim of the sport’s frequent and unavoidable collisions. Pepe’s heart, very possibly full of concern for ...",The Dirtiest Soccer Player in the Known Universe,
4,4,5,http://www.womansday.com/health-fitness/a54485/my-600lb-life-christina-phillips/,"Christina Phillips, who was profiled on TLC's totally non-exploitative program, My 600lb Life: Where Are They Now?, lost 537 pounds via bariatric surgery, and as a result, developed an eating disorder. The 25-year-old told her doctor that she she was living off of 400 calories a day, and he wasn't pleased.\n\nAdvertisement - Continue Reading Below\n\n""That is far below what you should be getting,"" Dr. Nowzaradan said. ""You should double that at least. You're going from one extreme to the oth...",What Happened After This Woman Lost 537 Pounds Will Shock You,2016-04-11 02:54:32
5,5,6,http://www.cosmopolitan.com/style-beauty/fashion/news/a57796/old-navy-interracial-family-ad/,"Old Navy\n\nOn Friday, Old Navy posted an ad on its Twitter feed to promote a 30 percent-off sale. They included a photo of an impossibly beautiful and stylish family wearing its latest spring styles. If you're a normal person, maybe you perked up at the thought of a sale, thought the models were adorable, or just simply scrolled right by.\n\nOh, happy day! Our #ThankYouEvent is finally here. Take 30% off your entire purchase: https://t.co/nGQ9Pji1pN pic.twitter.com/vq4mIczm6A — Old Navy Off...",Old Navy Featured an Interracial Family in an Ad and Racists Couldn't Handle It,2016-05-02 02:03:00
6,6,7,http://www.vibe.com/2016/04/texas-cop-caught-body-slamming-12-year-old-latina-girl/?utm_source=yahoo&utm_medium=syndication,"Just days ago, video footage of a Texas cop body-slamming 12-year-old middle school student Janissa Valdez was shared on YouTube. The clip, with over 2 million views now, shows the San Antonio Independent School District (SAISD) police officer with his arms tightly wrapped around the girl, before he lifts her in the air and throws her face down onto concrete floor. Student bystanders yell “oh” in unison upon impact.\n\nREAD: Chicago Teen Called 911 Three Times Before Shot By Cop\n\n“Janissa,...",Texas Cop Body-Slams 12-Year-Old Girl,2016-04-11 10:27:58+00:00
7,7,8,http://hellogiggles.com/trans-woman-selfie-bathroom-laws/,"Sarah McBride peed the other day. But it wasn’t just any trip to the bathroom. Sarah is a transgender woman, and she chose to use a women’s restroom in North Carolina. In a government building, no less.\n\nSarah’s powerful act comes after North Carolina put HB2 into effect, a law that requires bathroom goers to use the restroom of their gender at birth, consequently discriminating against many in the LGBT community.\n\nThe 25 year-old trans woman and activist is the communications manager fo...",This trans woman just posted a very important selfie to make a point about bathroom laws,2016-04-23 08:46:16-07:00
8,8,9,http://mashable.com/2016/04/16/waitress-tip-bible-verse/,"A waitress from North Carolina claims she was left a bible verse instead of a tip earlier this week.\n\nAlexandra Judd was working at Zada Janes in Charlotte, North Carolina on Tuesday, when she claims that two patrons left her a bible verse on the tip line of their bill instead of cash — followed by a note at the bottom that says ""praying for you.""\n\nThe customer wrote ""Leviticus 20:13,"" which reads, ""If a man also lie with mankind, as he lieth with a woman, both of them have committed an ...",Gay North Carolina waitress receives bible verse instead of tip,2016-04-16 00:00:00
9,9,10,http://mashable.com/2016/04/08/capybaras-hide-from-rain/,"What's cuter than one capybara hiding from the rain? Eleven capybaras waiting the storm out together under a shed!\n\nSEE ALSO: Tokyo now has a hedgehog cafe and we want to hang out there forever\n\nThe Nagasaki Bio Park in Japan tweeted a photo of their capybaras — the largest rodent in the world — taking shelter during a downpour on Thursday, and the Internet basically couldn't handle the cuteness overload.\n\nThe tweet's caption was a dialogue made-up by the park's staff that translated f...",People are losing their cool over this pic of capybaras seeking shelter from the rain,2016-04-08 00:00:00


In [19]:
df_cl_tr

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,url,article_text,title,publish_date,commentid,clcontroversial,clmean,clinformative,...,sd_agreement,sd_type,sentiment,tone,commentagreement,topic,intendedaudience,persuasiveness,text_proc,cltopic
0,0,0,http://www.cosmopolitan.com/lifestyle/news/a56215/disneyland-paris-haunted-mansion-death/,"A Disneyland Paris employee was found dead inside the park's Phantom Manor ride on Saturday, Le Parisien reports. Investigators believe he was a maintenance officer who was fixing a faulty light fixture and accidentally electrocuted.\n\nThough he's yet to be officially identified, a Disneyland union representative told Le Parisien the worker was a 45-year-old man who ""always had a smile on his face"" and was ""very popular with his colleagues."" The park remained open after his body was found, ...",Disneyland Worker Found Dead in Haunted Mansion,2016-04-04 03:03:48,00002n000000000000000000000000-1c30b878-b717-4e9a-9872-2ce2906ce783,0,0,1,...,,Positive/respectful,neutral,Informative,,Off-topic with article,Reply to a specific commenter,Not persuasive,Yes..because too many houses in EU look like the original Disney Hunted House so it didn't look scary enough. Bates Motel looks more American and that notion alone scares everyone.,0
1,1,1,http://www.cosmopolitan.com/lifestyle/news/a56215/disneyland-paris-haunted-mansion-death/,"A Disneyland Paris employee was found dead inside the park's Phantom Manor ride on Saturday, Le Parisien reports. Investigators believe he was a maintenance officer who was fixing a faulty light fixture and accidentally electrocuted.\n\nThough he's yet to be officially identified, a Disneyland union representative told Le Parisien the worker was a 45-year-old man who ""always had a smile on his face"" and was ""very popular with his colleagues."" The park remained open after his body was found, ...",Disneyland Worker Found Dead in Haunted Mansion,2016-04-04 03:03:48,00003n000000000000000000000000-ed2ae6d0-32ac-471a-b8b2-a718607ee376,0,0,0,...,,Positive/respectful,negative,,Disagreement with commenter,Off-topic with article,Reply to a specific commenter,Not persuasive,"These things happen , Every job has its dangers.",1
2,2,2,http://www.cosmopolitan.com/lifestyle/news/a56215/disneyland-paris-haunted-mansion-death/,"A Disneyland Paris employee was found dead inside the park's Phantom Manor ride on Saturday, Le Parisien reports. Investigators believe he was a maintenance officer who was fixing a faulty light fixture and accidentally electrocuted.\n\nThough he's yet to be officially identified, a Disneyland union representative told Le Parisien the worker was a 45-year-old man who ""always had a smile on his face"" and was ""very popular with his colleagues."" The park remained open after his body was found, ...",Disneyland Worker Found Dead in Haunted Mansion,2016-04-04 03:03:48,1459879464596-a3771c05-fd2e-4f44-a26a-23baec3b4249,0,0,0,...,,Positive/respectful,mixed,,,Off-topic with article,Broadcast message / general audience,Not persuasive,"Sad to hear such a bad thing. Very dangerous job working on electricity. One questions though, why did they use a picture the Bates house from Psycho, on a Disney story? Or is that what the Paris Haunted Mansion/Phantom Manor looks like?",1
3,3,3,http://www.cosmopolitan.com/health-fitness/news/a56117/chontel-duncan-post-baby-stomach/,"Having documented her baby bump in mirror selfies throughout her whole pregnancy, it's no surprise fitness model Chontel Duncan shared a full-length selfie just five days after welcoming her baby boy. And yeah, she looks as freakin' amazing as you would expect.\n\nAdvertisement - Continue Reading Below\n\n""My body is tender, it feels like someone did 12 rounds in the boxing ring using my core as the target,"" Duncan, who delivered her son Jeremiah by C-section, wrote on Instagram. ""So much sw...",See Fitness Model Chontel Duncan's Post-Baby Belly 5 Days After Giving Birth,2016-04-01 03:17:00,00002g000000000000000000000000-c08bc8b6-ecd2-4066-bbc8-eefc29eb71b6,0,0,0,...,,,neutral,,Adjunct opinion,,Reply to a specific commenter,Not persuasive,THANK YOU!!!!!!!!!!!!!!!! So annoying.. journalism today is questionable... @USERNAME,1
4,4,4,http://www.cosmopolitan.com/health-fitness/news/a56117/chontel-duncan-post-baby-stomach/,"Having documented her baby bump in mirror selfies throughout her whole pregnancy, it's no surprise fitness model Chontel Duncan shared a full-length selfie just five days after welcoming her baby boy. And yeah, she looks as freakin' amazing as you would expect.\n\nAdvertisement - Continue Reading Below\n\n""My body is tender, it feels like someone did 12 rounds in the boxing ring using my core as the target,"" Duncan, who delivered her son Jeremiah by C-section, wrote on Instagram. ""So much sw...",See Fitness Model Chontel Duncan's Post-Baby Belly 5 Days After Giving Birth,2016-04-01 03:17:00,00003g000000000000000000000000-5f0375ec-8977-4c94-88ad-a2c665c58cdc,0,0,0,...,,,neutral,,Agreement with commenter,,Reply to a specific commenter,Not persuasive,lol i thought it's my computer,1
5,5,5,http://www.cosmopolitan.com/health-fitness/news/a56117/chontel-duncan-post-baby-stomach/,"Having documented her baby bump in mirror selfies throughout her whole pregnancy, it's no surprise fitness model Chontel Duncan shared a full-length selfie just five days after welcoming her baby boy. And yeah, she looks as freakin' amazing as you would expect.\n\nAdvertisement - Continue Reading Below\n\n""My body is tender, it feels like someone did 12 rounds in the boxing ring using my core as the target,"" Duncan, who delivered her son Jeremiah by C-section, wrote on Instagram. ""So much sw...",See Fitness Model Chontel Duncan's Post-Baby Belly 5 Days After Giving Birth,2016-04-01 03:17:00,00004n000000000000000000000000-c44d1de5-af2f-42db-9412-e2476ea1bd7b,0,1,0,...,,,negative,Mean,Disagreement with commenter,,Reply to a specific commenter,Not persuasive,It's on the Cosmopolitan article you dolt.,1
6,6,6,http://www.cosmopolitan.com/health-fitness/news/a56117/chontel-duncan-post-baby-stomach/,"Having documented her baby bump in mirror selfies throughout her whole pregnancy, it's no surprise fitness model Chontel Duncan shared a full-length selfie just five days after welcoming her baby boy. And yeah, she looks as freakin' amazing as you would expect.\n\nAdvertisement - Continue Reading Below\n\n""My body is tender, it feels like someone did 12 rounds in the boxing ring using my core as the target,"" Duncan, who delivered her son Jeremiah by C-section, wrote on Instagram. ""So much sw...",See Fitness Model Chontel Duncan's Post-Baby Belly 5 Days After Giving Birth,2016-04-01 03:17:00,00005n000000000000000000000000-d0c8ecc0-0216-4a6f-974c-f0de882e699a,0,0,0,...,,,neutral,,Adjunct opinion,,Reply to a specific commenter,Not persuasive,You have to click on the link to go to Cosmo. Yahoo has been terrible about this lately.,1
7,7,7,http://www.cosmopolitan.com/health-fitness/news/a56117/chontel-duncan-post-baby-stomach/,"Having documented her baby bump in mirror selfies throughout her whole pregnancy, it's no surprise fitness model Chontel Duncan shared a full-length selfie just five days after welcoming her baby boy. And yeah, she looks as freakin' amazing as you would expect.\n\nAdvertisement - Continue Reading Below\n\n""My body is tender, it feels like someone did 12 rounds in the boxing ring using my core as the target,"" Duncan, who delivered her son Jeremiah by C-section, wrote on Instagram. ""So much sw...",See Fitness Model Chontel Duncan's Post-Baby Belly 5 Days After Giving Birth,2016-04-01 03:17:00,1459864307079-aeebbc73-f562-4540-8576-cdcebe13f099,0,0,0,...,,,neutral,,,,Broadcast message / general audience,Not persuasive,Is it just my computer or is there not a single picture of her post-baby belly on this article? I didn't even see one on her Twitter account. Good grief.,1
8,8,8,http://www.insideedition.com/headlines/15888-english-teacher-charged-with-sexual-assault-of-a-student-youre-my-baby-boo,"Headlines Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party Playing Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party\n\n\n\n\n\n\n\nA Wisconsin high school teacher is facing charges of sexual assault following an alleged sexual relationship with a student, including on the night of her husband's bachelor's party.\n\nRead: Daycare Center Shuttered After 3-Month-Old Died On Her Mother's First Day Back at Work\n\n\n\nSara Domres, a 29...",Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party,,00002b000000000000000000000000-a22824ca-7630-4ddf-ace6-8953dd5d856f,0,0,0,...,Continual disagreement,"Snarky/humorous,Flamewar (insulting)",negative,"Controversial,Mean",Disagreement with commenter,Off-topic with article,Reply to a specific commenter,Not persuasive,why is she sick --American laws are sick -- xxnumber is not a kid,0
9,9,9,http://www.insideedition.com/headlines/15888-english-teacher-charged-with-sexual-assault-of-a-student-youre-my-baby-boo,"Headlines Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party Playing Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party\n\n\n\n\n\n\n\nA Wisconsin high school teacher is facing charges of sexual assault following an alleged sexual relationship with a student, including on the night of her husband's bachelor's party.\n\nRead: Daycare Center Shuttered After 3-Month-Old Died On Her Mother's First Day Back at Work\n\n\n\nSara Domres, a 29...",Teacher Allegedly Had Sex With Student On Night of Her Husband's Bachelor Party,,00002b000000000000000000000000-d2b4e783-777d-4cc9-98e5-d2cac8b3f4ce,1,1,0,...,,Snarky/humorous,neutral,"Informative,Controversial",,,Reply to a specific commenter,Not persuasive,"Still, the woman is the legal adult and is a moron for not taking better care of herself. I agree that the whole 'victim' thing here is vague, though. For a xxnumber year old boy, banging the hot teacher is not abuse. He's still doing high fives over it.",1


In [20]:
df_cl_out_tr = process(df_art, df_cl_tr)

Pandas Apply: 100%|██████████| 670/670 [04:27<00:00,  2.52it/s]


done with articles


Pandas Apply: 100%|██████████| 7951/7951 [00:11<00:00, 691.86it/s]


In [21]:
df_cl_out_tr.to_csv(str(OUT) + '/cl_train.csv', index=False)

In [22]:
df_cl_out_va = process(df_art, df_cl_va, already=True)

done with articles


Pandas Apply: 100%|██████████| 1167/1167 [00:00<00:00, 1421.98it/s]


In [23]:
df_cl_out_va.to_csv(str(OUT) + '/cl_val.csv', index=False)

In [3]:
test_str = 'test'
test_match= 'dsfdfsdfiiytestyii'
test_idx = test_match.index(test_str) + len(test_str)
test_match[:test_idx] + ' jaja ' + test_match[test_idx:]

'dsfdfsdfiiytest jaja yii'