# Clickbait Challenge: Merge titles

<code>  Fields in instances.jsonl:
  { 
    "id": "&lt;instance id&gt;",
    "postTimestamp": "&lt;weekday&gt; &lt;month&gt; &lt;day&gt; &lt;hour&gt;:&lt;minute&gt;:&lt;second&gt; &lt;time_offset&gt; &lt;year&gt;",
    "postText": ["&lt;text of the post with links removed&gt;"],
    "postMedia": ["&lt;path to a file in the media archive&gt;"],
    "targetTitle": "&lt;title of target article&gt;",
    "targetDescription": "&lt;description tag of target article&gt;",
    "targetKeywords": "&lt;keywords tag of target article&gt;",
    "targetParagraphs": ["&lt;text of the ith paragraph in the target article&gt;"],
    "targetCaptions": ["&lt;caption of the ith image in the target article&gt;"]
  }

  Fields in truth.jsonl:
  {
    "id": "&lt;instance id&gt;",
    "truthJudgments": [&lt;number in [0,1]&gt;],
    "truthMean": &lt;number in [0,1]&gt;,
    "truthMedian": &lt;number in [0,1]&gt;,
    "truthMode": &lt;number in [0,1]&gt;,
    "truthClass": "clickbait | no-clickbait"
  }


  Fields your classifier should write into a results.jsonl file
  {
    "id": "&lt;instance id&gt;",
    "clickbaitScore": &lt;number in [0,1]&gt;
  }    
</code>

In [1]:
import json
import os
import pandas as pd

from pandas.io.json import json_normalize
from pprint import pprint

## Set input folder

In [2]:
# Folder holding the raw Clickbait Challenge 2017 files (relative to this notebook).
data_in = '../data/00_raw/[2017] Clickbait Challenge/data'

# Show the folder contents in alphabetical order as a quick sanity check.
folder_entries = os.listdir(data_in)
folder_entries.sort()
pprint(folder_entries)

['clck.csv',
 'clck_titles.csv',
 'clck_titles_edit.csv',
 'clickbait17-dataset-schema.txt',
 'clickbait17-train-170331',
 'clickbait17-train-170331.zip',
 'clickbait17-train-170630.zip',
 'clickbait17-validation-170630',
 'news.csv',
 'news_titles.csv',
 'news_titles_edit.csv']


## Functions

In [3]:
def jsonl_to_df(file_path):
    """Load a JSON Lines file into a DataFrame indexed by the records' 'id'.

    The file is read as UTF-8 text and every non-blank line is parsed as one
    JSON object. Blank or whitespace-only lines are skipped instead of being
    handed to the JSON parser.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, with the 'id' field used as the index.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If the parsed records do not provide an 'id' column.
    """
    with open(file_path, encoding='utf-8') as f:
        # Skip empty lines (e.g. stray trailing newlines) that json.loads rejects.
        items = [json.loads(each_line) for each_line in f if each_line.strip()]

    df = pd.DataFrame(items)
    df = df.set_index('id')

    return df

## Read & merge data

In [4]:
# Each dataset release ships an instances file and a matching truth file.
train_dir = f'{data_in}/clickbait17-train-170331'
valid_dir = f'{data_in}/clickbait17-validation-170630'

train_insts = jsonl_to_df(f'{train_dir}/instances.jsonl')
train_truth = jsonl_to_df(f'{train_dir}/truth.jsonl')

valid_insts = jsonl_to_df(f'{valid_dir}/instances.jsonl')
valid_truth = jsonl_to_df(f'{valid_dir}/truth.jsonl')

In [5]:
# Attach the truth columns to the instances; both frames are indexed by 'id'.
train_df = train_insts.join(train_truth, how='left')
train_df

Unnamed: 0_level_0,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,truthClass,truthJudgments,truthMean,truthMedian,truthMode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
608310377143799810,[],[Apple's iOS 9 'App thinning' feature will giv...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...,no-clickbait,"[0.0, 0.6666667, 0.0, 0.33333334, 0.0]",0.200000,0.000000,0.000000
609297109095972864,[media/609297109095972864.jpg],[RT @kenbrown12: Emerging market investors are...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...,no-clickbait,"[0.6666667, 0.0, 0.0, 0.0, 0.0]",0.133333,0.000000,0.000000
609504474621612032,[],[U.S. Soccer should start answering tough ques...,Fri Jun 12 23:36:05 +0000 2015,[US to vote for Ali in FIFA election and not B...,A U.S. Senator's scathing letter questioned U....,,"[WINNIPEG, Manitoba – The bubble U.S. Soccer i...",U.S. Soccer should start answering tough quest...,clickbait,"[0.33333334, 0.6666667, 1.0, 0.0, 0.6666667]",0.533333,0.666667,0.666667
609748367049105409,[],[How theme parks like Disney World left the mi...,Sat Jun 13 15:45:13 +0000 2015,"[Some 1,000 persons turned out in Albuquerque,...","America's top family vacation spots, like the ...","disney, disney world, disney ticket prices, di...",[When Walt Disney World opened in an Orlando s...,How theme parks like Disney World left the mid...,no-clickbait,"[1.0, 0.0, 0.33333334, 0.33333334, 0.6666667]",0.466667,0.333333,0.333333
608688782821453825,[media/608688782821453825.jpg],[Could light bulbs hurt your health? One compa...,Wed Jun 10 17:34:49 +0000 2015,[Electric lights have made the world safer and...,One company will put a health notice on all th...,"health, Should there be warning labels on your...",[(CNN)The light bulb always makes the world's ...,Warning labels on your light bulbs,clickbait,"[1.0, 0.33333334, 0.6666667, 0.33333334, 1.0]",0.666667,0.666667,1.000000
609551038983475201,[media/609551038983475201.png],[13 classic ’00s songs that were actually mean...,Sat Jun 13 02:41:07 +0000 2015,[],,,[One artist’s trash is another’s No. 1 single....,13 Classic ’00s Songs That Were Actually Meant...,clickbait,"[0.33333334, 0.6666667, 1.0, 0.6666667, 1.0]",0.733333,0.666667,1.000000
609447408955719681,[media/609447408955719681.jpg],[Dez Bryant is reportedly considering skipping...,Fri Jun 12 19:49:19 +0000 2015,[],"Wide receiver Dez Bryant has signed a new,...","Football, NFL, NFC East, Dallas Cowboys, Dez B...","[Wide receiver Dez Bryant has signed a new, lo...","Dez Bryant Contract: Latest News, Rumors, Spec...",no-clickbait,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.000000,0.000000,0.000000
609027430624288769,[],[Pregnant mother of 12 accused of keeping kids...,Thu Jun 11 16:00:29 +0000 2015,[],A pregnant mother of 12 is due in court on chi...,,"[TULSA, Okla. – A pregnant mother of 12 is du...",Pregnant mother of 12 accused of keeping kids ...,no-clickbait,"[0.33333334, 0.0, 0.33333334, 0.33333334, 0.0]",0.200000,0.333333,0.333333
608229011572068352,[],[RT @fionamatthias: 10 ways the expat life Is ...,Tue Jun 09 11:07:51 +0000 2015,[Scotland to Seek Second Independence Referend...,There’s no autopilot when you're an expat livi...,"adventure,Alienation,Bangkok,Culture,Culture S...",[There’s no autopilot when you live abroad. No...,10 Ways the Expat Life Is Like a Continual Esp...,no-clickbait,"[1.0, 0.33333334, 0.33333334, 1.0, 0.33333334]",0.600000,0.333333,0.333333
609046214554755073,[media/609046214554755073.jpg],"[House #GOP plans two days of debate, Friday s...",Thu Jun 11 17:15:07 +0000 2015,"[Obama, Chairman Paul Ryan and Republicans act...",House Republican leaders have planned for a tw...,,[House Republican leaders have planned for a t...,"House GOP plans two days of debate, Friday sho...",no-clickbait,"[0.33333334, 0.0, 0.33333334, 0.0, 0.0]",0.133333,0.000000,0.000000


In [6]:
# Attach the truth columns to the instances; both frames are indexed by 'id'.
valid_df = valid_insts.join(valid_truth, how='left')
valid_df

Unnamed: 0_level_0,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,truthClass,truthJudgments,truthMean,truthMedian,truthMode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
858462320779026433,[],[UK’s response to modern slavery leaving victi...,Sat Apr 29 23:25:41 +0000 2017,[modern-slavery-rex.jpg],“Inexcusable” failures in the UK’s system for ...,"modern slavery, Department For Work And Pensio...",[Thousands of modern slavery victims have not ...,‘Inexcusable’ failures in UK’s response to mod...,no-clickbait,"[0.3333333333, 0.0, 0.3333333333, 0.0, 0.0]",0.133333,0.000000,0.000000
858421020331560960,[],[this is good],Sat Apr 29 20:41:34 +0000 2017,"[In this July 1, 2010 file photo, Dr. Charmain...",President Donald Trump has appointed pro-life ...,"Americans United for Life, Dr. Charmaine Yoest...",[President Donald Trump has appointed the pro-...,Donald Trump Appoints Pro-Life Advocate as Ass...,clickbait,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.000000,1.000000,1.000000
858368123753435136,[],"[The ""forgotten"" Trump roast: Relive his bruta...",Sat Apr 29 17:11:23 +0000 2017,[President Trump will not attend this year's W...,President Trump won't be at this year's White ...,"trump whcd, whcd, white house correspondents d...",[When the White House correspondents’ dinner i...,The ‘forgotten’ Trump roast: Relive his brutal...,no-clickbait,"[0.3333333333, 1.0, 0.3333333333, 0.0, 0.66666...",0.466667,0.333333,0.333333
858323428260139008,[],[Meet the happiest #dog in the world!],Sat Apr 29 14:13:46 +0000 2017,"[Maru , Maru, Maru, Maru, Maru]","The article is about Maru, a husky dog who has...","Maru, husky, dogs, pandas, furball, instagram",[Adorable is probably an understatement. This ...,"Meet The Happiest Dog In The World, Maru The H...",clickbait,"[1.0, 0.6666666666, 1.0, 1.0, 1.0]",0.933333,1.000000,1.000000
858283602626347008,[],[Tokyo's subway is shut down amid fears over a...,Sat Apr 29 11:35:31 +0000 2017,[All nine lines of Tokyo's subway system were ...,"The temporary suspension, which lasted ten min...","Tokyo,subway,shut,fears,North,Korean,attack",[One of Tokyo's major subways systems says it ...,Tokyo's subway is shut down amid fears over an...,no-clickbait,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.000000,0.000000,0.000000
858224473597779969,[],[Ban lifted on Madrid doping laboratory],Sat Apr 29 07:40:34 +0000 2017,"[Samples in an anti-doping laboratory, Anthony...",Madrid's Anti-Doping Laboratory has its suspen...,,"[Share this with, Madrid's Anti-Doping Laborat...",World Anti-Doping Agency lifts ban on Madrid l...,no-clickbait,"[0.0, 0.3333333333, 0.0, 0.0, 0.0]",0.066667,0.000000,0.000000
858222698367000577,[],"[Despite the ‘Yuck Factor,’ Leeches Are Big in...",Sat Apr 29 07:33:30 +0000 2017,"[The New York Times, Basic, All Access, Home D...",Russians use about 10 million leeches a year t...,"Drugs (Pharmaceuticals),Medical Devices,Russia...",[MOSCOW — They are small as physician assistan...,"Despite the ‘Yuck Factor,’ Leeches Are Big in ...",no-clickbait,"[1.0, 0.0, 0.3333333333, 0.0, 0.3333333333]",0.333333,0.333333,0.333333
858191667739987968,[],[#China and #Pakistan have cemented their poli...,Sat Apr 29 05:30:12 +0000 2017,[],China And Pakistan Use Biryani To Take Their F...,"Pakistan, China, Biryani, CPEC, china pakistan...",[China and Pakistan have cemented their politi...,China And Pakistan Use Biryani To Take Their F...,no-clickbait,"[0.0, 0.0, 0.0, 0.3333333333, 0.0]",0.066667,0.000000,0.000000
858153572571197440,[media/photo_858153570478305281.jpg],"[Malls are dying, but it's hard to profit from...",Sat Apr 29 02:58:50 +0000 2017,[],"The wager worked early on, but time is working...","Infrastructure,Debt,Real Estate,Fund Manager,W...",[Lisa Abramowicz is a Bloomberg Gadfly columni...,This Big Short Threatens to Be a Big Bust,no-clickbait,"[0.3333333333, 0.0, 0.6666666666, 0.3333333333...",0.333333,0.333333,0.333333
858150052782645249,[],[Filipino troops kill notorious Abu Sayyaf kid...,Sat Apr 29 02:44:50 +0000 2017,[],Philippine marines have killed an Abu Sayyaf e...,,"[MANILA, Philippines – Philippine marines hav...",Filipino troops kill notorious Abu Sayyaf kidn...,no-clickbait,"[0.0, 0.0, 0.0, 0.6666666666, 0.0]",0.133333,0.000000,0.000000


## Split into news and clickbait

### clickbait17-train-170331

In [7]:
# Distinct class labels present in the training set.
set(train_df.loc[:, 'truthClass'])

{'clickbait', 'no-clickbait'}

In [8]:
# Keep only the training rows labelled as clickbait.
train_df_clck = train_df.loc[train_df['truthClass'].eq('clickbait')]
train_df_clck.shape

(762, 13)

In [9]:
# Keep only the training rows labelled as regular news (no-clickbait).
train_df_news = train_df.loc[train_df['truthClass'].eq('no-clickbait')]
train_df_news.shape

(1697, 13)

### clickbait17-validation-170630

In [10]:
# Distinct class labels present in the validation set.
set(valid_df.loc[:, 'truthClass'])

{'clickbait', 'no-clickbait'}

In [11]:
# Keep only the validation rows labelled as clickbait.
valid_df_clck = valid_df.loc[valid_df['truthClass'].eq('clickbait')]
valid_df_clck.shape

(4761, 13)

In [12]:
# Keep only the validation rows labelled as regular news (no-clickbait).
valid_df_news = valid_df.loc[valid_df['truthClass'].eq('no-clickbait')]
valid_df_news.shape

(14777, 13)

## Merge clickbaits/no-clickbaits

In [13]:
# Stack the clickbait rows of both releases. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same row-wise concatenation.
clck_df = pd.concat([train_df_clck, valid_df_clck])
# Trim leading/trailing whitespace from the article titles.
clck_df['targetTitle'] = clck_df['targetTitle'].str.strip()

print(len(clck_df['targetTitle']))

5523


In [14]:
# Stack the no-clickbait rows of both releases. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same row-wise concatenation.
news_df = pd.concat([train_df_news, valid_df_news])
# Trim leading/trailing whitespace from the article titles.
news_df['targetTitle'] = news_df['targetTitle'].str.strip()

print(len(news_df['targetTitle']))

16474


## Output data to csv

In [15]:
# Full clickbait table, including the 'id' index and a header row.
clck_df.to_csv(f'{data_in}/clck.csv')
# Titles only, one per row: no index column and no header line.
# to_csv documents booleans for these flags, so pass False rather than None.
clck_df['targetTitle'].to_csv(f'{data_in}/00_cc2017_clck.csv',
                              index=False, header=False)

In [16]:
# Full no-clickbait table, including the 'id' index and a header row.
news_df.to_csv(f'{data_in}/news.csv')
# Titles only, one per row: no index column and no header line.
# to_csv documents booleans for these flags, so pass False rather than None.
news_df['targetTitle'].to_csv(f'{data_in}/00_cc2017_news.csv',
                              index=False, header=False)