In [1]:
from pathlib import Path
import pickle
import re
from functools import partial
from collections import Counter

from tqdm import tqdm, tqdm_notebook
from multiprocessing import cpu_count, Pool

import numpy as np
import pandas as pd

import spacy
import ftfy
import yake
from nltk.tokenize import TweetTokenizer

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Widen text columns in rendered frames so long titles/articles stay readable.
pd.set_option('display.max_colwidth', 200)  # default - 50

In [3]:
# Directory that the CSV files in this notebook are read from and written to.
DATA_PATH = Path('../data')
RANDOM_SEED = 17

# Register `progress_apply` on pandas objects. Called on the class: the
# previous `tqdm().pandas()` first built a throwaway progress-bar instance
# (the stray "0it" bar) that was never closed.
tqdm.pandas()

0it [00:00, ?it/s]


**Load Data**

In [4]:
# Read the four raw splits from DATA_PATH, in train/valid/test/unlabeled order.
train_df, valid_df, test_df, unlabeled_df = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('train.csv', 'valid.csv', 'test.csv', 'unlabeled.csv')
)

In [5]:
# Names of the two free-text columns; the cells below process 'title' and 'text' column by column.
text_cols = ['title', 'text']

**Clean Data**

In [15]:
def parallelize(data, func, cores=None):
    """Apply ``func`` to consecutive chunks of ``data`` and concatenate the results.

    Parameters
    ----------
    data : pandas Series/DataFrame, or anything ``np.array_split`` accepts
        Input that is cut into ``cores`` consecutive chunks.
    func : callable
        Called once per chunk. It must return a pandas object, because the
        per-chunk results are combined with ``pd.concat``. When more than one
        core is used it is dispatched through ``Pool.map``.
    cores : int, optional
        Number of chunks and of worker processes. Defaults to ``cpu_count()``.

    Returns
    -------
    The ``pd.concat`` of the per-chunk results, in chunk order.

    Raises
    ------
    ValueError
        If ``cores`` is smaller than 1.
    """
    if cores is None:
        cores = cpu_count()
    partitions = cores
    if partitions < 1:
        raise ValueError('number of cores must be at least 1')

    if hasattr(data, 'iloc'):
        # Slice pandas objects positionally instead of handing them to
        # np.array_split, which relies on the deprecated pandas `swapaxes`.
        # Chunk sizes match np.array_split: the first `extra` chunks get one
        # additional row.
        size, extra = divmod(len(data), partitions)
        data_split, start = [], 0
        for i in range(partitions):
            stop = start + size + (1 if i < extra else 0)
            data_split.append(data.iloc[start:stop])
            start = stop
    else:
        data_split = np.array_split(data, partitions)

    if cores == 1:
        # A single chunk gains nothing from a worker process; run it inline.
        return pd.concat([func(chunk) for chunk in data_split])

    pool = Pool(cores)
    try:
        results = pool.map(func, data_split)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [7]:
# Replace every missing value in each split with the placeholder token 'xxempty'.
X_train, X_valid, X_test, X_unlabeled = (
    raw.fillna('xxempty') for raw in (train_df, valid_df, test_df, unlabeled_df)
)

**Fix unicode characters**

In [9]:
def parallel_fix_text(s):
    """Run ``fix_text`` over every element of ``s`` via ``progress_apply``.

    Used as the per-chunk worker handed to ``parallelize``.
    """
    fixed = s.progress_apply(fix_text)
    return fixed

def fix_text(doc):
    """Return ``doc`` repaired by ``ftfy.fix_text`` with NFKD normalization requested."""
    repaired = ftfy.fix_text(doc, normalization='NFKD')
    return repaired

In [10]:
# ftfy-fixed training split: both text columns are repaired in parallel,
# the label column is carried over unchanged.
X_train_ftfy = pd.DataFrame({
    'title': parallelize(X_train['title'], parallel_fix_text),
    'text': parallelize(X_train['text'], parallel_fix_text),
    'label': X_train['label'],
})
X_train_ftfy.head()

100%|██████████| 3109/3109 [00:00<00:00, 18082.88it/s]
100%|██████████| 3109/3109 [00:00<00:00, 14655.11it/s]
100%|██████████| 3109/3109 [00:00<00:00, 13585.30it/s]
100%|██████████| 3109/3109 [00:00<00:00, 14463.69it/s]
100%|██████████| 3109/3109 [00:00<00:00, 13059.85it/s]
100%|██████████| 3108/3108 [00:00<00:00, 15450.81it/s]
100%|██████████| 3109/3109 [00:00<00:00, 11670.22it/s]
100%|██████████| 3109/3109 [00:00<00:00, 10584.75it/s]
100%|██████████| 3108/3108 [00:14<00:00, 220.74it/s]
100%|██████████| 3109/3109 [00:14<00:00, 220.35it/s]
100%|██████████| 3109/3109 [00:14<00:00, 210.16it/s]
100%|██████████| 3109/3109 [00:21<00:00, 145.86it/s]
100%|██████████| 3109/3109 [00:22<00:00, 137.96it/s]
100%|██████████| 3109/3109 [00:22<00:00, 136.78it/s]
100%|██████████| 3109/3109 [00:22<00:00, 136.31it/s]
100%|██████████| 3109/3109 [00:23<00:00, 135.05it/s]


Unnamed: 0,title,text,label
0,China and Economic Reform: Xi Jinping's Track Record,Economists generally agree: China must overhaul its huge but wasteful economy if it wants to continue to grow in the years to come. That means limiting political interference in banking and the fi...,news
1,Trade to Be a Big Topic in Theresa May's U.S. Visit,"LONDON—British Prime Minister Theresa May said she'll discuss trade and security in a coming meeting with President Donald Trump, his first visit from a foreign leader as president, underscoring t...",news
2,"The Top Beaches In The World, According To National Geographic","Beaches come in all sorts of shapes and sizes beyond the typical Caribbean postcard. As such, National Geographic's new list of the Top 21 Beaches in the World includes a diverse mix of shorelines...",clickbait
3,"Sheriff's Report Provides New Details on Tamir Rice's Death, but Leaves Questions","A timeline of what happened after Tamir Rice, a 12-year-old boy, was killed by a police officer in Cleveland last November. A lengthy report published Saturday revealed new details about the fatal...",clickbait
4,Surgeon claiming he will transplant volunteer's HEAD to another body says he needs America's help to do it,An Italian neurosurgeon who has claimed for months that he will perform the world's first human head transplant asked Americans to 'be Americans' and donate to his cause. Dr. Sergio Canavero's pl...,news


In [11]:
# ftfy-fixed validation split: both text columns are repaired in parallel,
# the label column is carried over unchanged.
X_valid_ftfy = pd.DataFrame({
    'title': parallelize(X_valid['title'], parallel_fix_text),
    'text': parallelize(X_valid['text'], parallel_fix_text),
    'label': X_valid['label'],
})
X_valid_ftfy.head()

100%|██████████| 444/444 [00:00<00:00, 10786.89it/s]
100%|██████████| 444/444 [00:00<00:00, 7868.94it/s]
100%|██████████| 444/444 [00:00<00:00, 10544.42it/s]
100%|██████████| 444/444 [00:00<00:00, 7532.00it/s]
100%|██████████| 444/444 [00:00<00:00, 9086.86it/s]
100%|██████████| 444/444 [00:00<00:00, 7266.21it/s]
100%|██████████| 444/444 [00:00<00:00, 7397.21it/s]
100%|██████████| 444/444 [00:00<00:00, 10311.98it/s]
100%|██████████| 444/444 [00:01<00:00, 229.51it/s]
100%|██████████| 444/444 [00:02<00:00, 219.00it/s]
100%|██████████| 444/444 [00:02<00:00, 214.51it/s]
100%|██████████| 444/444 [00:03<00:00, 141.89it/s]
100%|██████████| 444/444 [00:03<00:00, 136.87it/s]
100%|██████████| 444/444 [00:03<00:00, 136.14it/s]
100%|██████████| 444/444 [00:03<00:00, 131.78it/s]
100%|██████████| 444/444 [00:03<00:00, 129.41it/s]


Unnamed: 0,title,text,label
0,Trump says he is releasing something 'phenomenal in terms of tax' in 2 to 3 weeks,"Bob Bryan, Business Insider 9.02.2017, 16:25 182 facebook linkedin twitter email print ""We're going to be announcing something over the next, I would say, two or three weeks that will be phenomena...",news
1,Fidel Castro's ashes make their final journey across Cuba,Cubans have been lining the streets from Havana to Santiago to watch Fidel Castro's ashes make their final journey. The route is the reverse of that taken by Castro and his rebels to mark their vi...,news
2,Obama Administration Sending $500 Million to Global Climate Change Fund,"WASHINGTON—The Obama administration announced Tuesday it is funneling $500 million to a global fund to address climate change, one of the final acts President Barack Obama is taking to preserve hi...",news
3,Insurers Are Worried About The House GOP Health Care Bill,"The main industry groups representing health insurance companies have broken their silence on the Republican bill to repeal the Affordable Care Act and ""replace"" it with smaller reforms. America's...",news
4,Kobe Bryant and Nike Form Youth Basketball 'Mamba League' in Los Angeles,"A year after Kobe Bryant concluded his NBA career, Bryant and Nike have announced a partnership to establish a youth basketball program in Los Angeles. The Mamba League will instruct boys and gir...",news


In [12]:
# ftfy-fixed test split; only the two text columns are kept.
X_test_ftfy = pd.DataFrame({
    'title': parallelize(X_test['title'], parallel_fix_text),
    'text': parallelize(X_test['text'], parallel_fix_text),
})
X_test_ftfy.head()

100%|██████████| 705/705 [00:00<00:00, 10488.55it/s]
100%|██████████| 706/706 [00:00<00:00, 8087.60it/s]
100%|██████████| 706/706 [00:00<00:00, 7889.34it/s]
100%|██████████| 706/706 [00:00<00:00, 7485.08it/s]
100%|██████████| 706/706 [00:00<00:00, 7551.21it/s]
100%|██████████| 706/706 [00:00<00:00, 6751.76it/s]
100%|██████████| 706/706 [00:00<00:00, 7701.94it/s]
100%|██████████| 706/706 [00:00<00:00, 8448.22it/s]
 85%|████████▍ | 596/705 [00:05<00:01, 89.39it/s]
100%|██████████| 706/706 [00:05<00:00, 123.83it/s]
100%|██████████| 706/706 [00:05<00:00, 118.37it/s]
100%|██████████| 706/706 [00:05<00:00, 122.29it/s]
100%|██████████| 706/706 [00:05<00:00, 122.59it/s]
100%|██████████| 706/706 [00:05<00:00, 118.04it/s]
100%|██████████| 706/706 [00:06<00:00, 116.20it/s]
100%|██████████| 705/705 [00:06<00:00, 113.44it/s]


Unnamed: 0,title,text
0,Amazon CEO Jeff Bezos is now the second richest man in the world,"More Try Yahoo Finance on Firefox » Amazon CEO Jeff Bezos is now the second richest man in the world after overtaking Amancio Ortega and Warren Buffett, according to Bloomberg's Billionaires Index..."
1,Does Laura Dern Handle a Lightsaber in the New 'Star Wars'? [Video],"More Laura Dern seems to be everywhere these days. That's because she is. She's the ferocious Renata in Big Little Lies, she's a recovering drug addict in Wilson, and she has two top-secret roles ..."
2,"In this photographer's home town, stepping out of the house is a risk","Kirkuk is a city of Northern Iraq in the Kurdish region of the country. Arabs, Kurds, Turkmen, Christians and foreign workers live beside one another. Back in the day, Saddam Hussein initiated sev..."
3,"8 Ways To Get Your Spouse To Open Up More, According To Therapists","Experts say that communication is the cornerstone of a good relationship. That's why it can be deeply troubling when your partner is closed off and guarded. How do you get them to open up? Below,..."
4,US says claim it supported IS in Syria is 'ludicrous',"Share this with Email Facebook Messenger Messenger Twitter Pinterest WhatsApp LinkedIn Copy this link The US has described as ""ludicrous"" a claim by Turkish President Recep Tayyip Erdogan that it ..."


In [204]:
# ftfy-fixed unlabeled split; only the two text columns are kept.
X_unlabeled_ftfy = pd.DataFrame({
    'title': parallelize(X_unlabeled['title'], parallel_fix_text),
    'text': parallelize(X_unlabeled['text'], parallel_fix_text),
})
X_unlabeled_ftfy.head()

100%|██████████| 10002/10002 [00:01<00:00, 8659.38it/s]
100%|██████████| 10002/10002 [00:01<00:00, 8641.88it/s]
100%|██████████| 10002/10002 [00:01<00:00, 8243.80it/s]
100%|██████████| 10001/10001 [00:01<00:00, 8298.93it/s]
100%|██████████| 10002/10002 [00:01<00:00, 8092.99it/s]
100%|██████████| 10001/10001 [00:01<00:00, 8355.46it/s]
100%|██████████| 10001/10001 [00:01<00:00, 7953.47it/s]
100%|██████████| 10002/10002 [00:01<00:00, 7913.31it/s]
100%|██████████| 10002/10002 [01:31<00:00, 109.29it/s]
100%|██████████| 10002/10002 [01:32<00:00, 108.24it/s]
100%|██████████| 10002/10002 [01:32<00:00, 108.48it/s]
100%|██████████| 10001/10001 [01:32<00:00, 108.42it/s]
100%|██████████| 10002/10002 [01:33<00:00, 107.41it/s]
100%|██████████| 10001/10001 [01:33<00:00, 107.39it/s]
100%|██████████| 10002/10002 [01:34<00:00, 106.37it/s]
100%|██████████| 10001/10001 [01:33<00:00, 106.85it/s]


Unnamed: 0,title,text
0,"Freida Pinto, Allison Williams, Kuoth Wiel are a dream in David O. Russell's Prada film","Is David O. Russell's Prada short film the future of fashion advertising? Can the artistry of film sell clothing better and entertain? Let's hope so. Past Forward, an 18-minute short film out toda..."
1,India's Cash Crisis Isn't Modi's Only Headache - Bloomberg,"Connecting decision makers to a dynamic network of information, people and ideas, Bloomberg quickly and accurately delivers business and financial information, news and insight around the world. A..."
2,"12 years after her murder, DNA from teen victim's fingernails leads to suspect","NEW YORK – Ten years after her murder, the DNA beneath a teenager's fingernails has finally resulted in an indictment, reportsCBS New York. On Feb. 11, 2004, the naked body of 17-year-old Sharabia..."
3,"Thanks Kejriwal, Delhi Is Number 1 Now, On The List Of World's Most Polluted Cities!","Hey, Arvind Kejriwal, you did it - New Delhi is number 1. The number 1 most polluted city in the world. Air Quality Index measurements from the US Embassy clocked Delhi at 999 on Monday - which ..."
4,"""You heard it here first"": Man predicted Cubs World Series win in 1993 yearbook","It's a quote that has stuck with Los Angeles Dodgers fan Marcos Meza for decades: ""Chicago Cubs. 2016. World Champions. You heard it here first."" His classmate Mike Lee, a die-hard Chicago Cubs fa..."


**Save/Load Clean Data**

In [205]:
# Persist the ftfy-fixed splits under DATA_PATH (row index is not written).
X_train_ftfy.to_csv(DATA_PATH.joinpath('X_train_ftfy_nfkd.csv'), index=False)
X_valid_ftfy.to_csv(DATA_PATH.joinpath('X_valid_ftfy_nfkd.csv'), index=False)
X_test_ftfy.to_csv(DATA_PATH.joinpath('X_test_ftfy_nfkd.csv'), index=False)
X_unlabeled_ftfy.to_csv(DATA_PATH.joinpath('X_unlabeled_ftfy_nfkd.csv'), index=False)

In [51]:
# Reload the ftfy-fixed splits written by the previous cell.
X_train_ftfy, X_valid_ftfy, X_test_ftfy, X_unlabeled_ftfy = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in (
        'X_train_ftfy_nfkd.csv',
        'X_valid_ftfy_nfkd.csv',
        'X_test_ftfy_nfkd.csv',
        'X_unlabeled_ftfy_nfkd.csv',
    )
)

**Tokenize Data (Spacy)**

In [12]:
def parallel_tokenize(s):
    """Run ``tokenize`` over every element of ``s`` via ``progress_apply``.

    Used as the per-chunk worker handed to ``parallelize``.
    """
    tokenized = s.progress_apply(tokenize)
    return tokenized

def tokenize(doc):
    """Return the texts of the spaCy tokens of ``doc`` joined by single spaces."""
    pieces = (token.text for token in nlp(doc))
    return ' '.join(pieces)

In [9]:
# spaCy pipeline called by the spaCy-based tokenize(), which reads nothing but
# `token.text`; the parser, tagger and NER components are disabled here.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])

In [13]:
# spaCy-tokenized training split built from the ftfy-fixed frame; labels are copied as-is.
X_train_spacy = pd.DataFrame({
    'title': parallelize(X_train_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_train_ftfy['text'], parallel_tokenize),
    'label': X_train_ftfy['label'],
})
X_train_spacy.head()

100%|██████████| 3108/3108 [00:00<00:00, 5507.70it/s]
100%|██████████| 3109/3109 [00:00<00:00, 3987.86it/s]
100%|██████████| 3109/3109 [00:00<00:00, 3965.41it/s]
100%|██████████| 3109/3109 [00:00<00:00, 4745.25it/s]
100%|██████████| 3109/3109 [00:00<00:00, 4692.17it/s]
100%|██████████| 3109/3109 [00:00<00:00, 3513.15it/s]
100%|██████████| 3109/3109 [00:00<00:00, 3274.74it/s]
100%|██████████| 3109/3109 [00:00<00:00, 3151.26it/s]
100%|██████████| 3109/3109 [00:09<00:00, 332.25it/s]
100%|██████████| 3109/3109 [00:09<00:00, 333.53it/s]
100%|██████████| 3108/3108 [00:09<00:00, 323.57it/s]
100%|██████████| 3109/3109 [00:14<00:00, 212.36it/s]
100%|██████████| 3109/3109 [00:15<00:00, 202.81it/s]
100%|██████████| 3109/3109 [00:15<00:00, 203.33it/s]
100%|██████████| 3109/3109 [00:15<00:00, 202.63it/s]
100%|██████████| 3109/3109 [00:15<00:00, 200.81it/s]


Unnamed: 0,title,text,label
0,China and Economic Reform : Xi Jinping 's Track Record,Economists generally agree : China must overhaul its huge but wasteful economy if it wants to continue to grow in the years to come . That means limiting political interference in banking and the ...,news
1,Trade to Be a Big Topic in Theresa May 's U.S. Visit,"LONDON — British Prime Minister Theresa May said she 'll discuss trade and security in a coming meeting with President Donald Trump , his first visit from a foreign leader as president , underscor...",news
2,"The Top Beaches In The World , According To National Geographic","Beaches come in all sorts of shapes and sizes beyond the typical Caribbean postcard . As such , National Geographic 's new list of the Top 21 Beaches in the World includes a diverse mix of shoreli...",clickbait
3,"Sheriff 's Report Provides New Details on Tamir Rice 's Death , but Leaves Questions","A timeline of what happened after Tamir Rice , a 12-year - old boy , was killed by a police officer in Cleveland last November . A lengthy report published Saturday revealed new details about the ...",clickbait
4,Surgeon claiming he will transplant volunteer 's HEAD to another body says he needs America 's help to do it,An Italian neurosurgeon who has claimed for months that he will perform the world 's first human head transplant asked Americans to ' be Americans ' and donate to his cause . Dr. Sergio Canavero...,news


In [14]:
# spaCy-tokenized validation split built from the ftfy-fixed frame; labels are copied as-is.
X_valid_spacy = pd.DataFrame({
    'title': parallelize(X_valid_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_valid_ftfy['text'], parallel_tokenize),
    'label': X_valid_ftfy['label'],
})
X_valid_spacy.head()

100%|██████████| 444/444 [00:00<00:00, 3889.15it/s]
100%|██████████| 444/444 [00:00<00:00, 2282.53it/s]
100%|██████████| 444/444 [00:00<00:00, 3143.65it/s]
100%|██████████| 444/444 [00:00<00:00, 2064.29it/s]
100%|██████████| 444/444 [00:00<00:00, 2024.77it/s]
100%|██████████| 444/444 [00:00<00:00, 2051.03it/s]
100%|██████████| 444/444 [00:00<00:00, 1905.73it/s]
100%|██████████| 444/444 [00:00<00:00, 2407.08it/s]
100%|██████████| 444/444 [00:01<00:00, 226.46it/s]
100%|██████████| 444/444 [00:02<00:00, 208.03it/s]
100%|██████████| 444/444 [00:02<00:00, 200.40it/s]
100%|██████████| 444/444 [00:03<00:00, 131.93it/s]
100%|██████████| 444/444 [00:03<00:00, 130.87it/s]
100%|██████████| 444/444 [00:03<00:00, 129.44it/s]
100%|██████████| 444/444 [00:03<00:00, 127.30it/s]
100%|██████████| 444/444 [00:03<00:00, 127.25it/s]


Unnamed: 0,title,text,label
0,Trump says he is releasing something ' phenomenal in terms of tax ' in 2 to 3 weeks,"Bob Bryan , Business Insider 9.02.2017 , 16:25 182 facebook linkedin twitter email print "" We 're going to be announcing something over the next , I would say , two or three weeks that will be phe...",news
1,Fidel Castro 's ashes make their final journey across Cuba,Cubans have been lining the streets from Havana to Santiago to watch Fidel Castro 's ashes make their final journey . The route is the reverse of that taken by Castro and his rebels to mark their ...,news
2,Obama Administration Sending $ 500 Million to Global Climate Change Fund,"WASHINGTON — The Obama administration announced Tuesday it is funneling $ 500 million to a global fund to address climate change , one of the final acts President Barack Obama is taking to preserv...",news
3,Insurers Are Worried About The House GOP Health Care Bill,"The main industry groups representing health insurance companies have broken their silence on the Republican bill to repeal the Affordable Care Act and "" replace "" it with smaller reforms . Americ...",news
4,Kobe Bryant and Nike Form Youth Basketball ' Mamba League ' in Los Angeles,"A year after Kobe Bryant concluded his NBA career , Bryant and Nike have announced a partnership to establish a youth basketball program in Los Angeles . The Mamba League will instruct boys and ...",news


In [15]:
# spaCy-tokenized test split built from the ftfy-fixed frame (text columns only).
X_test_spacy = pd.DataFrame({
    'title': parallelize(X_test_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_test_ftfy['text'], parallel_tokenize),
})
X_test_spacy.head()

100%|██████████| 706/706 [00:00<00:00, 2408.21it/s]
100%|██████████| 706/706 [00:00<00:00, 2204.08it/s]
100%|██████████| 706/706 [00:00<00:00, 2132.28it/s]
100%|██████████| 706/706 [00:00<00:00, 2123.49it/s]
100%|██████████| 706/706 [00:00<00:00, 2095.95it/s]
100%|██████████| 705/705 [00:00<00:00, 2118.06it/s]
100%|██████████| 706/706 [00:00<00:00, 2099.61it/s]
100%|██████████| 706/706 [00:00<00:00, 1814.95it/s]
100%|██████████| 706/706 [00:04<00:00, 143.15it/s]
100%|██████████| 706/706 [00:05<00:00, 154.75it/s]
100%|██████████| 706/706 [00:05<00:00, 136.16it/s]
100%|██████████| 706/706 [00:05<00:00, 133.30it/s]
100%|██████████| 706/706 [00:05<00:00, 133.02it/s]
100%|██████████| 705/705 [00:05<00:00, 131.19it/s]
100%|██████████| 706/706 [00:05<00:00, 126.55it/s]
100%|██████████| 706/706 [00:05<00:00, 126.87it/s]


Unnamed: 0,title,text
0,Amazon CEO Jeff Bezos is now the second richest man in the world,"More Try Yahoo Finance on Firefox » Amazon CEO Jeff Bezos is now the second richest man in the world after overtaking Amancio Ortega and Warren Buffett , according to Bloomberg 's Billionaires Ind..."
1,Does Laura Dern Handle a Lightsaber in the New ' Star Wars ' ? [ Video ],"More Laura Dern seems to be everywhere these days . That 's because she is . She 's the ferocious Renata in Big Little Lies , she 's a recovering drug addict in Wilson , and she has two top - secr..."
2,"In this photographer 's home town , stepping out of the house is a risk","Kirkuk is a city of Northern Iraq in the Kurdish region of the country . Arabs , Kurds , Turkmen , Christians and foreign workers live beside one another . Back in the day , Saddam Hussein initiat..."
3,"8 Ways To Get Your Spouse To Open Up More , According To Therapists",Experts say that communication is the cornerstone of a good relationship . That 's why it can be deeply troubling when your partner is closed off and guarded . How do you get them to open up ? B...
4,US says claim it supported IS in Syria is ' ludicrous ',"Share this with Email Facebook Messenger Messenger Twitter Pinterest WhatsApp LinkedIn Copy this link The US has described as "" ludicrous "" a claim by Turkish President Recep Tayyip Erdogan that i..."


In [16]:
# spaCy-tokenized unlabeled split built from the ftfy-fixed frame (text columns only).
X_unlabeled_spacy = pd.DataFrame({
    'title': parallelize(X_unlabeled_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_unlabeled_ftfy['text'], parallel_tokenize),
})
X_unlabeled_spacy.head()

100%|██████████| 10002/10002 [00:02<00:00, 4272.57it/s]
100%|██████████| 10002/10002 [00:02<00:00, 4090.24it/s]
100%|██████████| 10001/10001 [00:02<00:00, 4123.13it/s]
100%|██████████| 10002/10002 [00:02<00:00, 4040.90it/s]
100%|██████████| 10002/10002 [00:02<00:00, 4025.95it/s]
100%|██████████| 10002/10002 [00:02<00:00, 3972.03it/s]
100%|██████████| 10001/10001 [00:02<00:00, 3973.53it/s]
100%|██████████| 10001/10001 [00:02<00:00, 3861.33it/s]
100%|██████████| 10002/10002 [00:44<00:00, 224.59it/s]
100%|██████████| 10002/10002 [00:45<00:00, 221.42it/s]
100%|██████████| 10002/10002 [00:45<00:00, 218.62it/s]
100%|██████████| 10002/10002 [00:45<00:00, 219.13it/s]
100%|██████████| 10001/10001 [00:44<00:00, 222.69it/s]
100%|██████████| 10001/10001 [00:45<00:00, 219.36it/s]
100%|██████████| 10002/10002 [00:46<00:00, 217.29it/s]
100%|██████████| 10001/10001 [00:45<00:00, 218.64it/s]


Unnamed: 0,title,text
0,"Freida Pinto , Allison Williams , Kuoth Wiel are a dream in David O. Russell 's Prada film","Is David O. Russell 's Prada short film the future of fashion advertising ? Can the artistry of film sell clothing better and entertain ? Let 's hope so . Past Forward , an 18-minute short film ou..."
1,India 's Cash Crisis Is n't Modi 's Only Headache - Bloomberg,"Connecting decision makers to a dynamic network of information , people and ideas , Bloomberg quickly and accurately delivers business and financial information , news and insight around the world..."
2,"12 years after her murder , DNA from teen victim 's fingernails leads to suspect","NEW YORK – Ten years after her murder , the DNA beneath a teenager 's fingernails has finally resulted in an indictment , reportsCBS New York . On Feb. 11 , 2004 , the naked body of 17-year - old ..."
3,"Thanks Kejriwal , Delhi Is Number 1 Now , On The List Of World 's Most Polluted Cities !","Hey , Arvind Kejriwal , you did it - New Delhi is number 1 . The number 1 most polluted city in the world . Air Quality Index measurements from the US Embassy clocked Delhi at 999 on Monday - ..."
4,""" You heard it here first "" : Man predicted Cubs World Series win in 1993 yearbook","It 's a quote that has stuck with Los Angeles Dodgers fan Marcos Meza for decades : "" Chicago Cubs . 2016 . World Champions . You heard it here first . "" His classmate Mike Lee , a die - hard Chic..."


**Save/Load Tokenized Data (Spacy)**

In [17]:
# Persist the spaCy-tokenized splits under DATA_PATH (row index is not written).
X_train_spacy.to_csv(DATA_PATH.joinpath('X_train_ftfy_nfkd_spacy.csv'), index=False)
X_valid_spacy.to_csv(DATA_PATH.joinpath('X_valid_ftfy_nfkd_spacy.csv'), index=False)
X_test_spacy.to_csv(DATA_PATH.joinpath('X_test_ftfy_nfkd_spacy.csv'), index=False)
X_unlabeled_spacy.to_csv(DATA_PATH.joinpath('X_unlabeled_ftfy_nfkd_spacy.csv'), index=False)

In [211]:
# Reload the spaCy-tokenized splits written by the previous cell.
X_train_spacy, X_valid_spacy, X_test_spacy, X_unlabeled_spacy = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in (
        'X_train_ftfy_nfkd_spacy.csv',
        'X_valid_ftfy_nfkd_spacy.csv',
        'X_test_ftfy_nfkd_spacy.csv',
        'X_unlabeled_ftfy_nfkd_spacy.csv',
    )
)

**Tokenize Data (nltk)**

In [212]:
# NLTK TweetTokenizer instance used by the nltk-based tokenize() defined below.
# reduce_len=True — presumably shortens long runs of repeated characters; confirm in the NLTK docs.
tokenizer = TweetTokenizer(reduce_len=True)
tokenizer

<nltk.tokenize.casual.TweetTokenizer at 0x7f2361b21080>

In [213]:
def parallel_tokenize(s):
    """Run the current ``tokenize`` over every element of ``s`` via ``progress_apply``.

    Note: this re-definition shadows the identically named function from the
    spaCy section above.
    """
    tokenized = s.progress_apply(tokenize)
    return tokenized

def tokenize(doc):
    """Return the TweetTokenizer tokens of ``doc`` joined by single spaces.

    Note: this re-definition shadows the spaCy-based ``tokenize`` above.
    """
    tokens = tokenizer.tokenize(doc)
    return ' '.join(tokens)

In [214]:
# TweetTokenizer-tokenized training split built from the ftfy-fixed frame; labels are copied as-is.
X_train_tweet = pd.DataFrame({
    'title': parallelize(X_train_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_train_ftfy['text'], parallel_tokenize),
    'label': X_train_ftfy['label'],
})
X_train_tweet.head()

100%|██████████| 3108/3108 [00:00<00:00, 40842.60it/s]
100%|██████████| 3109/3109 [00:00<00:00, 33512.78it/s]
100%|██████████| 3109/3109 [00:00<00:00, 21685.74it/s]
100%|██████████| 3109/3109 [00:00<00:00, 25469.13it/s]
100%|██████████| 3109/3109 [00:00<00:00, 16283.50it/s]
100%|██████████| 3109/3109 [00:00<00:00, 15651.38it/s]
100%|██████████| 3109/3109 [00:00<00:00, 14845.19it/s]
100%|██████████| 3109/3109 [00:00<00:00, 13404.84it/s]
100%|██████████| 3109/3109 [00:05<00:00, 606.46it/s]
100%|██████████| 3108/3108 [00:05<00:00, 560.26it/s]
100%|██████████| 3109/3109 [00:05<00:00, 527.38it/s]
100%|██████████| 3109/3109 [00:09<00:00, 344.98it/s]
100%|██████████| 3109/3109 [00:09<00:00, 340.18it/s]
100%|██████████| 3109/3109 [00:09<00:00, 321.72it/s]
100%|██████████| 3109/3109 [00:09<00:00, 321.86it/s]
100%|██████████| 3109/3109 [00:09<00:00, 313.14it/s]


Unnamed: 0,title,text,label
0,China and Economic Reform : Xi Jinping's Track Record,Economists generally agree : China must overhaul its huge but wasteful economy if it wants to continue to grow in the years to come . That means limiting political interference in banking and the ...,news
1,Trade to Be a Big Topic in Theresa May's U . S . Visit,"LONDON — British Prime Minister Theresa May said she'll discuss trade and security in a coming meeting with President Donald Trump , his first visit from a foreign leader as president , underscori...",news
2,"The Top Beaches In The World , According To National Geographic","Beaches come in all sorts of shapes and sizes beyond the typical Caribbean postcard . As such , National Geographic's new list of the Top 21 Beaches in the World includes a diverse mix of shorelin...",clickbait
3,"Sheriff's Report Provides New Details on Tamir Rice's Death , but Leaves Questions","A timeline of what happened after Tamir Rice , a 12 - year-old boy , was killed by a police officer in Cleveland last November . A lengthy report published Saturday revealed new details about the ...",clickbait
4,Surgeon claiming he will transplant volunteer's HEAD to another body says he needs America's help to do it,An Italian neurosurgeon who has claimed for months that he will perform the world's first human head transplant asked Americans to ' be Americans ' and donate to his cause . Dr . Sergio Canavero's...,news


In [30]:
# TweetTokenizer-tokenized validation split built from the ftfy-fixed frame; labels are copied as-is.
X_valid_tweet = pd.DataFrame({
    'title': parallelize(X_valid_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_valid_ftfy['text'], parallel_tokenize),
    'label': X_valid_ftfy['label'],
})
X_valid_tweet.head()

100%|██████████| 444/444 [00:00<00:00, 19525.98it/s]
100%|██████████| 444/444 [00:00<00:00, 15688.23it/s]
100%|██████████| 444/444 [00:00<00:00, 11326.17it/s]
100%|██████████| 444/444 [00:00<00:00, 9776.16it/s]
100%|██████████| 444/444 [00:00<00:00, 10578.50it/s]
100%|██████████| 444/444 [00:00<00:00, 13408.44it/s]
100%|██████████| 444/444 [00:00<00:00, 10098.48it/s]
100%|██████████| 444/444 [00:00<00:00, 16261.39it/s]
100%|██████████| 444/444 [00:00<00:00, 514.48it/s]
100%|██████████| 444/444 [00:00<00:00, 495.75it/s]
100%|██████████| 444/444 [00:00<00:00, 485.15it/s]
100%|██████████| 444/444 [00:01<00:00, 292.69it/s]
100%|██████████| 444/444 [00:01<00:00, 293.05it/s]
100%|██████████| 444/444 [00:01<00:00, 289.93it/s]
100%|██████████| 444/444 [00:01<00:00, 286.06it/s]
100%|██████████| 444/444 [00:01<00:00, 278.52it/s]


Unnamed: 0,title,text,label
0,Trump says he is releasing something ' phenomenal in terms of tax ' in 2 to 3 weeks,"Bob Bryan , Business Insider 9.02 . 2017 , 16:25 182 facebook linkedin twitter email print "" We're going to be announcing something over the next , I would say , two or three weeks that will be ph...",news
1,Fidel Castro's ashes make their final journey across Cuba,Cubans have been lining the streets from Havana to Santiago to watch Fidel Castro's ashes make their final journey . The route is the reverse of that taken by Castro and his rebels to mark their v...,news
2,Obama Administration Sending $ 500 Million to Global Climate Change Fund,"WASHINGTON — The Obama administration announced Tuesday it is funneling $ 500 million to a global fund to address climate change , one of the final acts President Barack Obama is taking to preserv...",news
3,Insurers Are Worried About The House GOP Health Care Bill,"The main industry groups representing health insurance companies have broken their silence on the Republican bill to repeal the Affordable Care Act and "" replace "" it with smaller reforms . Americ...",news
4,Kobe Bryant and Nike Form Youth Basketball ' Mamba League ' in Los Angeles,"A year after Kobe Bryant concluded his NBA career , Bryant and Nike have announced a partnership to establish a youth basketball program in Los Angeles . The Mamba League will instruct boys and gi...",news


In [31]:
# TweetTokenizer-tokenized test split built from the ftfy-fixed frame (text columns only).
X_test_tweet = pd.DataFrame({
    'title': parallelize(X_test_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_test_ftfy['text'], parallel_tokenize),
})
X_test_tweet.head()

100%|██████████| 706/706 [00:00<00:00, 20574.17it/s]
100%|██████████| 706/706 [00:00<00:00, 13889.40it/s]
100%|██████████| 706/706 [00:00<00:00, 12243.05it/s]
100%|██████████| 706/706 [00:00<00:00, 13919.04it/s]
100%|██████████| 706/706 [00:00<00:00, 11587.11it/s]
100%|██████████| 706/706 [00:00<00:00, 14925.60it/s]
100%|██████████| 706/706 [00:00<00:00, 11122.76it/s]
100%|██████████| 705/705 [00:00<00:00, 12621.74it/s]
100%|██████████| 706/706 [00:01<00:00, 561.73it/s]
100%|██████████| 706/706 [00:01<00:00, 537.76it/s]
100%|██████████| 706/706 [00:01<00:00, 418.34it/s]
100%|██████████| 706/706 [00:01<00:00, 392.23it/s]
100%|██████████| 706/706 [00:01<00:00, 397.75it/s]
100%|██████████| 705/705 [00:01<00:00, 384.20it/s]
100%|██████████| 706/706 [00:01<00:00, 379.67it/s]
100%|██████████| 706/706 [00:01<00:00, 375.88it/s]


Unnamed: 0,title,text
0,Amazon CEO Jeff Bezos is now the second richest man in the world,"More Try Yahoo Finance on Firefox » Amazon CEO Jeff Bezos is now the second richest man in the world after overtaking Amancio Ortega and Warren Buffett , according to Bloomberg's Billionaires Inde..."
1,Does Laura Dern Handle a Lightsaber in the New ' Star Wars ' ? [ Video ],"More Laura Dern seems to be everywhere these days . That's because she is . She's the ferocious Renata in Big Little Lies , she's a recovering drug addict in Wilson , and she has two top-secret ro..."
2,"In this photographer's home town , stepping out of the house is a risk","Kirkuk is a city of Northern Iraq in the Kurdish region of the country . Arabs , Kurds , Turkmen , Christians and foreign workers live beside one another . Back in the day , Saddam Hussein initiat..."
3,"8 Ways To Get Your Spouse To Open Up More , According To Therapists",Experts say that communication is the cornerstone of a good relationship . That's why it can be deeply troubling when your partner is closed off and guarded . How do you get them to open up ? Belo...
4,US says claim it supported IS in Syria is ' ludicrous ',"Share this with Email Facebook Messenger Messenger Twitter Pinterest WhatsApp LinkedIn Copy this link The US has described as "" ludicrous "" a claim by Turkish President Recep Tayyip Erdogan that i..."


In [215]:
# TweetTokenizer-tokenized unlabeled split built from the ftfy-fixed frame (text columns only).
X_unlabeled_tweet = pd.DataFrame({
    'title': parallelize(X_unlabeled_ftfy['title'], parallel_tokenize),
    'text': parallelize(X_unlabeled_ftfy['text'], parallel_tokenize),
})
X_unlabeled_tweet.head()

100%|██████████| 10002/10002 [00:00<00:00, 13571.69it/s]
100%|██████████| 10002/10002 [00:00<00:00, 13168.32it/s]
100%|██████████| 10001/10001 [00:00<00:00, 13415.92it/s]
100%|██████████| 10002/10002 [00:00<00:00, 12432.71it/s]
100%|██████████| 10002/10002 [00:00<00:00, 12397.41it/s]
100%|██████████| 10001/10001 [00:00<00:00, 12611.75it/s]
100%|██████████| 10001/10001 [00:00<00:00, 12426.20it/s]
100%|██████████| 10002/10002 [00:00<00:00, 11819.49it/s]
100%|██████████| 10002/10002 [00:36<00:00, 271.39it/s]
100%|██████████| 10002/10002 [00:37<00:00, 267.71it/s]
100%|██████████| 10002/10002 [00:37<00:00, 266.03it/s]
100%|██████████| 10002/10002 [00:37<00:00, 266.72it/s]
100%|██████████| 10001/10001 [00:37<00:00, 269.82it/s]
100%|██████████| 10002/10002 [00:37<00:00, 267.07it/s]
100%|██████████| 10001/10001 [00:37<00:00, 265.73it/s]
100%|██████████| 10001/10001 [00:37<00:00, 263.30it/s]


Unnamed: 0,title,text
0,"Freida Pinto , Allison Williams , Kuoth Wiel are a dream in David O . Russell's Prada film","Is David O . Russell's Prada short film the future of fashion advertising ? Can the artistry of film sell clothing better and entertain ? Let's hope so . Past Forward , an 18 - minute short film o..."
1,India's Cash Crisis Isn't Modi's Only Headache - Bloomberg,"Connecting decision makers to a dynamic network of information , people and ideas , Bloomberg quickly and accurately delivers business and financial information , news and insight around the world..."
2,"12 years after her murder , DNA from teen victim's fingernails leads to suspect","NEW YORK – Ten years after her murder , the DNA beneath a teenager's fingernails has finally resulted in an indictment , reportsCBS New York . On Feb . 11 , 2004 , the naked body of 17 - year-old ..."
3,"Thanks Kejriwal , Delhi Is Number 1 Now , On The List Of World's Most Polluted Cities !","Hey , Arvind Kejriwal , you did it - New Delhi is number 1 . The number 1 most polluted city in the world . Air Quality Index measurements from the US Embassy clocked Delhi at 999 on Monday - whic..."
4,""" You heard it here first "" : Man predicted Cubs World Series win in 1993 yearbook","It's a quote that has stuck with Los Angeles Dodgers fan Marcos Meza for decades : "" Chicago Cubs . 2016 . World Champions . You heard it here first . "" His classmate Mike Lee , a die-hard Chicago..."


**Save/Load Tokenized Data (nltk)**

In [None]:
# Persist the TweetTokenizer-tokenized splits under DATA_PATH (row index is not written).
X_train_tweet.to_csv(DATA_PATH.joinpath('X_train_ftfy_nfkd_tweet.csv'), index=False)
X_valid_tweet.to_csv(DATA_PATH.joinpath('X_valid_ftfy_nfkd_tweet.csv'), index=False)
X_test_tweet.to_csv(DATA_PATH.joinpath('X_test_ftfy_nfkd_tweet.csv'), index=False)
X_unlabeled_tweet.to_csv(DATA_PATH.joinpath('X_unlabeled_ftfy_nfkd_tweet.csv'), index=False)

In [6]:
# Reload the TweetTokenizer-tokenized splits written by the previous cell.
X_train_tweet, X_valid_tweet, X_test_tweet, X_unlabeled_tweet = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in (
        'X_train_ftfy_nfkd_tweet.csv',
        'X_valid_ftfy_nfkd_tweet.csv',
        'X_test_ftfy_nfkd_tweet.csv',
        'X_unlabeled_ftfy_nfkd_tweet.csv',
    )
)

**Keyword Extraction**

In [47]:
# Register `progress_apply` on pandas objects. Called on the class so that no
# throwaway progress-bar instance (the stray "0it" bar) is created.
tqdm.pandas()

0it [00:00, ?it/s]


In [120]:
# YAKE keyword extractor used by the keyword-extraction tokenize() below.
# n=2 — presumably the maximum keyword n-gram length (the keywords shown further
# down are one or two words long); confirm against the yake documentation.
kw_extractor = yake.KeywordExtractor(n=2)

In [121]:
def parallel_tokenize(s):
    """Extract keywords for every document in one partition of a Series.

    Used as the worker function handed to ``parallelize``: ``s`` is the slice
    of the column given to this worker, and ``tokenize`` is mapped over it with
    tqdm's ``progress_apply`` so each partition reports its own progress.
    """
    keywords_per_doc = s.progress_apply(tokenize)
    return keywords_per_doc

def tokenize(doc):
    """Return the keywords YAKE extracts from a single document.

    Delegates to the notebook-level ``kw_extractor``; the value returned is
    exactly what ``extract_keywords`` produces for ``doc``.
    """
    keywords = kw_extractor.extract_keywords(doc)
    return keywords

In [None]:
# Keyword extraction for the validation split: both free-text columns are run
# through the parallel worker, while the label column is copied over untouched.
X_valid_keywords = pd.DataFrame()
for text_col in ('title', 'text'):
    X_valid_keywords[text_col] = parallelize(X_valid_tweet[text_col], parallel_tokenize)
X_valid_keywords['label'] = X_valid_tweet['label']
X_valid_keywords.head()

In [None]:
# Keyword extraction for the test split, mirroring the validation cell: the two
# free-text columns go through the parallel worker, the label is copied as is.
X_test_keywords = pd.DataFrame()
for text_col in ('title', 'text'):
    X_test_keywords[text_col] = parallelize(X_test_tweet[text_col], parallel_tokenize)
X_test_keywords['label'] = X_test_tweet['label']
X_test_keywords.head()

In [160]:
def replace_spaces(s, repl='_'):
    """Return ``s`` with every space character swapped for ``repl``.

    Only the plain space ``' '`` is affected (tabs and newlines are kept), and
    consecutive spaces each produce their own ``repl``.
    """
    # Splitting on the literal space keeps empty pieces for runs of spaces and
    # for leading/trailing spaces, so joining restores one `repl` per space.
    pieces = s.split(' ')
    return repl.join(pieces)

In [164]:
# Flatten each title's keyword result into one space-separated string: only the
# first element of every result tuple (the keyword text) is kept, and spaces
# inside a multi-word keyword become underscores so that every keyword stays a
# single whitespace-delimited token.
valid_keywords = X_valid_keywords['title'].apply(
    lambda pairs: ' '.join(replace_spaces(pair[0]) for pair in pairs)
)
# A title without keywords yields '' here, never NaN, so a fillna() call can
# not insert the placeholder; substitute it for the empty strings explicitly.
valid_keywords = valid_keywords.replace('', 'no_keywords')
valid_keywords.head()

0                                                                                                               weeks trump phenomenal tax releasing terms
1                                                            fidel_castro castro_ashes cuba ashes_make final_journey castro fidel ashes make final journey
2    administration_sending change_fund global_climate climate_change obama_administration sending million fund administration global climate change obama
3                                                                   care_bill house_gop gop_health health_care bill worried house gop health care insurers
4               mamba_league youth_basketball los_angeles nike_form form_youth kobe_bryant basketball mamba league angeles bryant nike form youth los kobe
Name: title, dtype: object

In [164]:
# Flatten each title's keyword result into one space-separated string: only the
# first element of every result tuple (the keyword text) is kept, and spaces
# inside a multi-word keyword become underscores so that every keyword stays a
# single whitespace-delimited token.
df = X_valid_keywords['title'].apply(
    lambda pairs: ' '.join(replace_spaces(pair[0]) for pair in pairs)
)
# A title without keywords yields '' here, never NaN, so a fillna() call can
# not insert the placeholder; substitute it for the empty strings explicitly.
df = df.replace('', 'no_keywords')
df.head()

0                                                                                                               weeks trump phenomenal tax releasing terms
1                                                            fidel_castro castro_ashes cuba ashes_make final_journey castro fidel ashes make final journey
2    administration_sending change_fund global_climate climate_change obama_administration sending million fund administration global climate change obama
3                                                                   care_bill house_gop gop_health health_care bill worried house gop health care insurers
4               mamba_league youth_basketball los_angeles nike_form form_youth kobe_bryant basketball mamba league angeles bryant nike form youth los kobe
Name: title, dtype: object

In [165]:
# Attach the per-title keyword strings to the validation frame as a new
# 'keywords' column (this mutates X_valid in place), then preview the result.
# NOTE(review): `df` was derived from X_valid_keywords rather than X_valid, so
# this assumes both frames share the same row index and order — confirm.
X_valid['keywords'] = df
X_valid.head()

Unnamed: 0,label,title,text,keywords
0,news,Trump says he is releasing something 'phenomenal in terms of tax' in 2 to 3 weeks,"Bob Bryan, Business Insider 9.02.2017, 16:25 182 facebook linkedin twitter email print ""We're going to be announcing something over the next, I would say, two or three weeks that will be phenomena...",weeks trump phenomenal tax releasing terms
1,news,Fidel Castro's ashes make their final journey across Cuba,Cubans have been lining the streets from Havana to Santiago to watch Fidel Castro's ashes make their final journey. The route is the reverse of that taken by Castro and his rebels to mark their vi...,fidel_castro castro_ashes cuba ashes_make final_journey castro fidel ashes make final journey
2,news,Obama Administration Sending $500 Million to Global Climate Change Fund,"WASHINGTON—The Obama administration announced Tuesday it is funneling $500 million to a global fund to address climate change, one of the final acts President Barack Obama is taking to preserve hi...",administration_sending change_fund global_climate climate_change obama_administration sending million fund administration global climate change obama
3,news,Insurers Are Worried About The House GOP Health Care Bill,The main industry groups representing health insurance companies have broken their silence on the Republican bill to repeal the Affordable Care Act and “replace” it with smaller reforms. America’s...,care_bill house_gop gop_health health_care bill worried house gop health care insurers
4,news,Kobe Bryant and Nike Form Youth Basketball 'Mamba League' in Los Angeles,"A year after Kobe Bryant concluded his NBA career, Bryant and Nike have announced a partnership to establish a youth basketball program in Los Angeles. The Mamba League will instruct boys and gir...",mamba_league youth_basketball los_angeles nike_form form_youth kobe_bryant basketball mamba league angeles bryant nike form youth los kobe


In [176]:
# Concatenate the keyword strings of all validation rows that share a label,
# producing one long space-separated keyword document per class.
label_keywords = X_valid.groupby('label')['keywords'].apply(lambda group: ' '.join(group))
# End on the bare name: an assignment alone renders nothing, so without this
# line the Series shown under the cell would not reappear on a fresh re-run.
label_keywords

label
clickbait    fake_french facebook_targets french_accounts targets fake election french accounts facebook maternity_paranoia boss_made paranoia boss made mad suffering maternity rock_gods gods rock twilight glo...
news         weeks trump phenomenal tax releasing terms fidel_castro castro_ashes cuba ashes_make final_journey castro fidel ashes make final journey administration_sending change_fund global_climate climate_c...
other        xxempty xxempty xxempty xxempty xxempty xxempty xxempty abercrombie_online abercrombie online xxempty xxempty xxempty xxempty xxempty outlook batch vcards xxempty xxempty prescient_comment prescie...
Name: keywords, dtype: object

In [188]:
# Token frequencies per class: each label's concatenated keyword document is
# split on whitespace and counted.
clickbait_keywords, news_keywords, other_keywords = (
    Counter(label_keywords[label].split())
    for label in ('clickbait', 'news', 'other')
)

# Print the 100 most frequent keyword tokens of each class, one list per line.
print(
    clickbait_keywords.most_common(100),
    news_keywords.most_common(100),
    other_keywords.most_common(100),
    sep='\n',
)

[('trump', 60), ('people', 22), ('donald', 19), ('donald_trump', 18), ('things', 14), ('world', 14), ('women', 13), ('make', 13), ('video', 12), ('america', 11), ('year', 11), ('list', 10), ('obama', 10), ('life', 10), ('watch', 9), ('love', 9), ('christmas', 9), ('man', 9), ('president', 9), ('day', 8), ('big', 8), ('woman', 8), ('made', 7), ('clinton', 7), ('photos', 7), ('war', 7), ('show', 7), ('money', 7), ('facebook', 6), ('white', 6), ('time', 6), ('week', 6), ('hillary', 6), ('home', 6), ('power', 6), ('men', 6), ('stop', 6), ('today', 6), ('iphone', 6), ('fake', 5), ('black', 5), ('reveal', 5), ('game', 5), ('happy', 5), ('hillary_clinton', 5), ('nfl', 5), ('sex', 5), ('tax', 5), ('movie', 5), ('russia', 5), ('house', 5), ('finds', 5), ('give', 5), ('moments', 5), ('college', 5), ('lost', 5), ('police', 5), ('death', 5), ('live', 5), ('signs', 5), ('espn', 5), ('election', 4), ('finally', 4), ('street', 4), ('young', 4), ('march', 4), ('americans', 4), ('hot', 4), ('dating', 4

**Save/Load Keyword Data**

In [32]:
# Cache the keyword frames on disk; index=False keeps the pandas row index out
# of the CSV files.
# NOTE(review): no cell in this section builds X_train_keywords (only the
# valid/test frames are extracted here) — confirm it exists before running.
X_train_keywords.to_csv(DATA_PATH.joinpath('X_train_ftfy_nfkd_tweet_keywords.csv'), index=False)
X_valid_keywords.to_csv(DATA_PATH.joinpath('X_valid_ftfy_nfkd_tweet_keywords.csv'), index=False)
X_test_keywords.to_csv(DATA_PATH.joinpath('X_test_ftfy_nfkd_tweet_keywords.csv'), index=False)

In [33]:
# Reload the keyword frames from the CSV files written by the save cell.
# NOTE(review): cells that held Python lists before saving come back from a
# CSV round trip as plain strings; parse them before treating them as lists.
X_train_keywords = pd.read_csv(DATA_PATH/'X_train_ftfy_nfkd_tweet_keywords.csv')
X_valid_keywords = pd.read_csv(DATA_PATH/'X_valid_ftfy_nfkd_tweet_keywords.csv')
# The test path previously ended in '_keywords_keywords.csv', a file the save
# cell never writes; it now matches the name used when saving.
X_test_keywords = pd.read_csv(DATA_PATH/'X_test_ftfy_nfkd_tweet_keywords.csv')

**Spelling Correction**

In [7]:
from symspellpy.symspellpy import SymSpell 

In [8]:
# Settings for the SymSpell index built below. Both values are passed
# positionally, so their meaning depends on the constructor's parameter order —
# NOTE(review): presumably (max dictionary edit distance, prefix length) for
# the installed symspellpy release; confirm against its signature.
max_edit_distance_dictionary = 2
prefix_length = 7

# Notebook-level spell checker shared by the lookup cells that follow.
sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

In [9]:
# Seed the SymSpell vocabulary with the words found in the cleaned training CSV.
# NOTE(review): the whole file is read as plain text, so header names and label
# values presumably end up in the vocabulary as well — confirm this is intended.
train_corpus_path = DATA_PATH/'X_train_ftfy_nfkd_tweet.csv'
# create_dictionary reports a missing file by returning False instead of
# raising; fail loudly so an empty vocabulary is not used by accident.
if not sym_spell.create_dictionary(train_corpus_path):
    raise FileNotFoundError('SymSpell corpus not found: {}'.format(train_corpus_path))
sym_spell.word_count

147362

In [10]:
# Extend the vocabulary with a general English frequency list; the positional
# 0 and 1 are the column indexes of the term and of its count in that file.
frequency_dictionary_path = DATA_PATH/'frequency_dictionary_en_82_765.txt'
# load_dictionary reports a missing file by returning False instead of raising;
# fail loudly so the corrector is not silently left without this word list.
if not sym_spell.load_dictionary(frequency_dictionary_path, 0, 1):
    raise FileNotFoundError('SymSpell frequency dictionary not found: {}'.format(frequency_dictionary_path))
sym_spell.word_count

174203

In [11]:
# Smoke test of multi-word correction: take the first suggestion for a
# misspelled phrase and show its corrected text. The result displayed below the
# cell is lower-case with the '!' dropped.
sym_spell.lookup_compound('ehllo wolrd!', max_edit_distance=2)[0].term

'hello world'

In [12]:
# Smoke test of word segmentation on an underscore-joined string. In the result
# shown below the cell the underscores are not handled cleanly ('it_is' ends up
# as 'this'), so underscore-joined keywords are a poor fit for this call.
sym_spell.word_segmentation('mymistake_it_is_not_possible', max_edit_distance=2)

Composition(segmented_string='my mistake_ it_is _not _possible', corrected_string='my mistake this not possible', distance_sum=9, log_prob_sum=-16.972527602283865)

In [13]:
def parallel_tokenize(s):
    """Spell-correct every document in one partition of a Series.

    Used as the worker function handed to ``parallelize``: ``correct_spell`` is
    mapped over the slice ``s`` with tqdm's ``progress_apply``.

    NOTE(review): this reuses the name of the keyword-extraction worker defined
    earlier in the notebook, so once this cell has run the earlier worker is
    shadowed; re-running the keyword cells afterwards would spell-correct
    instead of extracting keywords.
    """
    corrected = s.progress_apply(correct_spell)
    return corrected

def correct_spell(s):
    """Return the corrected text of SymSpell's first compound suggestion for ``s``.

    Uses the notebook-level ``sym_spell`` instance with a maximum edit distance
    of 2 and keeps only the ``term`` of the first suggestion in the result.
    """
    suggestions = sym_spell.lookup_compound(s, max_edit_distance=2)
    best_suggestion = suggestions[0]
    return best_suggestion.term

In [16]:
# Spell-correct every training title in parallel and store the result next to
# the original in a new 'correct_title' column (mutates X_train_tweet in place).
# NOTE(review): `parallel_tokenize` resolves to whichever definition ran last
# in the kernel; this line relies on the spell-correction version — confirm.
X_train_tweet['correct_title'] = parallelize(X_train_tweet['title'], parallel_tokenize)

100%|██████████| 3109/3109 [01:14<00:00, 41.70it/s]
100%|██████████| 3109/3109 [01:15<00:00, 41.12it/s]
100%|██████████| 3108/3108 [01:16<00:00, 40.88it/s]
100%|██████████| 3109/3109 [02:29<00:00, 20.84it/s]
100%|██████████| 3109/3109 [02:36<00:00, 19.82it/s]
100%|██████████| 3109/3109 [02:37<00:00, 19.76it/s]
100%|██████████| 3109/3109 [02:37<00:00, 19.77it/s] 
100%|██████████| 3109/3109 [02:49<00:00, 18.37it/s]


In [17]:
# Rows whose corrected title is not identical to the original title. The rows
# displayed below the cell all differ at least by case and punctuation, so
# this filter is a coarse check rather than a list of real spelling fixes.
X_train_tweet.loc[X_train_tweet['correct_title'].ne(X_train_tweet['title'])]

Unnamed: 0,title,text,label,correct_title
0,China and Economic Reform : Xi Jinping's Track Record,Economists generally agree : China must overhaul its huge but wasteful economy if it wants to continue to grow in the years to come . That means limiting political interference in banking and the ...,news,china and economic reform xi jinping's track record
1,Trade to Be a Big Topic in Theresa May's U . S . Visit,"LONDON — British Prime Minister Theresa May said she'll discuss trade and security in a coming meeting with President Donald Trump , his first visit from a foreign leader as president , underscori...",news,trade to be a big topic in theresa may's u s visit
2,"The Top Beaches In The World , According To National Geographic","Beaches come in all sorts of shapes and sizes beyond the typical Caribbean postcard . As such , National Geographic's new list of the Top 21 Beaches in the World includes a diverse mix of shorelin...",clickbait,the top beaches in the world according to national geographic
3,"Sheriff's Report Provides New Details on Tamir Rice's Death , but Leaves Questions","A timeline of what happened after Tamir Rice , a 12 - year-old boy , was killed by a police officer in Cleveland last November . A lengthy report published Saturday revealed new details about the ...",clickbait,sheriff's report provides new details on tamir rice's death but leaves questions
4,Surgeon claiming he will transplant volunteer's HEAD to another body says he needs America's help to do it,An Italian neurosurgeon who has claimed for months that he will perform the world's first human head transplant asked Americans to ' be Americans ' and donate to his cause . Dr . Sergio Canavero's...,news,surgeon claiming he will transplant volunteer's head to another body says he needs america's help to do it
5,This Is How Differently Priced Spanx Can Actually Make Your Body Look,"Spanx does not do the thing I thought it did . 2 . Hi , I'm Kristin . Like many ladies on this lil ' dirt planet , I've believed my whole life that if you have ANY squishable body parts , the key ...",news,this is how differently priced spanx can actually make your body look
6,Samantha Bee and Jane Pauley Are Breaking the News,""" Jane Pauley ! You're all in leather , "" said Samantha Bee , as Ms . Pauley walked into the private dining room at Asiate restaurant in the Mandarin Oriental hotel in New York . And she was : in ...",news,samantha bee and jane pauley are breaking the news
7,Krauthammer : Syria Strike ' Total Contradiction to Bannonism ' Insider,"Charles Krauthammer called President Trump's 59 - missile strike on a Syrian airbase "" a total contradiction to Bannonism . "" Krauthammer said White House adviser Stephen Bannon trumpeted a theme ...",news,krauthammer syria strike total contradiction to bannonism insider
8,Rust Belt voters made Trump president . Now they want jobs,"The United States is making more things than ever before . Yes , you read that right . Manufacturing output is at an all-time high , according to one government statistic ( others indicate it's ne...",news,rust belt voters made trump president now they want jobs
9,"As Illegal Outpost Cleared , Israeli PM Netanyahu Announces New West Bank Settlement","As Jewish settlers and protesters were removed from an illegal outpost in the occupied West Bank overnight , Prime Minister Benjamin Netanyahu announced plans to build the first new settlement in ...",news,as illegal outpost cleared israeli pm netanyahu announces new west bank settlement
