In [1]:
from pathlib import Path
import pickle
import re
from functools import partial

from tqdm import tqdm, tqdm_notebook
from multiprocessing import cpu_count, Pool

import numpy as np
import pandas as pd

import spacy
import ftfy
import yake
from nltk.tokenize import TweetTokenizer

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.options.display.max_colwidth = 200 # default - 50

In [3]:
DATA_PATH = Path('../data')
RANDOM_SEED = 17

tqdm().pandas()

0it [00:00, ?it/s]


**Load Data**

In [4]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
valid_df = pd.read_csv(DATA_PATH/'valid.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')

In [5]:
text_cols = ['title', 'text']

**Clean Data**

In [6]:
def parallelize(data, func, cores=None):
    if cores is None:
        cores = cpu_count()
    partitions = cores
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

In [7]:
X_train = train_df.fillna('')
X_valid = valid_df.fillna('')
X_test = test_df.fillna('')

**Fix unicode characters**

In [8]:
def parallel_fix_text(s, normalization='NFKC'):
    return s.progress_apply(fix_text)

def fix_text(doc):
    return ftfy.fix_text(doc)

In [9]:
X_train_ftfy = pd.DataFrame()
X_train_ftfy['clean_title'] = parallelize(X_train['title'], parallel_fix_text)
X_train_ftfy['clean_text'] = parallelize(X_train['text'], parallel_fix_text)
X_train_ftfy.head()

100%|██████████| 3109/3109 [00:00<00:00, 22100.91it/s]
100%|██████████| 3108/3108 [00:00<00:00, 18084.56it/s]
100%|██████████| 3109/3109 [00:00<00:00, 17231.61it/s]
100%|██████████| 3109/3109 [00:00<00:00, 13246.69it/s]
100%|██████████| 3109/3109 [00:00<00:00, 12758.49it/s]
100%|██████████| 3109/3109 [00:00<00:00, 11046.03it/s]
100%|██████████| 3109/3109 [00:00<00:00, 9337.93it/s]
100%|██████████| 3109/3109 [00:00<00:00, 9167.28it/s]
100%|██████████| 3108/3108 [00:14<00:00, 216.37it/s]
100%|██████████| 3109/3109 [00:14<00:00, 209.88it/s]
100%|██████████| 3109/3109 [00:14<00:00, 209.19it/s]
100%|██████████| 3109/3109 [00:21<00:00, 141.99it/s]
100%|██████████| 3109/3109 [00:23<00:00, 133.49it/s]
100%|██████████| 3109/3109 [00:23<00:00, 132.90it/s]
100%|██████████| 3109/3109 [00:23<00:00, 131.09it/s]
100%|██████████| 3109/3109 [00:23<00:00, 130.94it/s]


Unnamed: 0,clean_title,clean_text
0,China and Economic Reform: Xi Jinping's Track Record,Economists generally agree: China must overhaul its huge but wasteful economy if it wants to continue to grow in the years to come. That means limiting political interference in banking and the fi...
1,Trade to Be a Big Topic in Theresa May's U.S. Visit,"LONDON—British Prime Minister Theresa May said she'll discuss trade and security in a coming meeting with President Donald Trump, his first visit from a foreign leader as president, underscoring t..."
2,"The Top Beaches In The World, According To National Geographic","Beaches come in all sorts of shapes and sizes beyond the typical Caribbean postcard. As such, National Geographic's new list of the Top 21 Beaches in the World includes a diverse mix of shorelines..."
3,"Sheriff's Report Provides New Details on Tamir Rice's Death, but Leaves Questions","A timeline of what happened after Tamir Rice, a 12-year-old boy, was killed by a police officer in Cleveland last November. A lengthy report published Saturday revealed new details about the fatal..."
4,Surgeon claiming he will transplant volunteer's HEAD to another body says he needs America's help to do it,An Italian neurosurgeon who has claimed for months that he will perform the world's first human head transplant asked Americans to 'be Americans' and donate to his cause. Dr. Sergio Canavero's pl...


In [10]:
diff_df = pd.concat([X_train_ftfy, X_train], axis=1)
diff_df = diff_df[diff_df['clean_title'] != diff_df['title']]
diff_df.shape

(3397, 5)

In [11]:
diff_df.head()

Unnamed: 0,clean_title,clean_text,label,title,text
0,China and Economic Reform: Xi Jinping's Track Record,Economists generally agree: China must overhaul its huge but wasteful economy if it wants to continue to grow in the years to come. That means limiting political interference in banking and the fi...,news,China and Economic Reform: Xi Jinping’s Track Record,Economists generally agree: China must overhaul its huge but wasteful economy if it wants to continue to grow in the years to come. That means limiting political interference in banking and the fi...
1,Trade to Be a Big Topic in Theresa May's U.S. Visit,"LONDON—British Prime Minister Theresa May said she'll discuss trade and security in a coming meeting with President Donald Trump, his first visit from a foreign leader as president, underscoring t...",news,Trade to Be a Big Topic in Theresa May’s U.S. Visit,"LONDON—British Prime Minister Theresa May said she’ll discuss trade and security in a coming meeting with President Donald Trump, his first visit from a foreign leader as president, underscoring t..."
3,"Sheriff's Report Provides New Details on Tamir Rice's Death, but Leaves Questions","A timeline of what happened after Tamir Rice, a 12-year-old boy, was killed by a police officer in Cleveland last November. A lengthy report published Saturday revealed new details about the fatal...",clickbait,"Sheriff’s Report Provides New Details on Tamir Rice’s Death, but Leaves Questions","A timeline of what happened after Tamir Rice, a 12-year-old boy, was killed by a police officer in Cleveland last November. A lengthy report published Saturday revealed new details about the fatal..."
11,Trump's Mar-a-Lago Hit With 13 Health Violations in January,President Donald Trump may want to consider swinging by the kitchen when he visits his plush Mar-a-Lago resort this weekend. Florida health inspectors found more than a dozen violations during a J...,news,Trump’s Mar-a-Lago Hit With 13 Health Violations in January,President Donald Trump may want to consider swinging by the kitchen when he visits his plush Mar-a-Lago resort this weekend. Florida health inspectors found more than a dozen violations during a J...
24,EXCLUSIVE: 'I'm not going crazy to cut my head off here': Disabled 'guinea pig' for first full HEAD transplant to meet surgeon for first time this week and insists he'll only go under knife when i...,The man volunteering to be the guinea pig for pioneering head transplant surgery is flying to America this week to meet for the first time the doctor intending to give him a new body. Russian Vale...,news,EXCLUSIVE: 'I'm not going crazy to cut my head off here': Disabled ‘guinea pig’ for first full HEAD transplant to meet surgeon for first time this week and insists he'll only go under knife when i...,The man volunteering to be the guinea pig for pioneering head transplant surgery is flying to America this week to meet for the first time the doctor intending to give him a new body. Russian Vale...


In [12]:
top_diff_df = (diff_df['clean_title'].str.len() - diff_df['title'].str.len()).abs().sort_values(ascending=False).head(10)
top_diff_df

2360    6
128     4
8375    4
2861    4
7128    4
3256    4
2949    4
5574    4
7053    3
8223    3
dtype: int64

In [13]:
X_valid_ftfy = pd.DataFrame()
X_valid_ftfy['clean_title'] = parallelize(X_valid['title'], parallel_fix_text)
X_valid_ftfy['clean_text'] = parallelize(X_valid['text'], parallel_fix_text)
X_valid_ftfy.head()

100%|██████████| 444/444 [00:00<00:00, 11798.25it/s]
100%|██████████| 444/444 [00:00<00:00, 10942.43it/s]
100%|██████████| 444/444 [00:00<00:00, 7218.50it/s]
100%|██████████| 444/444 [00:00<00:00, 11558.21it/s]
100%|██████████| 444/444 [00:00<00:00, 7009.61it/s]
100%|██████████| 444/444 [00:00<00:00, 7751.32it/s]
100%|██████████| 444/444 [00:00<00:00, 7127.93it/s]
100%|██████████| 444/444 [00:00<00:00, 6517.61it/s]
100%|██████████| 444/444 [00:02<00:00, 208.51it/s]
100%|██████████| 444/444 [00:02<00:00, 207.87it/s]
100%|██████████| 444/444 [00:02<00:00, 197.63it/s]
100%|██████████| 444/444 [00:03<00:00, 135.98it/s]
100%|██████████| 444/444 [00:03<00:00, 133.57it/s]
100%|██████████| 444/444 [00:03<00:00, 126.80it/s]
100%|██████████| 444/444 [00:03<00:00, 127.01it/s]
100%|██████████| 444/444 [00:03<00:00, 121.34it/s]


Unnamed: 0,clean_title,clean_text
0,Trump says he is releasing something 'phenomenal in terms of tax' in 2 to 3 weeks,"Bob Bryan, Business Insider 9.02.2017, 16:25 182 facebook linkedin twitter email print ""We're going to be announcing something over the next, I would say, two or three weeks that will be phenomena..."
1,Fidel Castro's ashes make their final journey across Cuba,Cubans have been lining the streets from Havana to Santiago to watch Fidel Castro's ashes make their final journey. The route is the reverse of that taken by Castro and his rebels to mark their vi...
2,Obama Administration Sending $500 Million to Global Climate Change Fund,"WASHINGTON—The Obama administration announced Tuesday it is funneling $500 million to a global fund to address climate change, one of the final acts President Barack Obama is taking to preserve hi..."
3,Insurers Are Worried About The House GOP Health Care Bill,"The main industry groups representing health insurance companies have broken their silence on the Republican bill to repeal the Affordable Care Act and ""replace"" it with smaller reforms. America's..."
4,Kobe Bryant and Nike Form Youth Basketball 'Mamba League' in Los Angeles,"A year after Kobe Bryant concluded his NBA career, Bryant and Nike have announced a partnership to establish a youth basketball program in Los Angeles. The Mamba League will instruct boys and gir..."


In [14]:
X_test_ftfy = pd.DataFrame()
X_test_ftfy['clean_title'] = parallelize(X_test['title'], parallel_fix_text)
X_test_ftfy['clean_text'] = parallelize(X_test['text'], parallel_fix_text)
X_test_ftfy.head()

100%|██████████| 706/706 [00:00<00:00, 7253.95it/s]
100%|██████████| 706/706 [00:00<00:00, 7249.60it/s]
100%|██████████| 706/706 [00:00<00:00, 7010.30it/s]
100%|██████████| 706/706 [00:00<00:00, 7000.31it/s]
100%|██████████| 706/706 [00:00<00:00, 7267.25it/s]
100%|██████████| 706/706 [00:00<00:00, 6911.81it/s]
100%|██████████| 706/706 [00:00<00:00, 7132.17it/s]
100%|██████████| 705/705 [00:00<00:00, 6327.13it/s]
100%|██████████| 706/706 [00:06<00:00, 115.60it/s]
100%|██████████| 706/706 [00:06<00:00, 114.49it/s]
100%|██████████| 706/706 [00:06<00:00, 112.11it/s]
100%|██████████| 706/706 [00:06<00:00, 111.09it/s]
100%|██████████| 706/706 [00:06<00:00, 108.51it/s]
100%|██████████| 706/706 [00:06<00:00, 107.73it/s]
100%|██████████| 706/706 [00:06<00:00, 107.11it/s]
100%|██████████| 705/705 [00:06<00:00, 105.63it/s]


Unnamed: 0,clean_title,clean_text
0,Amazon CEO Jeff Bezos is now the second richest man in the world,"More Try Yahoo Finance on Firefox » Amazon CEO Jeff Bezos is now the second richest man in the world after overtaking Amancio Ortega and Warren Buffett, according to Bloomberg's Billionaires Index..."
1,Does Laura Dern Handle a Lightsaber in the New 'Star Wars'? [Video],"More Laura Dern seems to be everywhere these days. That's because she is. She's the ferocious Renata in Big Little Lies, she's a recovering drug addict in Wilson, and she has two top-secret roles ..."
2,"In this photographer's home town, stepping out of the house is a risk","Kirkuk is a city of Northern Iraq in the Kurdish region of the country. Arabs, Kurds, Turkmen, Christians and foreign workers live beside one another. Back in the day, Saddam Hussein initiated sev..."
3,"8 Ways To Get Your Spouse To Open Up More, According To Therapists","Experts say that communication is the cornerstone of a good relationship. That's why it can be deeply troubling when your partner is closed off and guarded. How do you get them to open up? Below,..."
4,US says claim it supported IS in Syria is 'ludicrous',"Share this with Email Facebook Messenger Messenger Twitter Pinterest WhatsApp LinkedIn Copy this link The US has described as ""ludicrous"" a claim by Turkish President Recep Tayyip Erdogan that it ..."


**Save/Load Clean Data**

In [15]:
with open(DATA_PATH/'X_train_ftfy.pkl', 'wb') as  f:
    pickle.dump(X_train_ftfy, f)

with open(DATA_PATH/'X_valid_ftfy.pkl', 'wb') as  f:
    pickle.dump(X_valid_ftfy, f)
    
with open(DATA_PATH/'X_test_ftfy.pkl', 'wb') as  f:
    pickle.dump(X_test_ftfy, f)

In [16]:
((X_train_ftfy['clean_title'] != X_train['title']).sum(),
 (X_train_ftfy['clean_text'] != X_train['text']).sum(),
 (X_valid_ftfy['clean_title'] != X_valid['title']).sum(),
 (X_valid_ftfy['clean_text'] != X_valid['text']).sum(),
 (X_test_ftfy['clean_title'] != X_test['title']).sum(),
 (X_test_ftfy['clean_text'] != X_test['text']).sum())

(3397, 12215, 455, 1712, 967, 3501)

**Tokenize Data (Spacy)**

In [26]:
def parallel_tokenize(s):
    return s.progress_apply(tokenize)

def tokenize(doc):
    return ' '.join(['_'.join([token.text, token.pos_]) for token in nlp(doc)])

In [31]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner']) #, disable=['parser', 'tagger', 'ner'])

In [32]:
X_train_ftfy_spacy = pd.DataFrame()
X_train_ftfy_spacy['clean_title'] = parallelize(X_train_ftfy['clean_title'], parallel_tokenize)
X_train_ftfy_spacy['clean_text'] = parallelize(X_train_ftfy['clean_text'], parallel_tokenize)
X_train_ftfy_spacy.head()

100%|██████████| 3109/3109 [01:31<00:00, 33.82it/s]
100%|██████████| 3108/3108 [01:32<00:00, 33.67it/s]
100%|██████████| 3109/3109 [01:33<00:00, 33.09it/s]
100%|██████████| 3109/3109 [01:39<00:00, 31.34it/s]
100%|██████████| 3109/3109 [01:39<00:00, 31.26it/s]
100%|██████████| 3109/3109 [01:39<00:00, 31.16it/s]
 97%|█████████▋| 3004/3109 [01:40<00:00, 176.76it/s]
100%|██████████| 3109/3109 [01:40<00:00, 30.90it/s] 
100%|██████████| 3108/3108 [04:26<00:00, 11.67it/s]
100%|██████████| 3109/3109 [04:33<00:00, 11.37it/s]
100%|██████████| 3109/3109 [04:35<00:00, 11.27it/s]
100%|██████████| 3109/3109 [06:19<00:00,  8.18it/s]
100%|██████████| 3109/3109 [06:33<00:00, 13.46it/s]
100%|██████████| 3109/3109 [06:34<00:00, 15.64it/s]
100%|██████████| 3109/3109 [06:37<00:00,  7.82it/s]
100%|██████████| 3109/3109 [06:37<00:00,  7.82it/s]


Unnamed: 0,clean_title,clean_text
0,China_PROPN and_CCONJ Economic_PROPN Reform_PROPN :_PUNCT Xi_PROPN Jinping_PROPN 's_PART Track_PROPN Record_PROPN,Economists_NOUN generally_ADV agree_VERB :_PUNCT China_PROPN must_VERB overhaul_VERB its_ADJ huge_ADJ but_CCONJ wasteful_ADJ economy_NOUN if_ADP it_PRON wants_VERB to_PART continue_VERB to_PART gr...
1,Trade_NOUN to_PART Be_VERB a_DET Big_PROPN Topic_NOUN in_ADP Theresa_PROPN May_PROPN 's_PART U.S._PROPN Visit_NOUN,LONDON_PROPN —_NUM British_ADJ Prime_PROPN Minister_PROPN Theresa_PROPN May_PROPN said_VERB she_PRON 'll_VERB discuss_VERB trade_NOUN and_CCONJ security_NOUN in_ADP a_DET coming_VERB meeting_NOUN ...
2,"The_DET Top_PROPN Beaches_PROPN In_ADP The_DET World_PROPN ,_PUNCT According_VERB To_ADP National_PROPN Geographic_PROPN","Beaches_NOUN come_VERB in_ADP all_DET sorts_NOUN of_ADP shapes_NOUN and_CCONJ sizes_NOUN beyond_ADP the_DET typical_ADJ Caribbean_ADJ postcard_NOUN ._PUNCT As_ADP such_ADJ ,_PUNCT _SPACE National..."
3,"Sheriff_PROPN 's_PART Report_PROPN Provides_VERB New_PROPN Details_PROPN on_ADP Tamir_PROPN Rice_PROPN 's_PART Death_PROPN ,_PUNCT but_CCONJ Leaves_VERB Questions_NOUN","A_DET timeline_NOUN of_ADP what_NOUN happened_VERB after_ADP Tamir_PROPN Rice_PROPN ,_PUNCT a_DET 12-year_ADJ -_PUNCT old_ADJ boy_NOUN ,_PUNCT was_VERB killed_VERB by_ADP a_DET police_NOUN officer..."
4,Surgeon_NOUN claiming_VERB he_PRON will_VERB transplant_NOUN volunteer_NOUN 's_PART HEAD_NOUN to_ADP another_DET body_NOUN says_VERB he_PRON needs_VERB America_PROPN 's_PART help_NOUN to_PART do_V...,An_DET Italian_ADJ neurosurgeon_NOUN who_NOUN has_VERB claimed_VERB for_ADP months_NOUN that_ADP he_PRON will_VERB perform_VERB the_DET world_NOUN 's_PART first_ADJ human_ADJ head_NOUN transplant_...


In [33]:
X_valid_ftfy_spacy = pd.DataFrame()
X_valid_ftfy_spacy['clean_title'] = parallelize(X_valid_ftfy['clean_title'], parallel_tokenize)
X_valid_ftfy_spacy['clean_text'] = parallelize(X_valid_ftfy['clean_text'], parallel_tokenize)
X_valid_ftfy_spacy.head()

100%|██████████| 444/444 [00:20<00:00, 21.38it/s]
100%|██████████| 444/444 [00:21<00:00, 21.05it/s]
100%|██████████| 444/444 [00:21<00:00, 20.38it/s]
 90%|████████▉ | 398/444 [00:21<00:02, 21.45it/s]
100%|██████████| 444/444 [00:22<00:00, 20.03it/s]
100%|██████████| 444/444 [00:22<00:00, 19.51it/s]
100%|██████████| 444/444 [00:22<00:00, 19.52it/s]
100%|██████████| 444/444 [00:23<00:00, 19.22it/s]
100%|██████████| 444/444 [00:45<00:00,  9.83it/s]
100%|██████████| 444/444 [00:46<00:00,  9.64it/s]
 61%|██████    | 270/444 [00:46<00:21,  8.07it/s]
100%|██████████| 444/444 [01:04<00:00,  6.89it/s]
100%|██████████| 444/444 [01:04<00:00,  6.87it/s]
100%|██████████| 444/444 [01:04<00:00,  6.87it/s]
100%|██████████| 444/444 [01:04<00:00,  6.85it/s]
100%|██████████| 444/444 [01:04<00:00,  6.84it/s]


Unnamed: 0,clean_title,clean_text
0,Trump_PROPN says_VERB he_PRON is_VERB releasing_VERB something_NOUN '_PART phenomenal_ADJ in_ADP terms_NOUN of_ADP tax_NOUN '_PUNCT in_ADP 2_NUM to_PART 3_NUM weeks_NOUN,"Bob_PROPN Bryan_PROPN ,_PUNCT Business_PROPN Insider_PROPN 9.02.2017_NUM ,_PUNCT 16:25_NUM 182_NUM facebook_NOUN linkedin_NOUN twitter_NOUN email_NOUN print_NOUN ""_PUNCT We_PRON 're_VERB going_VER..."
1,Fidel_PROPN Castro_PROPN 's_PART ashes_NOUN make_VERB their_ADJ final_ADJ journey_NOUN across_ADP Cuba_PROPN,Cubans_PROPN have_VERB been_VERB lining_VERB the_DET streets_NOUN from_ADP Havana_PROPN to_ADP Santiago_PROPN to_PART watch_VERB Fidel_PROPN Castro_PROPN 's_PART ashes_NOUN make_VERB their_ADJ fin...
2,Obama_PROPN Administration_PROPN Sending_VERB $_SYM 500_NUM Million_NUM to_ADP Global_PROPN Climate_PROPN Change_PROPN Fund_PROPN,WASHINGTON_PROPN —_PUNCT The_DET Obama_PROPN administration_NOUN announced_VERB Tuesday_PROPN it_PRON is_VERB funneling_VERB $_SYM 500_NUM million_NUM to_ADP a_DET global_ADJ fund_NOUN to_PART add...
3,Insurers_NOUN Are_VERB Worried_ADJ About_ADP The_DET House_PROPN GOP_PROPN Health_PROPN Care_PROPN Bill_PROPN,The_DET main_ADJ industry_NOUN groups_NOUN representing_VERB health_NOUN insurance_NOUN companies_NOUN have_VERB broken_VERB their_ADJ silence_NOUN on_ADP the_DET Republican_ADJ bill_NOUN to_PART ...
4,Kobe_PROPN Bryant_PROPN and_CCONJ Nike_PROPN Form_PROPN Youth_PROPN Basketball_PROPN '_PART Mamba_PROPN League_PROPN '_PUNCT in_ADP Los_PROPN Angeles_PROPN,"A_DET year_NOUN after_ADP Kobe_PROPN Bryant_PROPN concluded_VERB his_ADJ NBA_PROPN career_NOUN ,_PUNCT Bryant_PROPN and_CCONJ Nike_PROPN have_VERB announced_VERB a_DET partnership_NOUN to_PART est..."


In [34]:
X_test_ftfy_spacy = pd.DataFrame()
X_test_ftfy_spacy['clean_title'] = parallelize(X_test_ftfy['clean_title'], parallel_tokenize)
X_test_ftfy_spacy['clean_text'] = parallelize(X_test_ftfy['clean_text'], parallel_tokenize)
X_test_ftfy_spacy.head()

100%|██████████| 706/706 [00:25<00:00, 27.50it/s]
 92%|█████████▏| 647/706 [00:25<00:01, 34.18it/s]
100%|██████████| 706/706 [00:25<00:00, 27.23it/s]
100%|██████████| 705/705 [00:26<00:00, 26.41it/s]
100%|██████████| 706/706 [00:26<00:00, 26.32it/s]
100%|██████████| 706/706 [00:27<00:00, 26.08it/s]
100%|██████████| 706/706 [00:27<00:00, 26.06it/s]
100%|██████████| 706/706 [00:27<00:00, 25.96it/s]
100%|██████████| 706/706 [01:33<00:00,  7.52it/s]
 96%|█████████▌| 678/706 [01:34<00:05,  4.75it/s]
100%|██████████| 706/706 [01:36<00:00,  7.30it/s]
100%|██████████| 706/706 [01:36<00:00,  7.29it/s]
100%|██████████| 706/706 [01:37<00:00,  7.26it/s]
100%|██████████| 706/706 [01:37<00:00,  7.25it/s]
100%|██████████| 705/705 [01:38<00:00,  7.16it/s]
100%|██████████| 706/706 [01:38<00:00, 32.95it/s]


Unnamed: 0,clean_title,clean_text
0,Amazon_PROPN CEO_PROPN Jeff_PROPN Bezos_PROPN is_VERB now_ADV the_DET second_ADJ richest_ADJ man_NOUN in_ADP the_DET world_NOUN,More_ADJ Try_VERB Yahoo_PROPN Finance_PROPN on_ADP Firefox_PROPN _SPACE »_PROPN Amazon_PROPN _SPACE CEO_PROPN Jeff_PROPN Bezos_PROPN is_VERB now_ADV the_DET second_ADJ richest_ADJ man_NOUN in_AD...
1,Does_VERB Laura_PROPN Dern_PROPN Handle_PROPN a_DET Lightsaber_PROPN in_ADP the_DET New_PROPN '_PART Star_PROPN Wars_PROPN '_PUNCT ?_PUNCT [_PUNCT Video_NOUN ]_PUNCT,More_ADJ Laura_PROPN Dern_PROPN seems_VERB to_PART be_VERB everywhere_ADV these_DET days_NOUN ._PUNCT That_DET 's_VERB because_ADP she_PRON is_VERB ._PUNCT She_PRON 's_VERB the_DET ferocious_ADJ R...
2,"In_ADP this_DET photographer_NOUN 's_PART home_NOUN town_NOUN ,_PUNCT stepping_VERB out_ADP of_ADP the_DET house_NOUN is_VERB a_DET risk_NOUN","Kirkuk_PROPN is_VERB a_DET city_NOUN of_ADP Northern_ADJ Iraq_PROPN in_ADP the_DET Kurdish_ADJ region_NOUN of_ADP the_DET country_NOUN ._PUNCT Arabs_PROPN ,_PUNCT Kurds_PROPN ,_PUNCT Turkmen_PROPN..."
3,"8_NUM Ways_PROPN To_PART Get_VERB Your_ADJ Spouse_NOUN To_PART Open_VERB Up_PART More_ADV ,_PUNCT According_VERB To_ADP Therapists_NOUN",Experts_NOUN say_VERB that_ADP communication_NOUN is_VERB the_DET cornerstone_NOUN of_ADP a_DET good_ADJ relationship_NOUN ._PUNCT That_DET 's_VERB why_ADV it_PRON can_VERB be_VERB deeply_ADV trou...
4,US_PROPN says_VERB claim_VERB it_PRON supported_VERB IS_VERB in_ADP Syria_PROPN is_VERB '_PUNCT ludicrous_ADJ '_PUNCT,Share_VERB this_DET with_ADP Email_PROPN Facebook_PROPN Messenger_PROPN Messenger_PROPN Twitter_PROPN Pinterest_PROPN WhatsApp_NOUN LinkedIn_PROPN Copy_VERB this_DET link_NOUN The_DET US_PROPN has...


**Save/Load Clean Data**

In [35]:
with open(DATA_PATH/'X_train_ftfy_spacy.pkl', 'wb') as  f:
    pickle.dump(X_train_ftfy_spacy, f)

with open(DATA_PATH/'X_valid_ftfy_spacy.pkl', 'wb') as  f:
    pickle.dump(X_valid_ftfy_spacy, f)
    
with open(DATA_PATH/'X_test_ftfy_spacy.pkl', 'wb') as  f:
    pickle.dump(X_test_ftfy_spacy, f)

In [24]:
((X_train_ftfy_spacy['clean_title'] != X_train_ftfy['clean_title']).sum(),
 (X_train_ftfy_spacy['clean_text'] != X_train_ftfy['clean_text']).sum(),
 (X_valid_ftfy_spacy['clean_title'] != X_valid_ftfy['clean_title']).sum(),
 (X_valid_ftfy_spacy['clean_text'] != X_valid_ftfy['clean_text']).sum(),
 (X_test_ftfy_spacy['clean_title'] != X_test_ftfy['clean_title']).sum(),
 (X_test_ftfy_spacy['clean_text'] != X_test_ftfy['clean_text']).sum())

(14901, 24589, 2078, 3506, 4202, 5598)

**Tokenize Data (nltk)**

In [238]:
tokenizer = TweetTokenizer(reduce_len=True)
tokenizer

<nltk.tokenize.casual.TweetTokenizer at 0x7f0dac2225c0>

In [239]:
def parallel_tokenize(s):
    return s.progress_apply(tokenize)

def tokenize(doc):
    return ' '.join(tokenizer.tokenize(doc))

In [240]:
tweet_ftfy_train_df = parallelize(ftfy_train_df, parallel_tokenize)
tweet_ftfy_train_df.head()

0              This is so cool . It's like , ' would you want your mother to read this ? ? ' Really great idea , well done !
1    Thank you ! ! This would make my life a lot less anxiety-inducing . Keep it up , and don't let anyone get in your way !
2                                  This is such an urgent design problem ; kudos to you for taking it on . Very impressive !
3                                     Is this something I'll be able to install on my site ? When will you be releasing it ?
4                                                                                      haha you guys are a bunch of losers .
Name: comment_text, dtype: object

In [241]:
tweet_ftfy_test_df = parallelize(ftfy_test_df, parallel_tokenize)
tweet_ftfy_test_df.head()

0                                              Jeff Sessions is another one of Trump's Orwellian choices . He believes and has believed his entire career the exact opposite of what the position requires .
1    I actually inspected the infrastructure on Grand Chief Stewart Philip's home Penticton First Nation in both 2010 and 2013 . Exactly Zero projects that had been identified in previous inspection re...
2    No it won't . That's just wishful thinking on democrats fault . For the 100 th time , Walker cited the cost of drug users treatment as being lost with Obamacare . I laugh every time I hear a liber...
3    Instead of wringing our hands and nibbling the periphery of the issue , how about we face the actual issue head on ? I would support a city ordinance against loitering , and applaud city councilor...
4    how many of you commenters have garbage piled high in your yard , bald tires , dead batteries , rotten pallets , car parts , blah blah blah . this town is a pigpen . drive aro

**Save/Load Clean Data**

In [243]:
with open(DATA_PATH/'tweet_ftfy_nfkc_train.pkl', 'wb') as  f:
    pickle.dump(tweet_ftfy_train_df, f)

with open(DATA_PATH/'tweet_ftfy_nfkc_test.pkl', 'wb') as  f:
    pickle.dump(tweet_ftfy_test_df, f)

In [244]:
with open(DATA_PATH/'tweet_ftfy_nfkc_train.pkl', 'rb') as  f:
    tweet_ftfy_train_df = pickle.load(f)

with open(DATA_PATH/'tweet_ftfy_nfkc_test.pkl', 'rb') as  f:
    tweet_ftfy_test_df = pickle.load(f)

In [247]:
check_df = tweet_ftfy_train_df.str.split()
check_df.head()

0              [This, is, so, cool, ., It's, like, ,, ', would, you, want, your, mother, to, read, this, ?, ?, ', Really, great, idea, ,, well, done, !]
1    [Thank, you, !, !, This, would, make, my, life, a, lot, less, anxiety-inducing, ., Keep, it, up, ,, and, don't, let, anyone, get, in, your, way, !]
2                                          [This, is, such, an, urgent, design, problem, ;, kudos, to, you, for, taking, it, on, ., Very, impressive, !]
3                                             [Is, this, something, I'll, be, able, to, install, on, my, site, ?, When, will, you, be, releasing, it, ?]
4                                                                                                        [haha, you, guys, are, a, bunch, of, losers, .]
Name: comment_text, dtype: object

**Keyword Extraction**

In [224]:
text_content = " Jeff Sessions is another one of Trump 's Orwellian choices . He believes and has believed his entire career the exact opposite of what the position requires ."
simple_kwextractor = yake.KeywordExtractor(n=2)
keywords = simple_kwextractor.extract_keywords(text_content)

In [225]:
keywords

[('jeff sessions', 0.008293207122436794),
 ('orwellian choices', 0.008293207122436794),
 ('position requires', 0.015380821171891606),
 ('entire career', 0.02570861714399338),
 ('exact opposite', 0.02570861714399338),
 ('trump', 0.057012387690331526),
 ('sessions', 0.08596317751626563),
 ('orwellian', 0.08596317751626563),
 ('jeff', 0.09568045026443411),
 ('choices', 0.09568045026443411),
 ('requires', 0.09568045026443411),
 ('believed', 0.15831692877998726),
 ('entire', 0.15831692877998726),
 ('career', 0.15831692877998726),
 ('exact', 0.15831692877998726),
 ('opposite', 0.15831692877998726),
 ('position', 0.15831692877998726)]