# Preprocess using gensim simple_preprocess

Generate 4 files from news2.7m:
- news2.7m-gensim-titles.csv
- news2.7m-gensim-articles.csv
- news2.7m-gensim-autsecpub.csv
- news2.7m-gensim-metadata.csv

Process:
1. Drop bad row (1)
2. Drop null titles (37)
3. We will have 2688841 rows
4. Remove duplicate titles, tokenize titles and save csv
5. Remove duplicate articles and drop null articles; we will have 2475518 rows
6. Tokenize articles and save csv
7. Remove duplicated urls, tokenize `author`, `section` and `publication`, and save autsecpub csv
8. Redo 1 and 2, drop `title` and `article` columns
9. Augment with time columns and save metadata csv

In [1]:
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
import sys
sys.path.append('../')
from utils.tomo import *
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_colwidth', 140)

## Read source

In [2]:
# low_memory=False parses the whole file in one pass instead of in chunks,
# so each column gets a single inferred dtype (at the cost of more memory).
df = pd.read_csv('all-the-news-2-1.csv', low_memory=False)

In [3]:
len(df)

2688879

In [4]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'date', 'year', 'month', 'day', 'author',
       'title', 'article', 'url', 'section', 'publication'],
      dtype='object')

In [5]:
# Check whether 'Unnamed: 0' is unique per row (any count above 1 means it is not).
df['Unnamed: 0'].value_counts()

2424619    12577
0              1
1784206        1
1784199        1
1784200        1
           ...  
892101         1
892102         1
892103         1
892104         1
2790033        1
Name: Unnamed: 0, Length: 2676303, dtype: int64

In [6]:
# Check whether 'Unnamed: 0.1' is unique per row (all counts equal to 1 means it is).
df['Unnamed: 0.1'].value_counts()

0          1
1792580    1
1792582    1
1792583    1
1792584    1
          ..
896293     1
896294     1
896295     1
896296     1
2789273    1
Name: Unnamed: 0.1, Length: 2688879, dtype: int64

## Drop unused column

In [7]:
# 'Unnamed: 0' is not used downstream; remove the column from the frame in place.
df.drop(columns='Unnamed: 0', inplace=True)

In [8]:
# 'Unnamed: 0.1' has one distinct value per row (see its value_counts above),
# so it becomes the row identifier `id`.
df.rename(columns={'Unnamed: 0.1': 'id'}, inplace=True)

## Make an id column

In [9]:
# Flag rows whose id cannot be parsed as a number: errors='coerce' turns
# unparseable values into NaN, which isnull() then reports as True.
df['nan'] = pd.to_numeric(df.id, errors='coerce').isnull()

In [10]:
# Show the rows flagged as having a non-numeric id.
df[df['nan']]

Unnamed: 0,id,date,year,month,day,author,title,article,url,section,publication,nan


In [11]:
# Remove, in place, every row whose id failed numeric parsing.
df.drop(index=df.index[df['nan']], inplace=True)

In [12]:
len(df)

2688879

In [13]:
# The helper flag column has served its purpose; delete it from the frame.
del df['nan']

## Drop the row with a null date

In [14]:
# Inspect the rows that have no date.
df.loc[df['date'].isna()]

Unnamed: 0,id,date,year,month,day,author,title,article,url,section,publication
2324811,2424832,,https://www.washingtonpost.com/outlook/tale-of-a-woman-who-died-and-a-woman-who-killed-in-the-northern-ireland-conflict/2019/03/08/59e75...,,Washington Post,,,,,,


In [15]:
# Remove, in place, the rows that have no date.
df.drop(index=df.index[df['date'].isna()], inplace=True)

In [16]:
len(df)

2688878

## How many nulls?

In [17]:
# Count rows per combination of missing (True) / present (False) columns.
df.isnull().value_counts()

id     date   year   month  day    author  title  article  url    section  publication
False  False  False  False  False  False   False  False    False  False    False          946517
                                   True    False  False    False  False    False          806893
                                   False   False  False    False  True     False          713027
                                   True    False  False    False  True     False          117712
                                                  True     False  True     False           64961
                                                                  False    False           18923
                                                           True   True     True            12577
                                   False   False  True     False  False    False            4267
                                                                  True     False            3964
                                   True 

## Drop null titles

In [18]:
# Number of rows without a title.
int(df['title'].isna().sum())

37

In [19]:
# Remove, in place, the rows that have no title.
df.drop(index=df.index[df['title'].isna()], inplace=True)

In [20]:
len(df)

2688841

In [21]:
# Re-check the missing-value patterns now that null titles are gone.
df.isnull().value_counts()

id     date   year   month  day    author  title  article  url    section  publication
False  False  False  False  False  False   False  False    False  False    False          946517
                                   True    False  False    False  False    False          806893
                                   False   False  False    False  True     False          713027
                                   True    False  False    False  True     False          117712
                                                  True     False  True     False           64961
                                                                  False    False           18923
                                                           True   True     True            12577
                                   False   False  True     False  False    False            4267
                                                                  True     False            3964
dtype: int64

## Test gensim preprocess with 5 rows

In [22]:
# Dry run of the cleaning pipeline on the first five titles:
# strip_tags removes markup, simple_preprocess lowercases and tokenises
# (deacc=True also strips accent marks), and the tokens are re-joined with spaces.
docs = df.title.head().tolist()
[ ' '.join(simple_preprocess(strip_tags(doc), deacc=True)) for doc in docs]

['we should take concerns about the health of liberal democracy seriously',
 'colts gm ryan grigson says andrew luck contract makes it difficult to build the team',
 'trump denies report he ordered mueller fired',
 'france sarkozy reveals his passions but insists no come back on cards',
 'paris hilton woman in black for uncle monty funeral']

## Prepare titles csv with only 3 columns [`id`, `title_clean`, `num_words_title`]

### Remove duplicate titles

In [62]:
# Keep one representative of each repeated title: `duplicated(keep=False)` flags
# every copy, and `~duplicated()` keeps only the first occurrence among them.
dups = df[~df.title.duplicated() & df.title.duplicated(keep=False)]
len(dups)

138957

In [64]:
# Rows whose title occurs exactly once in the frame.
nondups = df[~df.title.duplicated(keep=False)]
len(nondups)

2342305

In [66]:
# One row per distinct title (first occurrences of repeats, then the singletons),
# restricted to the two columns the titles file is built from.
titles = pd.concat([dups[['id', 'title']], nondups[['id', 'title']]])

In [67]:
titles.title.value_counts()

France's Sarkozy reveals his 'Passions' but insists no come-back on cards                        1
Trump closing strong | TheHill                                                                   1
Poll: Half think Clinton lying about health | TheHill                                            1
Trump campaign left out of Alaska voter guide | TheHill                                          1
Pope Francis says he won't try to alter Trump's stances on environment, immigration | TheHill    1
                                                                                                ..
RBS may use its surplus capital to pay one-off dividend -the Times                               1
Katy Perry Has More to Say About Taylor Swift                                                    1
Dancing with the Stars Finale: All About Ginger Zee, Paige VanZant, Nyle DiMarco                 1
Jacob Tremblay spent a day at work with his hot detective dad                                    1
Ruth Bader

### Make `title_clean`

In [69]:
# Normalise every title: strip markup, lowercase and tokenise with accents removed,
# then re-join the tokens with single spaces.
titles['title_clean'] = titles.title.map(
    lambda doc: ' '.join(simple_preprocess(strip_tags(doc), deacc=True))
)

In [70]:
titles.head(10)

Unnamed: 0,id,title,title_clean
3,3,France's Sarkozy reveals his 'Passions' but insists no come-back on cards,france sarkozy reveals his passions but insists no come back on cards
11,11,Hudson's Bay's chairman's buyout bid pits retail versus real estate,hudson bay chairman buyout bid pits retail versus real estate
14,14,UK PM May presses on with bid to get Brexit deal through parliament: spokesman,uk pm may presses on with bid to get brexit deal through parliament spokesman
20,20,Trump warned NATO allies U.S. would go it alone if they did not spend: sources,trump warned nato allies would go it alone if they did not spend sources
22,22,Wells Fargo to pay $575 million in settlement with U.S. states,wells fargo to pay million in settlement with states
27,27,Exclusive: Britain's financial heartland unbowed as Brexit risks deepen,exclusive britain financial heartland unbowed as brexit risks deepen
33,33,"U.S. June sales a mixed bag for automakers; SUVs, trucks still strong",june sales mixed bag for automakers suvs trucks still strong
37,37,FCC probes whether Sinclair misled agency during failed Tribune deal,fcc probes whether sinclair misled agency during failed tribune deal
48,48,"IEA concerned about Middle East tensions, stands ready to act",iea concerned about middle east tensions stands ready to act
52,52,U.S. lawmakers ask for disclosure of number of Americans under surveillance,lawmakers ask for disclosure of number of americans under surveillance


### Drop original `title` column

In [72]:
# Only the cleaned text is exported; remove the raw title column in place.
titles.drop(columns=['title'], inplace=True)

In [73]:
titles.head(10)

Unnamed: 0,id,title_clean
3,3,france sarkozy reveals his passions but insists no come back on cards
11,11,hudson bay chairman buyout bid pits retail versus real estate
14,14,uk pm may presses on with bid to get brexit deal through parliament spokesman
20,20,trump warned nato allies would go it alone if they did not spend sources
22,22,wells fargo to pay million in settlement with states
27,27,exclusive britain financial heartland unbowed as brexit risks deepen
33,33,june sales mixed bag for automakers suvs trucks still strong
37,37,fcc probes whether sinclair misled agency during failed tribune deal
48,48,iea concerned about middle east tensions stands ready to act
52,52,lawmakers ask for disclosure of number of americans under surveillance


### Add `num_words_title`

In [102]:
# Count whitespace-separated tokens per cleaned title.
# `str.split()` with no separator yields an empty list for an empty string, so a
# title whose tokens were all removed during cleaning counts as 0 words
# (splitting on ' ' would report 1). Vectorised, so no row-wise `apply` is needed.
titles['num_words_title'] = titles.title_clean.str.split().str.len()

In [103]:
titles.head()

Unnamed: 0,id,title_clean,num_words_title
3,3,france sarkozy reveals his passions but insists no come back on cards,12
11,11,hudson bay chairman buyout bid pits retail versus real estate,10
14,14,uk pm may presses on with bid to get brexit deal through parliament spokesman,14
20,20,trump warned nato allies would go it alone if they did not spend sources,14
22,22,wells fargo to pay million in settlement with states,9


### Save to file

In [104]:
# index=False omits the DataFrame index; the `id` column already identifies each row.
titles.to_csv('news2.7m-gensim-titles.csv', index=False)

## Prepare articles csv with only 3 columns [`id`, `article_clean`, `num_words_article`]

### Remove duplicate articles

In [22]:
# Keep the first occurrence of every article text that appears more than once.
# pandas treats missing values as equal to each other in `duplicated`, so rows
# with a null article collapse to a single representative here as well.
dups = df[~df.article.duplicated() & df.article.duplicated(keep=False)]
len(dups)

90235

In [23]:
# Rows whose article text occurs exactly once in the frame.
nondups = df[~df.article.duplicated(keep=False)]
len(nondups)

2385284

In [24]:
# One row per distinct article (first occurrences of repeats, then the singletons),
# restricted to the two columns the articles file is built from.
articles = pd.concat([dups[['id', 'article']], nondups[['id', 'article']]])

In [25]:
# Highest occurrence count of any article text; 1 means no duplicates remain.
# value_counts() sorts descending, and .iloc[0] takes the first entry by position
# (plain [0] on a text-labelled index relies on deprecated positional fallback).
articles.article.value_counts().iloc[0]

1

In [26]:
len(articles)

2475519

### Drop null article

In [28]:
# List rows whose `article` is not a Python str (e.g. NaN for a missing article).
# Mapping over the single column avoids building a row object for every record,
# which a row-wise `apply(axis=1)` would do.
articles[articles.article.map(lambda value: type(value) is not str)]

Unnamed: 0,id,article
471,471,


In [27]:
# Remove rows with a missing article in place, then report how many rows remain.
articles.drop(index=articles.index[articles.article.isna()], inplace=True)
len(articles)

2475518

### Make `article_clean`

In [28]:
# Free the two intermediate frames from the article de-duplication; they are no
# longer needed once `articles` exists.
# `df` is deliberately kept alive: the autsecpub and metadata sections further
# down still read from it, so deleting it here would make a top-to-bottom run
# fail with NameError.
import gc
del dups, nondups
gc.collect()

0

In [34]:
# Normalise every article body: strip markup, lowercase and tokenise with accents
# removed, then re-join the tokens with single spaces.
articles['article_clean'] = articles.article.map(
    lambda doc: ' '.join(simple_preprocess(strip_tags(doc), deacc=True))
)

### Drop original `article` column

In [35]:
# Only the cleaned text is exported; rebind `articles` without the raw column.
articles = articles.drop(columns=['article'])

In [36]:
articles.head(10)

Unnamed: 0,id,article_clean
3,3,paris reuters former french president nicolas sarkozy published new memoir on thursday but was quick to dismiss speculation he might ret...
11,11,reuters the success of hudson bay co executive chairman richard baker billion bid to take the department store operator private hinges o...
33,33,reuters major automakers on tuesday posted mixed sales results for june and the second quarter with demand still fairly strong for suvs ...
37,37,washington reuters shares in sinclair broadcast group inc fell by after the federal communications commission disclosed it has opened an...
48,48,dublin reuters the international energy agency iea is very concerned about the impact that tensions in the middle east may have on globa...
55,55,winnipeg manitoba calgary alberta june reuters ears about oil spills into the great lakes from two aging pipelines have flared raising d...
56,56,reuters wpp wpp is in exclusive talks to sell majority stake in its data analytics unit kantar to private equity firm bain capital it sa...
59,59,budapest reuters hungary has no evidence that equipment from chinese telecoms giant huawei poses security threat government minister sai...
67,67,warsaw reuters an examiner was run over and killed by year old woman taking her driving test in the southern polish city of rybnik on mo...
73,73,brussels reuters nato and russia did not make significant progress on saving the intermediate range nuclear forces treaty inf in talks a...


### Add `num_words_article`

In [37]:
# Count whitespace-separated tokens per cleaned article.
# `str.split()` with no separator yields an empty list for an empty string, so an
# article whose tokens were all removed during cleaning counts as 0 words
# (splitting on ' ' would report 1). Vectorised, so no row-wise `apply` is needed.
articles['num_words_article'] = articles.article_clean.str.split().str.len()

In [38]:
articles.head(10)

Unnamed: 0,id,article_clean,num_words_article
3,3,paris reuters former french president nicolas sarkozy published new memoir on thursday but was quick to dismiss speculation he might ret...,364
11,11,reuters the success of hudson bay co executive chairman richard baker billion bid to take the department store operator private hinges o...,827
33,33,reuters major automakers on tuesday posted mixed sales results for june and the second quarter with demand still fairly strong for suvs ...,563
37,37,washington reuters shares in sinclair broadcast group inc fell by after the federal communications commission disclosed it has opened an...,428
48,48,dublin reuters the international energy agency iea is very concerned about the impact that tensions in the middle east may have on globa...,322
55,55,winnipeg manitoba calgary alberta june reuters ears about oil spills into the great lakes from two aging pipelines have flared raising d...,616
56,56,reuters wpp wpp is in exclusive talks to sell majority stake in its data analytics unit kantar to private equity firm bain capital it sa...,543
59,59,budapest reuters hungary has no evidence that equipment from chinese telecoms giant huawei poses security threat government minister sai...,567
67,67,warsaw reuters an examiner was run over and killed by year old woman taking her driving test in the southern polish city of rybnik on mo...,173
73,73,brussels reuters nato and russia did not make significant progress on saving the intermediate range nuclear forces treaty inf in talks a...,239


### Save to file

In [39]:
# index=False omits the DataFrame index; the `id` column already identifies each row.
articles.to_csv('news2.7m-gensim-articles.csv', index=False)

## Prepare autsecpub csv with 4 columns

### Remove duplicated urls

In [26]:
# Keep the first occurrence of every url that appears more than once.
# pandas treats missing values as equal to each other in `duplicated`, so rows
# with a null url collapse to a single representative here as well.
dups = df[~df.url.duplicated() & df.url.duplicated(keep=False)]
len(dups)

6

In [27]:
# Rows whose url occurs exactly once in the frame.
nondups = df[~df.url.duplicated(keep=False)]
len(nondups)

2676254

In [28]:
# One row per distinct url (first occurrences of repeats, then the singletons),
# restricted to the id plus the three descriptive columns cleaned below.
autsecpub = pd.concat([dups, nondups]).loc[:, ['id', 'author', 'section', 'publication']]

In [29]:
len(autsecpub)

2676260

### Make `author_clean`, `section_clean` and `publication_clean` columns

In [32]:
# Replace missing values with '' so every author/section/publication entry is a
# string before it is passed to the text-cleaning step below.
autsecpub = autsecpub.fillna('')

In [33]:
# Normalise author names with the same strip-tags / tokenise / re-join pipeline.
autsecpub['author_clean'] = autsecpub.author.map(
    lambda doc: ' '.join(simple_preprocess(strip_tags(doc), deacc=True))
)

In [34]:
# Normalise section names with the same strip-tags / tokenise / re-join pipeline.
autsecpub['section_clean'] = autsecpub.section.map(
    lambda doc: ' '.join(simple_preprocess(strip_tags(doc), deacc=True))
)

In [45]:
# Normalise publication names with the same strip-tags / tokenise / re-join pipeline.
autsecpub['publication_clean'] = autsecpub.publication.map(
    lambda doc: ' '.join(simple_preprocess(strip_tags(doc), deacc=True))
)

### Drop the original `author`, `section` and `publication` columns

In [35]:
# Only the cleaned columns are exported; remove the raw ones in place.
autsecpub.drop(columns=['author', 'section', 'publication'], inplace=True)

In [47]:
len(autsecpub)

2676260

In [48]:
autsecpub.head()

Unnamed: 0,id,author_clean,section_clean,publication_clean
4811,4811,,,tmz
49986,49986,nacho doce,world news,reuters
51181,51181,,politics,reuters
62583,62583,,,tmz
848277,848277,gizmodo staff,,gizmodo


### Save to file

In [49]:
# index=False omits the DataFrame index; the `id` column already identifies each row.
autsecpub.to_csv('news2.7m-gensim-autsecpub.csv', index=False)

## Prepare metadata csv with time columns

### Drop columns that were processed earlier

In [39]:
df.columns

Index(['id', 'date', 'year', 'month', 'day', 'author', 'title', 'article',
       'url', 'section', 'publication'],
      dtype='object')

In [50]:
# These text columns were exported to their own files above; the metadata file
# keeps only the id, date parts and url.
df.drop(columns=['author', 'title', 'article', 'section', 'publication'], inplace=True)

In [51]:
len(df)

2688841

In [52]:
df.head()

Unnamed: 0,id,date,year,month,day,url
0,0,2016-12-09 18:31:00,2016,12.0,9,https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs
1,1,2016-10-07 21:26:46,2016,10.0,7,https://www.businessinsider.com/colts-gm-ryan-grigson-andrew-luck-contract-2016-10
2,2,2018-01-26 00:00:00,2018,1.0,26,https://www.reuters.com/article/us-davos-meeting-trump-mueller/trump-denies-report-he-ordered-mueller-fired-idUSKBN1FF12A
3,3,2019-06-27 00:00:00,2019,6.0,27,https://www.reuters.com/article/france-politics-sarkozy/frances-sarkozy-reveals-his-passions-but-insists-no-come-back-on-cards-idUSL8N23...
4,4,2016-01-27 00:00:00,2016,1.0,27,https://www.tmz.com/2016/01/27/paris-hilton-monty-brinson-funeral/


In [53]:
# augment_with_time_columns is not defined in this notebook; presumably it comes
# from the star import of utils.tomo — verify there. Per the output below it adds
# dayofweek, isweekend, weeknum, season, yearseason, quarter, yearquarter and yearmonth.
df = augment_with_time_columns(df)

In [54]:
df.head()

Unnamed: 0,id,date,year,month,day,url,dayofweek,isweekend,weeknum,season,yearseason,quarter,yearquarter,yearmonth
0,0,2016-12-09 18:31:00,2016,12,9,https://www.vox.com/polyarchy/2016/12/9/13898340/democracy-warning-signs,5,False,49,fall,2016-2-fall,4,2016-4,2016-12
1,1,2016-10-07 21:26:46,2016,10,7,https://www.businessinsider.com/colts-gm-ryan-grigson-andrew-luck-contract-2016-10,5,False,40,fall,2016-2-fall,4,2016-4,2016-10
2,2,2018-01-26 00:00:00,2018,1,26,https://www.reuters.com/article/us-davos-meeting-trump-mueller/trump-denies-report-he-ordered-mueller-fired-idUSKBN1FF12A,5,False,3,winter,2017-3-winter,1,2018-1,2018-01
3,3,2019-06-27 00:00:00,2019,6,27,https://www.reuters.com/article/france-politics-sarkozy/frances-sarkozy-reveals-his-passions-but-insists-no-come-back-on-cards-idUSL8N23...,4,False,25,summer,2019-1-summer,2,2019-2,2019-06
4,4,2016-01-27 00:00:00,2016,1,27,https://www.tmz.com/2016/01/27/paris-hilton-monty-brinson-funeral/,3,False,4,winter,2015-3-winter,1,2016-1,2016-01


In [55]:
# index=False omits the DataFrame index; the `id` column already identifies each row.
df.to_csv('news2.7m-gensim-metadata.csv', index=False)