# Cleaning Raw Text Data for Extracting SVO and Word2Vec



## Step1. Merging the file

In [1]:
import pandas as pd
import numpy as np
import os
dat_path = '/home/junhyuki/DLproject/DAT'
year = 2012

print('merging ' + str(year) + ' news articles')
os.chdir(dat_path + '/daily_news/{}'.format(year))
file_list = sorted(os.listdir())
# merging as a one dataframe
usecols = ['connected_url', 'content', 'firstline', 'keywords', 'timestamp', 'title']
df = pd.read_csv(file_list[0], usecols = usecols)
for file in file_list[1:]:
    tmp = pd.read_csv(file)
    df = df.append(tmp, ignore_index=True)
print(str(year) + ' : ' + str(df.shape))

merging 2012 news articles
2012 : (316301, 6)


In [2]:
df.head()

Unnamed: 0,connected_url,content,firstline,keywords,timestamp,title
0,http://www.reuters.com/article/2012/01/02/us-f...,"OAKLAND, Calif (Reuters) - A football fan fell...","OAKLAND, Calif (Reuters) - A football fan fell...",Juan Salceto;Stephen Glatstein;US;FOOTBALL;ACC...,20120101,Oakland Raiders fan falls from bleachers after...
1,http://www.reuters.com/article/2012/01/02/us-n...,NEW YORK (Reuters) - St. Louis Blues defensema...,NEW YORK (Reuters) - St. Louis Blues defensema...,Ian Cole;Justin Abdelkader;US;NHL;BLUES;COLE;N...,20120101,Blues' Cole suspended for three games
2,http://www.reuters.com/article/2012/01/02/us-n...,(Reuters) - Sergei Kostitsyn completed a hat-t...,(Reuters) - Sergei Kostitsyn completed a hat-t...,Barry Trotz;Brendan Morrison;Jarome Iginla;Kev...,20120101,Kostitsyn's hat-trick helps Predators douse Fl...
3,http://www.reuters.com/article/2012/01/02/us-n...,(Reuters) - The slumping Denver Broncos won th...,(Reuters) - The slumping Denver Broncos won th...,Mark Leffingwell;Tim Tebow;US;NFL;BRONCOS;Spor...,20120101,"Broncos clinch AFC West, snap playoff drought"
4,http://www.reuters.com/article/2012/01/02/us-c...,HAVANA (Reuters) - Pope Benedict XVI will visi...,HAVANA (Reuters) - Pope Benedict XVI will visi...,Cuba;Mexico;Benedict XVI;Fidel Castro;Raul Cas...,20120101,Pope Benedict XVI to visit Cuba March 26-28


## Step2. Data Preprocessing

### 1) Delete junk words not explaining the titles

there are many junk words in the titles.
Below are what we found by random sampling. 

- REFILE-UPDATE 2-U.S. seeks enhanced financial authority for Fed
- Mariah Carey's "E=MC2" offers genre-crossing equation
- Cell Therapeutics, Inc. Announces Filing of Form 10-K
- Northwest gives Delta merger nod: source
- Chefs warn on side-effects of sushi boom
- UK seaside town lifts 44-year Rolling Stones ban
- Sniper fire and Middle East confusion: Bernd Debusmann
- Sempra Calif. Palomar natgas unit off-line planned
- Ex-NY Gov. Spitzer dodged DA probe by resigning
- Metro-Goldwyn agrees entertainment projects in UAE
- Nokia talks to major labels about music service: report
- U.S. states urge FCC constraints on XM-Sirius deal
- FACTBOX: Balkan candidates offer NATO leaner military muscle
- Atkins (WS) - Transaction in Own Shares
- Third point Offshore - Net Asset Value(s)
- Photo Release -- Michael Zemba Named President of U-Haul Company of Harrisburg
- REG-TOYOTA MOTOR CREDIT CORPORATION Full Redemption
- Cello Group plc - Additional listing / TVR
- Nokia talks to major labels about music service: report
- REG-Invesco Eng.&Intnl;: Net Asset Value(s) 
- RPT-China opens iron ore market to the world in pricing, image push
- TEXT-Wesfarmers says chairman to stand down
- PRESS DIGEST - Malaysia - July 3
- UPDATE 1-Maritime Capital bins 300 mln Singapore IPO plan
- PREVIEW-G8 could see climate deal but substance in doubt


...

In [3]:
import re
def dash_preprocess(title):    
    """ if there is '-' in sentences,
    i) no white space between two words: meaningful
    ii) white space between two words: useless
    iii) upper case all for first word: useless
    """
    # ii) white space btw two words
    if re.compile('\s+-+\s+').findall(title) != []:
        p = re.compile('\s+-+\s+')
        m = p.search(title)
        try:
            left_side = title[:m.span()[0]]
            right_side = title[m.span()[0]+3:]
                
            if len(left_side) <= len(right_side):
                title = right_side
            else:
                title = left_side
        except:
            pass
    elif re.compile('[-]').findall(title) != []:
        for i in range(3):
            p = re.compile('[-]')
            m = p.search(title)
            try:
                left_side = title[:m.span()[0]]
                right_side = title[m.span()[0]+1:]     
                if len(left_side) <= len(right_side):
                    if left_side.split()[0].isupper():
                        title = right_side
                    else:
                        pass
                else:
                    if right_side.split()[0].isupper():
                        title = left_side
                    else:
                        title
            except:
                pass
    else:
        pass
    return title.strip()

def colon_preprocess(title):
    """ always delete the shorter side"""
    if re.compile('[:]').findall(title) != []:
        p = re.compile('[:]')
        m = p.finditer(title)
        for position in m:
            left_side = title[:position.span()[0]]
            right_side = title[position.span()[0]+1:]
            
            if len(left_side) <= len(right_side):
                title = right_side
            else:
                title = left_side
    else:
        pass
    return title.strip()

In [4]:
df['clean_title'] = df['title'].apply(lambda x: dash_preprocess(x))
df['clean_title'] = df['clean_title'].apply(lambda x: colon_preprocess(x))

In [5]:
idx = np.random.choice(len(df), 10)
df[['title', 'clean_title']].iloc[idx]

Unnamed: 0,title,clean_title
162324,"Russia Globaltrans raises $520 mln, cites high...","Russia Globaltrans raises $520 mln, cites high..."
114591,Yahoo shares climb on report Alibaba deal near,Yahoo shares climb on report Alibaba deal near
178589,Ticket scandal overshadows Games opening,Ticket scandal overshadows Games opening
166882,BlackRock to address ETF market share losses,BlackRock to address ETF market share losses
301911,Top free-agent pitcher Greinke signs with Dodgers,Top free-agent pitcher Greinke signs with Dodgers
234385,Exclusive: Legg Mason hires Korn/Ferry to run ...,Legg Mason hires Korn/Ferry to run CEO search
571,Same-store sales seen up 3.4 percent in December,Same-store sales seen up 3.4 percent in December
146924,UPDATE 1-Omeros announces public offering of c...,Omeros announces public offering of common stock
137791,UPDATE 2-World on red alert for Greek vote,World on red alert for Greek vote
209218,"Albright, Suu Kyi sons among 2,000 removed fro...","Albright, Suu Kyi sons among 2,000 removed fro..."


### 2) Delete junk words in firstline and contents

In the main contents of articles, the format usually follow below...

http://www.reuters.com/article/2012/01/03/us-samsung-plant-idUSTRE8021X520120103
- **SEOUL, Jan 4 (Reuters) -** South Korea’s LG Electronics Inc aims to grow revenue from its air conditioning business by more than 10 percent this year, Nho Hwanyong, the head of the business, said on Wednesday. LG, the world’s top air conditioning maker, said it planned to focus on emerging markets as weak economies and housing markets in Europe and the United States were likely to sap demand there. Nho’s remarks were made at a media event to introduce new models for 2012. **(Reporting by Miyoung Kim; Editing by Jonathan Hopfner)Our Standards:The Thomson Reuters Trust Principles.**
<br> 

http://www.reuters.com/article/2012/01/04/hongkongpress-idUSL3E8C40D420120104
- **HONG KONG, Jan 4 (Reuters) -** These are some of the leading stories in Hong Kong newspapers on Wednesday. Reuters has not verified these stories and does not vouch for their accuracy. SOUTH CHINA MORNING POST — Hong Kong’s Chief Executive Donald Tsang said in a commercial radio interview on Tuesday that his government will not lower the tax rate on corporate profits, despite a double-digit rise in tax revenue last year. — A residential and commercial site in Tseung Kwan O, New Territories, is open for bidding until Jan. 6, and surveyors estimate it will sell for up to HK\$1.85 billion (\$238.15 million) in the tender. The site can yield 488,180 square feet of gross floor space. HONG KONG ECONOMIC TIMES — New World Development Co Ltd Chairman Cheng Yu-tung raised his stake in Silver Base Group Holdings , an operator of alcoholic beverages in the mainland, on Dec. 29, buying an additional 1.62 million shares for about HK\$10.175 million, according to an exchange disclosure. THE STANDARD — Internationalising the yuan is like raising a child, and Hong Kong is its nursery, the chief executive of the city’s stock exchange Charles Li said on his blog. The local bourse will be proactive in promoting the development of yuan interest rate, exchange rate and derivative products, he added. — Dalian-based developer Kai Shi China Holdings, which aims to raise up to HK\$180 million by selling 150 million shares on the local bourse, said it plans to use 80 percent of the proceeds from its initial public offering to replenish the firm’s land bank this year. ORIENTAL DAILY — Cheung Kong (Holdings) Ltd Chairman Li Ka-shing bought an additional 330,000 shares of the company at an average of HK\$92.423 each for about HK30.5 million on Dec. 29, the company said in a disclosure to the stock exchange. For Chinese newspapers, see............... For Taiwan newspapers, see................  (\$1 = 7.7681 Hong Kong dollars)	 	  **(Reporting by Twinnie Siu; Editing by Jacqueline Wong)Our Standards:The Thomson Reuters Trust Principles.**
<br> 

http://www.reuters.com/article/2012/01/04/us-crime-couple-idUSTRE80300Q20120104
- **LAS VEGAS (Reuters) -** A couple wanted for a crime spree through Utah and Nevada that saw an elderly pair slain in their home and a woman shot in the head was arrested on Tuesday wandering in the desert, authorities said. Logan McFarland is seen in police booking photos provided by the Sanpete County Sheriffs department in Manti, Utah, January 3, 2012. A couple wanted for a crime spree through Utah and Nevada that saw an elderly pair slain in their home and a woman shot in the head was arrested on Tuesday wandering in the desert, authorities said. McFarland, 24, and Angela Marie Hill, 25, were spotted by the pilot of a sheriff's plane in a remote area of Elko County, in northern Nevada, Supervisory Deputy U.S. Marshal Jim Phelps told Reuters. REUTERS/U.S. Marshals Service/HandoutSuspects Logan McFarland, 24, and Angela Marie Hill, 25, were spotted by the pilot of a sheriff’s plane in a remote area of Elko County in northern Nevada, Supervisory Deputy U.S. Marshal Jim Phelps told Reuters. The duo were walking in the desert after they evaded highway patrol officers on Saturday in a car chase and later crashed their stolen Volkswagen Jetta, he said. Once the couple was spotted from the air on Tuesday, officers on the ground moved in to make the arrest. Angela Marie Hill is seen in a police booking photo provided by the Sanpete County Sheriffs department in Manti, Utah, January 3, 2012. A couple wanted for a crime spree through Utah and Nevada that saw an elderly pair slain in their home and a woman shot in the head was arrested on Tuesday wandering in the desert, authorities said. Hill, 25, and Logan McFarland, 24, were spotted by the pilot of a sheriff's plane in a remote area of Elko County, in northern Nevada, Supervisory Deputy U.S. Marshal Jim Phelps told Reuters. REUTERS/U.S. Marshals Service/Handout“They gave up without incident because they were dehydrated and very tired,” Phelps said. “We positively identified them by their scars and tattoos, so we’re 100 percent confident that we have the suspects.” The U.S. Marshals Service said the couple began a crime spree on Thursday when they killed an elderly couple at their home in Mount Pleasant, Utah, 85 miles south of Salt Lake City. “We believe the motivation there was robbery,” Phelps said, adding that the couple stole a car and later drove to northern Nevada, where they tried to carjack a woman in West Wendover, Phelps said. The woman was shot in the head but managed to drive away, Phelps said. That victim has been treated at a hospital and was expected to survive, he said. The couple later managed to steal a Volkswagen Jetta in Wells, Nevada, on Saturday, where they also got into the car chase with highway patrol officers that led to their flight into the desert on foot, Phelps said. McFarland and Hill were being held on suspicion of kidnapping and attempted homicide in connection with the attempted carjacking in Nevada, and were expected to face additional charges as well, he said. **Reporting by Alex Dobuzinskis: Editing by Cynthia JohnstonOur Standards:The Thomson Reuters Trust Principles.**
<br>

http://www.reuters.com/article/2012/01/03/samsung-plant-idUSL3E8C37X820120103
- (Adds details) \* New plant will have monthly production capacity of 100,000 wafers \* Samsung has yet to decide investment size, venue \* Flash market seen at \$29 bln in 2012, China to take 50 pct SEOUL, Jan 4 (Reuters) - South Korea said on Wednesday it had approved a plan by Samsung Electronics Co to build a flash memory chip plant in China seen costing some \$4 billion, as a boom in smartphones and tablet computers fuels the chip industry’s growth. The plant would be Samsung’s second overseas chip manufacturing site and reflects the growing importance of the Chinese market. China’s consumption of NAND-type flash memory chips is set to account for half the global flash market, estimated at \$29 billion this year, and its portion will rise to 55 percent by 2015, according to research firm Gartner. The stellar growth is being driven by Chinese manufacturers such as Huawei Technologies and ZTE Corp  , which have been steadily raising their global smartphone and tablet market share. The new production line by Samsung will use cutting-edge 20-nanometer-class or below processing technology and mass production is planned to start from late 2013. Samsung has yet to decide on a site for the plant, which will have a capacity of 100,000 wafers per month. South Korea requires local firms to apply to build foreign production bases for fear of leakages of the country’s prized high technology. The Ministry of Knowledge and Economy said in a statement that Samsung would set up a committee to prevent potential technology leaks. Samsung is the world’s biggest NAND flash memory maker with around 40 percent of the market. It competes with Japan’s Toshiba Corp, Hynix Semiconductor Inc of Korea and Micron Technology Inc of the United States.	 	  (Reporting by Miyoung Kim; Editing by Jonathan Hopfner)Our Standards:The Thomson Reuters Trust Principles.  
<br>

- Feb 16 (Reuters) Moody’s Investors Service has today affirmed the Ba1 corporate family rating (CFR) of Portugal Telecom SGPS, SA. Reporting by Wayne ColeOur Standards:The Thomson Reuters Trust Principles.

...

There are **some patterns** in contents

**First of all**, for the first sentence or after, the left side of first **'-'** should be eliminated. But I think that the left side contains somewhat important location information like 'SEOUL' before **`(Reuter)`**. So after eliminating it from contents, i saved location info in a new column.

> How can we catch the location?

After finding the first **`(Reuter)`** and then we will find first to fourth (because of long location name) **uppper case** words and then save it.


**Secondly**, for the last sentence, there is a reporter name and some useless information for explaining the articles. Hence, in a safeway,
1. eliminate last sentence
2. Since there are more useless sentences in the last, i will discriminate them by using

**In third**, if there are no **`(Reuter)`** in the article, i assume that this article is not important(i.e. contents' structure is ugly, or it is not a standard news type, or some brief info, etc) for our modeling

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

def clean_content(text):
    """ Cleaning Reuters' contents of articles """
    p = re.compile('(\(Reuters\))')
    m = p.search(text)
    if str(type(m)) == "<class 'NoneType'>":
        text = 'nan'
    else:
        dash_text = text[m.span()[1]:m.span()[1]+4]
        p2 = re.compile(' - ')
        if str(type(p2.search(dash_text))) == "<class 'NoneType'>":
            text = text[m.span()[1]:].strip()
        else:
            m = p2.search(text)
            right_side = text[m.span()[1]:].strip()
            # remove last sentence
            tmp = sent_tokenize(right_side)        
            text = ' '.join(tmp[:len(tmp)-1])
    
    if text == []:
        text = 'nan'
    return text.strip()

def clean_firstline(text):
    """ Cleaning firstline of articles """
    p = re.compile('(\(Reuters\))')
    try:
        p.search(text).span()
    except:
        text = 'nan'
    else:
        m = p.search(text)
        dash_text = text[m.span()[1]:m.span()[1]+4]
        p2 = re.compile(' - ')
        if str(type(p2.search(dash_text))) == "<class 'NoneType'>":
            text = text[m.span()[1]:].strip()
        else:
            m = p2.search(text)
            right_side = text[m.span()[1]:].strip()
            text = right_side
    if text == []:
        text = 'nan'
    return text.strip()

def extract_location(text):
    """ Cleaning Reuters' contents of articles """
    p = re.compile('(\(Reuters\))')
    m = p.search(text)
    if str(type(m)) == "<class 'NoneType'>":
        location = 'nan'
    else:
        try:
            left_side = text[m.span()[0]-20:m.span()[0]]
            tmp = sent_tokenize(left_side)
            left_side = tmp[len(tmp)-1]
            p = re.compile('[A-Z]{2,}')
            if p.findall(left_side) != []:
            # extract location information
                location = ' '.join(p.findall(left_side))
            else:
                location = 'nan'
        except:
            location = 'nan'
        
    return location

In [7]:
df['clean_firstline'] = df['firstline'].apply(lambda x: clean_firstline(x))
df['clean_content'] = df['content'].apply(lambda x: clean_content(x))
df['location'] = df['content'].apply(lambda x: extract_location(x))

df[['clean_firstline', 'clean_content', 'location']].head()

Unnamed: 0,clean_firstline,clean_content,location
0,A football fan fell from bleachers at an Oakla...,A football fan fell from bleachers at an Oakla...,
1,St. Louis Blues defenseman Ian Cole has been s...,St. Louis Blues defenseman Ian Cole has been s...,
2,Sergei Kostitsyn completed a hat-trick late in...,Sergei Kostitsyn completed a hat-trick late in...,
3,The slumping Denver Broncos won the AFC West d...,The slumping Denver Broncos won the AFC West d...,
4,Pope Benedict XVI will visit Cuba on March 26-...,Pope Benedict XVI will visit Cuba on March 26-...,


In [8]:
idx = np.random.choice(len(df), 10)
df.iloc[idx]

Unnamed: 0,connected_url,content,firstline,keywords,timestamp,title,clean_title,clean_firstline,clean_content,location
189253,http://www.reuters.com/article/2012/08/07/us-p...,"(Reuters) - Instead of just fries, Veronica Ro...","(Reuters) - Instead of just fries, Veronica Ro...",China;Puerto Rico;Taiwan;United Kingdom;United...,20120807,Puerto Rico turns to lottery to light up gray ...,Puerto Rico turns to lottery to light up gray ...,"Instead of just fries, Veronica Rodriguez got ...","Instead of just fries, Veronica Rodriguez got ...",
101501,http://www.reuters.com/article/2012/05/04/barr...,"LONDON, May 4 (Reuters) - Barry Callebaut, the...","LONDON, May 4 (Reuters) - Barry Callebaut, the...",Barry Callebaut;BARRYCALLEBAUT/CHOC(URGENT);Co...,20120504,Barry Callebaut sees double-digit sales growth...,Barry Callebaut sees double-digit sales growth...,"Barry Callebaut, the world’s largest chocolate...","Barry Callebaut, the world’s largest chocolate...",
251556,http://www.reuters.com/article/2012/10/16/live...,"* Feeders aided by live cattle, cash feeder ma...","* Feeders aided by live cattle, cash feeder ma...",United States;Jason Roose;LIVESTOCK;MARKETS/CM...,20121016,LIVESTOCK-US feeder cattle futures surge again...,"US feeder cattle futures surge again, supplies...",,U.S. feeder cattle futures rose for a third st...,CHICAGO
268843,http://www.reuters.com/article/2012/11/01/bt-i...,"LONDON, Nov 1 (Reuters) - BT cut its revenue o...","LONDON, Nov 1 (Reuters) - BT cut its revenue o...",United Kingdom;BT/(URGENT);Dividends;Results F...,20121101,BT cuts full-year revenue outlook,year revenue outlook,BT cut its revenue outlook on Thursday after E...,BT cut its revenue outlook on Thursday after E...,
160555,http://www.reuters.com/article/2012/07/11/idUS...,"Brad Miller, arguably the most sophisticated a...","Brad Miller, arguably the most sophisticated a...",,20120711,Why the eminent-domain plan doesn’t hurt secon...,Why the eminent-domain plan doesn’t hurt secon...,,,
80166,http://www.reuters.com/article/2012/04/12/us-n...,(Reuters) - The Calgary Flames and head coach ...,(Reuters) - The Calgary Flames and head coach ...,Brent Sutter;Jay Feaster;US;NHL;FLAMES;SUTTER;...,20120412,Flames and coach Brent Sutter agree to part ways,Flames and coach Brent Sutter agree to part ways,The Calgary Flames and head coach Brent Sutter...,The Calgary Flames and head coach Brent Sutter...,
52771,http://www.reuters.com/article/2012/03/09/newy...,March 9 (Reuters) - Former New York Times Co ...,March 9 (Reuters) - Former New York Times Co ...,United States;Arthur Sulzberger Jr.;Janet Robi...,20120309,New York Times paid former CEO $24 mln,New York Times paid former CEO $24 mln,Former New York Times Co Chief Executive Jane...,Former New York Times Co Chief Executive Jane...,
6132,http://www.reuters.com/article/2012/01/11/us-m...,LOS ANGELES (TheWrap.com) - New rules in the O...,LOS ANGELES (TheWrap.com) - New rules in the O...,Mark Blinch;US;MICHAELMOORE;OSCARS;Celebrities...,20120111,Michael Moore: new Oscar docs process is more ...,new Oscar docs process is more transparent,,,
128911,http://www.reuters.com/article/2012/06/05/usa-...,"By Philip Baillie ST. ANDREWS, Scotland, June ...",By Philip Baillie,United States;Ben Bernanke;Richard Fisher;Sand...,20120605,UPDATE 2-Fed's Fisher questions need for more ...,Fed's Fisher questions need for more policy ac...,,Policymakers at the U.S. Federal Reserve “must...,
312126,http://www.reuters.com/article/2012/12/20/mark...,"(Updates to close) * HSI +0.2 pct, H-shares -0...",(Updates to close),China;Hong Kong;United States;Jackson Wong;Chi...,20121220,"Hong Kong shares end near 17-mth highs, China ...","Hong Kong shares end near 17-mth highs, China ...",,Hong Kong shares ended at their highest level ...,HONG KONG


## Step3: Delete 'nan' in ['content'] column and save cleaned dataframe

In [9]:
def delete_nan(df):
    delete = []
    for i in df.index:
        if str(df['clean_content'].iloc[i]) == 'nan':
            delete.append(i)
        else:
            pass
    res = df.drop(delete)
    return res.reset_index(drop=True)

In [10]:
df = delete_nan(df)

In [11]:
df.head()

Unnamed: 0,connected_url,content,firstline,keywords,timestamp,title,clean_title,clean_firstline,clean_content,location
0,http://www.reuters.com/article/2012/01/02/us-f...,"OAKLAND, Calif (Reuters) - A football fan fell...","OAKLAND, Calif (Reuters) - A football fan fell...",Juan Salceto;Stephen Glatstein;US;FOOTBALL;ACC...,20120101,Oakland Raiders fan falls from bleachers after...,Oakland Raiders fan falls from bleachers after...,A football fan fell from bleachers at an Oakla...,A football fan fell from bleachers at an Oakla...,
1,http://www.reuters.com/article/2012/01/02/us-n...,NEW YORK (Reuters) - St. Louis Blues defensema...,NEW YORK (Reuters) - St. Louis Blues defensema...,Ian Cole;Justin Abdelkader;US;NHL;BLUES;COLE;N...,20120101,Blues' Cole suspended for three games,Blues' Cole suspended for three games,St. Louis Blues defenseman Ian Cole has been s...,St. Louis Blues defenseman Ian Cole has been s...,
2,http://www.reuters.com/article/2012/01/02/us-n...,(Reuters) - Sergei Kostitsyn completed a hat-t...,(Reuters) - Sergei Kostitsyn completed a hat-t...,Barry Trotz;Brendan Morrison;Jarome Iginla;Kev...,20120101,Kostitsyn's hat-trick helps Predators douse Fl...,Kostitsyn's hat-trick helps Predators douse Fl...,Sergei Kostitsyn completed a hat-trick late in...,Sergei Kostitsyn completed a hat-trick late in...,
3,http://www.reuters.com/article/2012/01/02/us-n...,(Reuters) - The slumping Denver Broncos won th...,(Reuters) - The slumping Denver Broncos won th...,Mark Leffingwell;Tim Tebow;US;NFL;BRONCOS;Spor...,20120101,"Broncos clinch AFC West, snap playoff drought","Broncos clinch AFC West, snap playoff drought",The slumping Denver Broncos won the AFC West d...,The slumping Denver Broncos won the AFC West d...,
4,http://www.reuters.com/article/2012/01/02/us-c...,HAVANA (Reuters) - Pope Benedict XVI will visi...,HAVANA (Reuters) - Pope Benedict XVI will visi...,Cuba;Mexico;Benedict XVI;Fidel Castro;Raul Cas...,20120101,Pope Benedict XVI to visit Cuba March 26-28,Pope Benedict XVI to visit Cuba March 26-28,Pope Benedict XVI will visit Cuba on March 26-...,Pope Benedict XVI will visit Cuba on March 26-...,


### Evaluation

In [22]:
idx = np.random.choice(len(df), 10)
df[['title', 'clean_title']].iloc[idx]

Unnamed: 0,title,clean_title
238648,WRAPUP 11-White House race goes down to the wire,White House race goes down to the wire
259137,Delta in talks for Virgin Atlantic stake: sources,Delta in talks for Virgin Atlantic stake
144415,Hong Kong reviews how rates are set amid Libor...,Hong Kong reviews how rates are set amid Libor...
87647,Canada pipeline delays won't stop China invest...,Canada pipeline delays won't stop China invest...
232267,Timchenko challenges Gazprom's gas export mono...,Timchenko challenges Gazprom's gas export mono...
87962,S. Sudan accuses Sudan of bombing in blow to t...,S. Sudan accuses Sudan of bombing in blow to t...
166220,Russian generals attack Medvedev over Georgia war,Russian generals attack Medvedev over Georgia war
67903,Russian vodka firm Synergy 2011 net profit up ...,Russian vodka firm Synergy 2011 net profit up ...
193603,Appeals court orders American imprisoned in Ni...,Appeals court orders American imprisoned in Ni...
36593,UPDATE 1-China must embrace market economy-Wor...,China must embrace market economy-World Bank


In [24]:
idx = np.random.choice(len(df), 10)
df[['content', 'clean_content']].iloc[idx]

Unnamed: 0,content,clean_content
110130,BEIRUT (Reuters) - Syrian rebels fighting to o...,Syrian rebels fighting to oust President Basha...
38189,"ST LOUIS, Missouri (Reuters) - Suspected torna...",Suspected tornadoes killed at least six people...
198619,* New season to open Oct. 1 under fixed-pricin...,Ivory Coast has paid more than 47 billion CFA ...
164816,(Reuters) - The semiautomatic handgun used in ...,The semiautomatic handgun used in the deadly a...
6059,"(Add details, background) BEIJING, Jan 12 (Reu...",China’s central bank reiterated its pledge to ...
162515,BERLIN (Reuters) - Politicians in Germany’s ru...,Politicians in Germany’s ruling centre-right c...
98978,NEW YORK (Reuters) - JPMorgan Chase & Co (JPM....,"JPMorgan Chase & Co (JPM.N), under scrutiny fo..."
114557,"SANTIAGO, June 8 (Reuters) - Chile’s governmen...",Chile’s government will send a bill to congres...
112685,"STONE TOWN, Zanzibar, June 6 (Reuters) - Royal...",Royal Dutch Shell is stepping up efforts to h...
193114,FRANKFURT (Reuters) - Deutsche Bank AG (DBKGn....,Deutsche Bank AG (DBKGn.DE) is targeting waste...


# All file : Step 1 ~ Step 3

In [25]:
usecols = ['connected_url', 'content', 'firstline', 'keywords', 'timestamp', 'title']
dat_path = '/home/junhyuki/DLproject/DAT'

for year in np.arange(2012, 2019):
    print('merging ' + str(year) + ' news articles')
    os.chdir(dat_path + '/daily_news/{}'.format(year))
    file_list = sorted(os.listdir())
    # merging as a one dataframe
    df = pd.read_csv(file_list[0], usecols = usecols)
    for file in file_list[1:]:
        tmp = pd.read_csv(file)
        df = df.append(tmp, ignore_index=True)
    print(str(year) + ' : ' + str(df.shape))
    
    print('Cleaning title ...')
    df['clean_title'] = df['title'].apply(lambda x: dash_preprocess(x))
    df['clean_title'] = df['clean_title'].apply(lambda x: colon_preprocess(x))

    print('Cleaning content ...')
    df['clean_content'] = df['content'].apply(lambda x: clean_content(x))

    print('Cleaning firstline ...')
    df['clean_firstline'] = df['firstline'].apply(lambda x: clean_firstline(x))

    print('Extracting location info ...')
    df['location'] = df['content'].apply(lambda x: extract_location(x))

    print('Delete unnecessary data ...')
    df = delete_nan(df)
    print('final output shape: ' + str(df.shape))
    
    df.to_csv(dat_path + '/daily_news/cleaned_news_'+ str(year) + '.csv', index=False, encoding='utf-8-sig')

merging 2012 news articles
2012 : (316301, 6)
Cleaning title ...
Cleaning content ...
Cleaning firstline ...
Extracting location info ...
Delete unnecessary data ...
final output shape: (277843, 10)
merging 2013 news articles
2013 : (264903, 6)
Cleaning title ...
Cleaning content ...
Cleaning firstline ...
Extracting location info ...
Delete unnecessary data ...
final output shape: (240771, 10)
merging 2014 news articles
2014 : (228702, 6)
Cleaning title ...
Cleaning content ...
Cleaning firstline ...
Extracting location info ...
Delete unnecessary data ...
final output shape: (197732, 10)
merging 2015 news articles
2015 : (876877, 6)
Cleaning title ...
Cleaning content ...
Cleaning firstline ...
Extracting location info ...
Delete unnecessary data ...
final output shape: (794284, 10)
merging 2016 news articles
2016 : (405292, 6)
Cleaning title ...
Cleaning content ...
Cleaning firstline ...
Extracting location info ...
Delete unnecessary data ...
final output shape: (360290, 10)
mergi