# Reddit: Merge titles

- <a href="https://www.reddit.com/r/inthenews">/r/inthenews</a>
- <a href="https://www.reddit.com/r/news">/r/news</a>
- <a href="https://www.reddit.com/r/savedyouaclick">/r/savedyouaclick</a>

In [1]:
import numpy as np
import os
import pandas as pd

from pprint import pprint

## Set input folder

In [2]:
# Input folder; the cells below build their CSV paths from this string.
data_in = '../data/00_raw/pushshift/'

# Show what the folder contains, in alphabetical order.
folder_entries = os.listdir(data_in)
folder_entries.sort()
pprint(folder_entries)

['_reddit_clck.csv',
 '_reddit_news.csv',
 'inthenews',
 'inthenews_all.csv',
 'inthenews_aoi.csv',
 'news',
 'news_gt1_all.csv',
 'news_gt1_aoi.csv',
 'savedyouaclick',
 'savedyouaclick_all.csv',
 'savedyouaclick_aoi.csv']


## Choose attributes of interest

In [3]:
# Attributes of interest: the columns selected when the frames are written out.
aoi = ['created_utc', 'num_comments', 'score', 'title']

## Clickbaits

### /r/savedyouaclick

In [4]:
# Load the full /r/savedyouaclick dump, using the stored 'Unnamed: 0' column
# as the DataFrame index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns do not end up with mixed types
# (the situation pandas reports with a DtypeWarning on wide CSVs like this one).
df = pd.read_csv(f'{data_in}/savedyouaclick_all.csv',
                 index_col='Unnamed: 0',
                 low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,author_id,banned_by,...,subreddit_id,subreddit_subscribers,subreddit_type,thumbnail,thumbnail_height,thumbnail_width,title,url,whitelist_status,wls
0,JerseysFinest,,,,,,,,,,...,t5_323r3,,,http://a.thumbs.redditmedia.com/B_aQNVtrfjqzAO...,,,The best American soccer player you’ve never h...,http://ftw.usatoday.com/2014/07/perry-kitchen-...,,
1,kuhnie,,,,,,,,,,...,t5_323r3,,,http://b.thumbs.redditmedia.com/RJjRd-4d0YwRbT...,,,"How ""The Vampire Diaries"" Co-Creator Julie Ple...",http://www.cosmopolitan.com/lifestyle/a47035/o...,,
2,kuhnie,,,,,,,,,,...,t5_323r3,,,http://b.thumbs.redditmedia.com/sVbvZlADXGSXbN...,,,Who Ranks #1 in Business Schools? | Harvard,http://www.bloomberg.com/news/videos/2015-10-2...,,
3,biscram,,,,,,,,,,...,t5_323r3,,,default,,,Honeymooners forced to splurge $1600 after Syd...,http://www.theage.com.au/nsw/honeymooners-forc...,,
4,[deleted],,,,,,,,,,...,t5_323r3,,,default,,,"Here's the ONE Detail Everyone Missed in ""Home...",http://archive.is/8RlYi/image,,


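#### Split the titles from the answers

Titles in /r/savedyouaclick follow the pattern `clickbait headline | answer` (e.g. `Who Ranks #1 in Business Schools? | Harvard`); only the part before the `|` is kept as the title.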

In [5]:
# Keep only the text before the first '|' of each title and trim whitespace.
# Going through the .str accessor keeps the result on df's own index (a freshly
# built pd.Series would carry 0..n-1 labels and be re-aligned on assignment)
# and lets missing titles pass through as NaN instead of raising.
headline = df['title'].str.split('|', n=1).str[0].str.strip()

# Headlines of one character or fewer (including empty ones) become NaN.
df['title'] = headline.where(headline.str.len() > 1)

#### Output data to csv format

In [6]:
# Write only the attributes of interest of the clickbait frame into the input folder.
df.loc[:, aoi].to_csv(f'{data_in}/00_reddit_clck.csv')

## News

### Read data

#### /r/inthenews

In [7]:
# /r/inthenews posts; the stored 'Unnamed: 0' column becomes the index.
df1 = pd.read_csv(
    f'{data_in}/inthenews_aoi.csv',
    index_col='Unnamed: 0',
)
df1.head()

Unnamed: 0,created_utc,num_comments,score,title
0,1290061764,0,1,Alleged ‘Merchant of Death’ Pleads Not Guilty ...
1,1265051681,0,1,Bears should consider signing Julius Peppers
2,1264921014,0,2,U.S. Speeding Up Missile Defenses in Persian G...
3,1321002541,0,1,Mark Heisler: Joe Paterno Gets Due Process of ...
4,1310110706,0,1,Hacking Scandal Leads to British Tabloid’s Dem...


#### /r/news

In [8]:
# /r/news posts; the stored 'Unnamed: 0' column becomes the index.
df2 = pd.read_csv(
    f'{data_in}/news_gt1_aoi.csv',
    index_col='Unnamed: 0',
)
df2.head()

Unnamed: 0,created_utc,num_comments,score,title
0,1533212000.0,86.0,4.0,Pope Francis says the death penalty is always ...
1,1533211000.0,2.0,4.0,Pope Francis says death penalty is 'inadmissible'
2,1533211000.0,234.0,3.0,World's most prestigious maths medal is stolen...
3,1533211000.0,20.0,5.0,DRC: New Ebola virus outbreak days after previ...
4,1533210000.0,383.0,13.0,Radio host Art Bell died of accidental drug ov...


### Merge data

In [9]:
# Stack the two news frames row-wise. ignore_index=True renumbers the rows
# 0..n-1; sort=True sorts the columns if the two frames are not already aligned.
# NOTE: this rebinds `df`, which previously held the /r/savedyouaclick frame.
df = pd.concat(objs=[df1, df2], ignore_index=True, sort=True)
df.head()

Unnamed: 0,created_utc,num_comments,score,title
0,1290062000.0,0.0,1.0,Alleged ‘Merchant of Death’ Pleads Not Guilty ...
1,1265052000.0,0.0,1.0,Bears should consider signing Julius Peppers
2,1264921000.0,0.0,2.0,U.S. Speeding Up Missile Defenses in Persian G...
3,1321003000.0,0.0,1.0,Mark Heisler: Joe Paterno Gets Due Process of ...
4,1310111000.0,0.0,1.0,Hacking Scandal Leads to British Tabloid’s Dem...


### Output data to csv format

In [10]:
# Write only the attributes of interest of the merged news frame into the input folder.
df.loc[:, aoi].to_csv(f'{data_in}/00_reddit_news.csv')