# GitHub: Merge titles

- <a href="https://github.com/bhargaviparanjape/clickbait">[bhargaviparanjape] clickbait</a>
- <a href="https://github.com/LorenzoNorcini/Clickbait-Detector">[LorenzoNorcini] Clickbait-Detector</a>
- <a href="https://github.com/pfrcks/clickbait-detection">[pfrcks] clickbait-detection</a>
- <a href="https://github.com/saurabhmathur96/clickbait-detector">[saurabhmathur96] clickbait-detector</a>
- <a href="https://github.com/ventodiventu/clickbait-ml">[ventodiventu] clickbait-ml</a>

In [1]:
import os
import pandas as pd

## Functions

In [2]:
def merge_data(folder):
    merged_list = []
    
    for file in sorted(os.listdir(f'{folder}')):
        print(file)
    
        with open(f'{folder}/{file}', 'r') as f:
            for next_line in f.readlines():
                next_line = next_line.strip()
                if len(next_line) > 1:
                    merged_list.append(next_line)
                
    return merged_list

## Set input folder

In [3]:
data_in = '../data/00_raw/GitHub'

## Merge titles

In [4]:
clck_list = merge_data(f'{data_in}/_clickbait')

print(len(clck_list))

[LorenzoNorcini] clickbait.dat
[bhargaviparanjape] clickbait_data
[pfrcks] clickbait
[saurabhmathur96] clickbait-reddit.txt
[saurabhmathur96] clickbait-top-reddit.txt
[saurabhmathur96] clickbait.txt
[ventodiventu] buzzfeed_dataset.txt
84175


In [5]:
news_list = merge_data(f'{data_in}/_genuine')

print(len(news_list))

[LorenzoNorcini] news.dat
[bhargaviparanjape] non_clickbait_data
[pfrcks] not-clickbait
[saurabhmathur96] genuine-reddit.txt
[saurabhmathur96] genuine.txt
[saurabhmathur96] news-reddit.txt
34685


### [peterldowns] clickbait-classifier

In [6]:
files = ['buzzfeed', 'clickhole', 'dose', 'nytimes']

for file in files:
    df = pd.read_json(f'{data_in}/[peterldowns] clickbait-classifier/data/{file}.json')
    df['article_subject'] = df['article_title'].apply(lambda x: x.strip())

    news_new = list(df[df['clickbait'] == 0]['article_title'].str.strip())
    clck_new = list(df[df['clickbait'] == 1]['article_title'].str.strip())
    
    news_list += news_new
    clck_list += clck_new

    print(f'{file:15s} | Total: {df.shape[0]:>5d} | 0: {len(clck_new):>5d} | 1: {len(news_new):>5d}')

buzzfeed        | Total:   367 | 0:   367 | 1:     0
clickhole       | Total:   547 | 0:   547 | 1:     0
dose            | Total:  2182 | 0:  2182 | 1:     0
nytimes         | Total:  3104 | 0:     0 | 1:  3104


## Unique titles & Create DataFrames

In [7]:
clck_list = sorted(list(set(clck_list)))

clck_df = pd.DataFrame(clck_list, columns=['Title'])
display(clck_df)

Unnamed: 0,Title
0,"""1 Indian + 1 Indian = Unrelatable"": Televisio..."
1,"""10 Best Foods to Eat When Youre Sick"" by Cath..."
2,"""100 Best Jobs in America"""
3,"""22 Jump Street"" Directors Call Jonah Hill's H..."
4,"""22 Jump Street"" Is One Of The Most Self-Aware..."
5,"""25 Cities Where Your Paycheck Stretches The F..."
6,"""45 Unbelievable Behind-The-Scenes Stories Fro..."
7,"""69 Love Songs"" Ranked By How Much They Make Y..."
8,"""A Most Violent Year"" Pulls A Reverse ""Godfather"""
9,"""A Potato Flew Around My Room"" Is The World's ..."


In [8]:
news_list = sorted(list(set(news_list)))

news_df = pd.DataFrame(news_list, columns=['Title'])
display(news_df)

Unnamed: 0,Title
0,""".asia"" domain applications near 300,000 on op..."
1,"""7th Heaven"" television series comes to an end"
2,"""Affluenza"" teen Ethan Couch may be jailed for..."
3,"""Archaeology and racism"" by Bill Stonehill - A..."
4,"""Bigoted woman"": controversial Gordon Brown re..."
5,"""Black box"" found near crash site of Ethiopian..."
6,"""Camp Casey"" moves to safer land, as Mother's ..."
7,"""Children of Men"" wins Scripter Award for writing"
8,"""Civil defence"" thwarts Israeli air strike on ..."
9,"""Creationism and intelligent design have no pl..."


## Output data to csv

In [9]:
clck_df.to_csv(f'{data_in}/00_github_clck.csv', index=None, header=None)

In [10]:
news_df.to_csv(f'{data_in}/00_github_news.csv', index=None, header=None)