# Preprocessing: Character decoding

In [1]:
import numpy as np
import os
import pandas as pd
import string

from pprint import pprint

## Set input/output folders

In [2]:
data_in = '../data/02_merged_by_class_raw'

pprint(sorted(os.listdir(data_in)))

['clck_raw.csv', 'news_raw.csv']


In [3]:
data_out = '../data/03_preprocessed_00'

# Create the output folder up front: the `to_csv` calls at the end of the
# notebook do not create missing directories, so without this a fresh
# checkout would fail only after all preprocessing has already run.
os.makedirs(data_out, exist_ok=True)

## Functions

In [4]:
# %load '../snippets/count_chars.py'
def count_chars(series):
    """Count, for every character, the number of items that contain it.

    Each item is reduced to its set of characters first, so a character is
    counted at most once per item: the values are per-item frequencies, not
    total occurrence counts.

    Args:
        series: Iterable of strings (e.g. a pandas Series of titles).

    Returns:
        dict mapping each character to the number of items containing it.
        The number of distinct characters is printed as a side effect.
    """
    counts = {}

    for text in series:
        for symbol in set(text):
            counts[symbol] = counts.get(symbol, 0) + 1

    print(len(counts), 'characters found.\n')

    return counts

In [5]:
def get_titles_with_char(char, series):
    """Return the items of `series` that contain `char`.

    Args:
        char: Substring to look for (a single character or a longer string).
        series: Iterable of strings.

    Returns:
        A new pandas Series holding the matching items in their original
        order; it gets a fresh default index, not the labels of `series`.
    """
    matching = []
    for title in series:
        if char in title:
            matching.append(title)
    return pd.Series(matching)

## Read data frames

### Clickbaits

In [6]:
# Load the raw clickbait titles, using the 'Unnamed: 0' column as the index.
clck = pd.read_csv(f'{data_in}/clck_raw.csv', index_col='Unnamed: 0')

# Keep only the 'cc2017' and 'github' sources ('reddit' rows are left out).
clck = clck[clck['src'].isin(['cc2017', 'github'])]

# Renumber the remaining rows 0..n-1.
clck.index = np.arange(len(clck))
clck

Unnamed: 0,created_utc,num_comments,score,src,title
0,,,,github,"""1 Indian + 1 Indian = Unrelatable"": Televisio..."
1,,,,github,"""10 Best Foods to Eat When Youre Sick"" by Cath..."
2,,,,github,"""100 Best Jobs in America"""
3,,,,github,"""22 Jump Street"" Directors Call Jonah Hill's H..."
4,,,,github,"""22 Jump Street"" Is One Of The Most Self-Aware..."
5,,,,github,"""25 Cities Where Your Paycheck Stretches The F..."
6,,,,github,"""45 Unbelievable Behind-The-Scenes Stories Fro..."
7,,,,github,"""69 Love Songs"" Ranked By How Much They Make Y..."
8,,,,github,"""A Most Violent Year"" Pulls A Reverse ""Godfather"""
9,,,,github,"""A Potato Flew Around My Room"" Is The World's ..."


### News

In [7]:
# Load the raw news titles, using the 'Unnamed: 0' column as the index.
news = pd.read_csv(f'{data_in}/news_raw.csv', index_col='Unnamed: 0')

# Set the reddit rows aside before they are filtered out of `news`.
news_reddit = news[news['src'] == 'reddit']

# Keep only the 'cc2017' and 'github' sources in the main frame.
news = news[news['src'].isin(['cc2017', 'github'])]

# Renumber the remaining rows 0..n-1.
news.index = np.arange(len(news))
news

Unnamed: 0,created_utc,num_comments,score,src,title
0,,,,cc2017,"!Sdrawkcab: Missy Elliott, the Beatles and the..."
1,,,,github,""".asia"" domain applications near 300,000 on op..."
2,,,,github,"""7th Heaven"" television series comes to an end"
3,,,,github,"""Affluenza"" teen Ethan Couch may be jailed for..."
4,,,,cc2017,"""Apprentice"" contestant sues Trump for defamation"
5,,,,github,"""Archaeology and racism"" by Bill Stonehill - A..."
6,,,,cc2017,"""Big morale boost"": George H.W. Bush tweets im..."
7,,,,github,"""Bigoted woman"": controversial Gordon Brown re..."
8,,,,github,"""Black box"" found near crash site of Ethiopian..."
9,,,,cc2017,"""Bring it on"": Students sue Trump administrati..."


In [8]:
diff = clck.shape[0] - news.shape[0]
print(diff)

14989


In [9]:
# Take the `diff` highest-scored reddit news titles.
news_reddit_cut = (
    news_reddit
    .sort_values(by='score', ascending=False)
    .head(diff)
)
news_reddit_cut

Unnamed: 0,created_utc,num_comments,score,src,title
396309,1.513275e+09,19534.0,143796.0,reddit,Net Neutrality Overturned
73871,1.513807e+09,14110.0,143286.0,reddit,"Apple admits it slows older iPhones, confirmin..."
133606,1.500574e+09,16430.0,122637.0,reddit,Chester Bennington of Linkin Park commits suicide
308465,1.509449e+09,5243.0,121868.0,reddit,Japanese firm gives non-smokers extra six days...
6450,1.521508e+09,7414.0,118992.0,reddit,#DeleteFacebook Movement Gains Steam After 50 ...
604363,1.485833e+09,17241.0,117474.0,reddit,U.S. President Donald Trump fired the federal ...
148112,1.511804e+09,3690.0,113353.0,reddit,Comcast quietly drops promise not to charge to...
307159,1.494367e+09,24075.0,109426.0,reddit,James Comey terminated as Director of FBI
308551,1.515280e+09,10400.0,107310.0,reddit,Japanese police want to question Logan Paul
394898,1.515183e+09,8596.0,106308.0,reddit,"Nearly 200,000 people have signed a petition t..."


In [10]:
# Append the selected reddit titles to the news frame and order by title.
news = (
    pd.concat([news, news_reddit_cut], sort=True, ignore_index=True)
    .sort_values(by='title')
)

# Renumber the rows 0..n-1 in the new (title-sorted) order.
news.index = np.arange(len(news))

news

Unnamed: 0,created_utc,num_comments,score,src,title
0,1.319750e+09,120.0,1311.0,reddit,\n'I Wish I Had Gone Out With Him Last Night':...
1,1.346819e+09,387.0,1507.0,reddit,\nAnonymous group has allegedly hacked Romney ...
2,1.298748e+09,375.0,1217.0,reddit,POLICE:State Capitol of Wisconsin: ‘We have b...
3,1.352131e+09,81.0,1592.0,reddit,The Electronic Frontier Foundation (EFF) file...
4,,,,cc2017,"!Sdrawkcab: Missy Elliott, the Beatles and the..."
5,1.420860e+09,280.0,1964.0,reddit,""" No more bullshit"" Policy declared by new Dis..."
6,1.450793e+09,7281.0,5665.0,reddit,"""... in our quest to be tolerant of everything..."
7,,,,github,""".asia"" domain applications near 300,000 on op..."
8,1.474300e+09,840.0,2083.0,reddit,"""60% of NBA players file for bankruptcy in the..."
9,,,,github,"""7th Heaven"" television series comes to an end"


In [11]:
clck.shape[0] == news.shape[0]  # same size (balanced!)

True

## Get list of all characters & create dictionary

### Clickbait

In [12]:
clck_charset = count_chars(clck['title'])
pprint(clck_charset)

158 characters found.

{' ': 58388,
 '!': 710,
 '"': 7158,
 '#': 656,
 '$': 358,
 '%': 135,
 '&': 327,
 "'": 13396,
 '(': 366,
 ')': 367,
 '*': 148,
 '+': 34,
 ',': 3988,
 '-': 4433,
 '.': 2676,
 '/': 133,
 '0': 4853,
 '1': 13705,
 '2': 9977,
 '3': 3878,
 '4': 2494,
 '5': 3732,
 '6': 2436,
 '7': 3525,
 '8': 1972,
 '9': 3379,
 ':': 2557,
 ';': 108,
 '=': 7,
 '?': 6047,
 '@': 33,
 'A': 33649,
 'B': 21268,
 'C': 22502,
 'D': 17495,
 'E': 11148,
 'F': 20247,
 'G': 13536,
 'H': 21286,
 'I': 22461,
 'J': 4920,
 'K': 6611,
 'L': 13539,
 'M': 19637,
 'N': 11047,
 'O': 20466,
 'P': 20223,
 'Q': 1942,
 'R': 13568,
 'S': 28040,
 'T': 43001,
 'U': 6090,
 'V': 4486,
 'W': 30966,
 'X': 177,
 'Y': 21271,
 'Z': 969,
 '[': 55,
 '\\': 2,
 ']': 55,
 '^': 1,
 '_': 4,
 'a': 55045,
 'b': 16337,
 'c': 32248,
 'd': 40733,
 'e': 57728,
 'f': 22158,
 'g': 32263,
 'h': 49340,
 'i': 54304,
 'j': 1038,
 'k': 20362,
 'l': 46534,
 'm': 29966,
 'n': 53524,
 'o': 56410,
 'p': 23574,
 'q': 834,
 'r': 54389,
 's': 55208

In [13]:
# clck_char_dict = {
# #     ' '
# #     '!'
# #     '"'
# #     '#': hashtag; number;
    
#     '$': ' USD ',
#     '%': ' percent ',  # cursing; percent;
    
# #     '&': &amp;; &quot;; 
# #     "'"
# #     '('
# #     ')'
# #     '*': cursing; title decoration
# #     '+': and more...; more than...;
# #     ','
# #     '-'
# #     '.'
# #     '/': an answer to the clickbait; or;
# #     '0' .. '9'
# #     ':'
# #     ';'
# #     '='
# #     '?'
# #     '@': cursing; twitter tags
# #     'A' .. 'Z'
# #     '[': reddit tag; additional description
    
# #     '\\': solved as a composite character
    
# #     ']': reddit tag; additional description
# #     '^': cursing

# #     '_':  # delete!
    
# #     'a' .. 'z'
# #     '|': additional info (source / copyright)

#     '~': '',
#     '\x80': '',
#     '\x93': '',
#     '\x94': '',
#     '\x98': '',
#     '\x99': '',
#     '\x9c': '',
#     '\x9d': '',
#     '\xa0': ' ',
#     '£': ' GBP ',
#     '¦': ' ',
#     '©': 'Copyright',  # partly solved as a composite character 'Ã©'
    
# #     '¯': solved as a composite character
    
#     '°': ' degrees ',
#     '¿': '',  # questionmark in Spanish
    
# #     'Â': part of different character encoding
# #     'Ã': part of different character encoding
    
# #     'Ü'
# #     'á'
# #     'â'
# #     'ã'
# #     'ä'
# #     'å'
# #     'ç'
# #     'è'
# #     'é'
# #     'ê'
# #     'ë'
# #     'í'
# #     'ñ'
# #     'ó'
# #     'ö'
# #     'û'
# #     'ü'
# #     'ğ'
    
#     '̃': '',  # n'̃ = ñ
#     '\u200b': '',
#     '\u200f': '',
#     '–': '–',
#     '—': '-',
#     '‘': '\'',
#     '’': '\'',
#     '“': '"',
#     '”': '"',
#     '•': '-',
#     '…': '...',
#     '\u202a': '',
#     '\u202c': '',
#     '€': ' EUR ',
#     '™': '',
    
# #     'ツ': solved as a composite character
    
#     '中': '',
#     '义': '',
#     '乌': '',
#     '元': '',
#     '到': '',
#     '化': '',
#     '国': '',
#     '多': '',
#     '文': '',
#     '来': '',
#     '欢': '',
#     '田': '',
#     '的': '',
#     '试': '',
#     '迎': '',
#     '验': '',
#     '\ue801': '',
#     '：': '',
    
#     # composite
#     '&amp;': '&',
#     '&quot;': '"',
#     'âÂ': '-',  # different encoding
#     'Ã©': 'é',  # different encoding | apply before '©'!
#     'Ã ': 'à',  # different encoding
    
#     # 'â': '-'  # ('â' in 'â') & ('â' in 'â') & ('â' in 'â')
    
#     'â': '\'',
#     'â': '"',
#     'â': '"',
    
# #     '¯\_(ツ)_/¯': 'happy',
#     'Dieses Video ist nicht verfügbar.': ''
# }

In [14]:
# Replacement table for the clickbait titles: search string -> replacement.
#
# * '' --> ' ': characters that are discarded map to a single space instead
#   of the empty string.
# * Invisible or ambiguous characters (C1 controls, no-break space,
#   zero-width / directional marks, the combining tilde, private-use code
#   points) are written as escape sequences.  As literals they cannot be told
#   apart in an editor, which made the three different 'â..' keys near the
#   end look like duplicates of one another.
# * The replacement cell sorts these pairs by key length (longest first) with
#   a stable sort, so keys of equal length are applied in the order listed
#   here.
clck_char_dict = {
    # ---- ASCII ------------------------------------------------------------
    # Kept unchanged: ' '  !  "  '  (  )  ,  -  .  :  ;  =  ?
    #                 '0' .. '9'   'A' .. 'Z'   'a' .. 'z'
    #     '#': hashtag; number

    '$': ' USD ',
    '%': ' percent ',  # cursing; percent

    #     '&': &amp;; &quot; (see the composite keys below)
    #     '*': cursing; title decoration
    #     '+': and more...; more than...
    #     '/': an answer to the clickbait; or
    #     '@': cursing; twitter tags
    #     '[': reddit tag; additional description
    #     '\\': solved as a composite character
    #     ']': reddit tag; additional description
    #     '^': cursing
    #     '_': delete!
    #     '|': additional info (source / copyright)

    '~': ' ',

    # ---- C1 control characters (leftovers of mis-decoded UTF-8) -------------
    '\x80': ' ',
    '\x93': ' ',
    '\x94': ' ',
    '\x98': ' ',
    '\x99': ' ',
    '\x9c': ' ',
    '\x9d': ' ',

    # ---- Latin-1 supplement -------------------------------------------------
    '\xa0': ' ',  # no-break space
    '£': ' GBP ',
    '¦': ' ',
    '©': 'Copyright',  # partly solved as a composite character 'Ã©'

    #     '¯': solved as a composite character

    '°': ' degrees ',
    '¿': ' ',  # inverted question mark (Spanish)

    #     'Â': part of different character encoding
    #     'Ã': part of different character encoding
    # Kept unchanged: Ü á â ã ä å ç è é ê ë í ñ ó ö û ü ğ

    # ---- Combining / invisible characters and typographic punctuation ------
    '\u0303': ' ',  # combining tilde (n + U+0303 renders as ñ)
    '\u200b': ' ',  # zero-width space
    '\u200f': ' ',  # right-to-left mark
    '–': '-',
    '—': '-',
    '‘': "'",
    '’': "'",
    '“': '"',
    '”': '"',
    '•': '-',
    '…': '...',
    '\u202a': ' ',  # left-to-right embedding
    '\u202c': ' ',  # pop directional formatting
    '€': ' EUR ',
    '™': ' ',

    #     'ツ': solved as a composite character

    # ---- CJK and other symbols ---------------------------------------------
    '中': ' ',
    '义': ' ',
    '乌': ' ',
    '元': ' ',
    '到': ' ',
    '化': ' ',
    '国': ' ',
    '多': ' ',
    '文': ' ',
    '来': ' ',
    '欢': ' ',
    '田': ' ',
    '的': ' ',
    '试': ' ',
    '迎': ' ',
    '验': ' ',
    '\ue801': ' ',  # private-use code point
    '：': ' ',  # fullwidth colon

    # ---- Composite keys -----------------------------------------------------
    '&amp;': '&',
    '&quot;': '"',

    # Mojibake: UTF-8 bytes that were decoded as Latin-1.
    'â\x80\x94Â': '-',  # UTF-8 bytes of '—' read as Latin-1, followed by 'Â'
    'Ã©': 'é',  # different encoding | apply before '©'!
    'Ã\xa0': 'à',  # different encoding ('Ã' + no-break space)

    # A bare 'â' is deliberately not mapped (a draft entry 'â': '-' was
    # dropped): it is the first character of each of the three keys below.
    'â\x80\x99': "'",  # UTF-8 bytes of '’' read as Latin-1
    'â\x80\x9d': '"',  # UTF-8 bytes of '”' read as Latin-1
    'â\x80\x9c': '"',  # UTF-8 bytes of '“' read as Latin-1

    #     '¯\_(ツ)_/¯': 'happy',
    'Dieses Video ist nicht verfügbar.': ' '
}

In [15]:
for item in get_titles_with_char('', clck['title']):
#     print('-', item)
#     print('-', bytearray(item, 'latin-1').decode('utf-8'), '\n')
    continue

In [16]:
for item in get_titles_with_char('', clck['title']) \
                    .str.replace('', ''):
#     print('-', item)
    continue

### News

In [17]:
news_charset = count_chars(news['title'])
pprint(news_charset)

177 characters found.

{'\x03': 1,
 '\t': 1,
 '\n': 197,
 ' ': 58378,
 '!': 401,
 '"': 1531,
 '#': 93,
 '$': 1947,
 '%': 492,
 '&': 492,
 "'": 10749,
 '(': 399,
 ')': 394,
 '*': 54,
 '+': 63,
 ',': 14238,
 '-': 8322,
 '.': 8274,
 '/': 303,
 '0': 5929,
 '1': 5812,
 '2': 4764,
 '3': 2305,
 '4': 1931,
 '5': 2656,
 '6': 1743,
 '7': 2201,
 '8': 1635,
 '9': 1707,
 ':': 8450,
 ';': 1204,
 '<': 2,
 '=': 4,
 '>': 4,
 '?': 1121,
 '@': 23,
 'A': 19069,
 'B': 14063,
 'C': 19372,
 'D': 12065,
 'E': 8440,
 'F': 12585,
 'G': 8699,
 'H': 10009,
 'I': 13085,
 'J': 5109,
 'K': 5321,
 'L': 9427,
 'M': 14862,
 'N': 10782,
 'O': 9255,
 'P': 14923,
 'Q': 912,
 'R': 11257,
 'S': 23511,
 'T': 18480,
 'U': 8547,
 'V': 4089,
 'W': 11205,
 'X': 442,
 'Y': 3367,
 'Z': 689,
 '[': 156,
 ']': 156,
 '_': 1,
 '`': 2,
 'a': 56706,
 'b': 26178,
 'c': 43446,
 'd': 46501,
 'e': 57336,
 'f': 34532,
 'g': 37839,
 'h': 42897,
 'i': 56096,
 'j': 4451,
 'k': 23288,
 'l': 50147,
 'm': 39124,
 'n': 55945,
 'o': 55968,
 'p': 3541

In [18]:
# news_char_dict = {
#     '\x03': '',
#     '\t': '',
#     '\n': '',
    
# #     ' ':
# #     '!':
# #     '"':
# #     '#': hashtag; number;
    
#     '$': ' USD ',
#     '%': ' percent ',  # cursing; percent;
    
# #     '&': &amp;; &quot;; 
# #     "'"
# #     '('
# #     ')'
# #     '*': cursing; title decoration
# #     '+': and more...; more than...;
# #     ','
# #     '-'
# #     '.'
# #     '/': an answer to the clickbait; or;
# #     '0' .. '9'
# #     ':'
# #     ';'

#     '<': ' less than ',  # partly solved as a composite character '<3'

# #     '='
    
#     '>': '',

# #     '?'
# #     '@': cursing; twitter tags
# #     'A' .. 'Z'
# #     '[': reddit tag; additional description
# #     ']': reddit tag; additional description
# #     '_': subreddit link underscore
    
#     '`': '\'',
    
# #     'a' .. 'z'
    
# #     '{':  # delete!

# #     '|': additional info (source / copyright)

# #     '}':  # delete!
    
#     '~': '',
    
#     '\x80': '',
#     '\x94': '',
#     '\x97': '',
#     '\x98': '',
#     '\x99': '',
#     '\xa0': ' ',

# #     '¡': exclamation mark in Spanish

#     '£': ' GBP ',
    
# #     '¦': solved as a composite character
    
#     '©': 'Copyright',  # partly solved as a composite character 'Ã©'
#     '«': '',
#     '\xad': ' ',  # soft hyphen (U+00AD), not &nbsp;
#     '°': ' degrees ',

# #     '±': solved as a composite character
# #     '³': solved as a composite character
    
#     '·': '',

# #     '¸': solved as a composite character

#     'º': ' degrees ',
#     '½': 'half',
    
# #     'Á'
    
#     'Â': '',
    
# #     'Ã': solved as a composite character
    
# #     'Å'
# #     'É'
# #     'Í'
# #     'Î'
# #     'Ü'
# #     'ß'
# #     'à'
# #     'á'
    
# #     'â': solved as a composite character
    
# #     'ã'
# #     'ä'
# #     'å'

#     'æ': ' e ',  # 'encyclopædia'
    
# #     'ç'
# #     'è'
# #     'é'
# #     'ë'
# #     'í'
# #     'î'
# #     'ñ'
# #     'ó'
# #     'ô'
# #     'ö'
# #     '÷': title of Ed Sheeran's album
# #     'ø'
# #     'ú'
# #     'ü'
# #     'ā'
    
# #     'Ă':  # delete
    
# #     'ć'
# #     'č'
# #     'ę'
# #     'ğ'
# #     'ī'
# #     'ł'
# #     'ń'
# #     'Ś'
# #     'ś'
# #     'Š'
# #     'š'
# #     'ū'
# #     'ž'
# #     'ș'
    
#     'ι': 'i',  # genιtalia
#     '\u200b': '',  # zero-width space
#     '\u200e': '',
#     '–': '-',
#     '—': '-',
#     '‘': '\'',
#     '’': '\'',
#     '“': '"',
#     '”': '"',
#     '…': '...',
#     '\u202f': '',
#     '′': '\'',
#     '€': ' EUR ',
#     '℞': 'Rx',  # medical prescription
#     '♥': 'love',
#     '\ufeff': '',
    
#     # composite
#     '<3': '',
    
#     'Ã¡': 'á',
#     'â¦': '...',
#     'Ã±': 'ñ',
#     'Ã³': 'ó',
#     'Ã¸': 'ø',
    
# #     '¯\_(ツ)_/¯': 'happy',  # ?
#     'Dieses Video ist nicht verfügbar.': '',

#     '&amp;': '&',
#     '&quot;': '"',
    
#     'âÂ': '-',  # different encoding
#     'Ã©': 'é',  # different encoding | apply before '©'!
#     'Ã ': 'à',  # different encoding
    
#     # 'â': '-'  # ('â' in 'â') & ('â' in 'â') & ('â' in 'â')
    
#     'â': '\'',
#     'â': '"',
#     'â': '"'
# }

In [19]:
# Replacement table for the news titles: search string -> replacement.
#
# * '' --> ' ': characters that are discarded map to a single space instead
#   of the empty string.
# * Invisible or ambiguous characters (control characters, no-break space,
#   soft hyphen, zero-width / directional marks, BOM) are written as escape
#   sequences.  As literals they cannot be told apart in an editor, which
#   made the three different 'â..' keys at the end look like duplicates.
# * The replacement cell sorts these pairs by key length (longest first) with
#   a stable sort, so keys of equal length are applied in the order listed
#   here.
news_char_dict = {
    # ---- Control characters and ASCII --------------------------------------
    '\x03': ' ',
    '\t': ' ',
    '\n': ' ',

    # Kept unchanged: ' '  !  "  '  (  )  ,  -  .  :  ;  =  ?
    #                 '0' .. '9'   'A' .. 'Z'   'a' .. 'z'
    #     '#': hashtag; number

    '$': ' USD ',
    '%': ' percent ',  # cursing; percent

    #     '&': &amp;; &quot; (see the composite keys below)
    #     '*': cursing; title decoration
    #     '+': and more...; more than...
    #     '/': an answer to the clickbait; or

    '<': ' less than ',  # partly solved as a composite character '<3'
    '>': ' ',

    #     '@': cursing; twitter tags
    #     '[': reddit tag; additional description
    #     ']': reddit tag; additional description
    #     '_': subreddit link underscore

    '`': "'",

    #     '{': delete!
    #     '|': additional info (source / copyright)
    #     '}': delete!

    '~': ' ',

    # ---- C1 control characters (leftovers of mis-decoded UTF-8) -------------
    '\x80': ' ',
    '\x94': ' ',
    '\x97': ' ',
    '\x98': ' ',
    '\x99': ' ',

    # ---- Latin-1 supplement -------------------------------------------------
    '\xa0': ' ',  # no-break space

    #     '¡': exclamation mark in Spanish

    '£': ' GBP ',

    #     '¦': solved as a composite character

    '©': 'Copyright',  # partly solved as a composite character 'Ã©'
    '«': ' ',
    '\xad': ' ',  # soft hyphen (U+00AD); the no-break space is '\xa0' above
    '°': ' degrees ',

    #     '±': solved as a composite character
    #     '³': solved as a composite character

    '·': ' ',

    #     '¸': solved as a composite character

    'º': ' degrees ',  # masculine ordinal indicator, mapped like '°'
    '½': 'half',

    'Â': ' ',

    #     'Ã': solved as a composite character
    #     'â': solved as a composite character

    # NOTE(review): this turns 'encyclopædia' into 'encyclop e dia';
    # 'ae' may have been intended -- confirm before changing.
    'æ': ' e ',  # 'encyclopædia'

    #     '÷': title of Ed Sheeran's album
    #     'Ă': delete
    # Kept unchanged: Á Å É Í Î Ü ß à á ã ä å ç è é ë í î ñ ó ô ö ø ú ü
    #                 ā ć č ę ğ ī ł ń Ś ś Š š ū ž ș

    # ---- Look-alikes, invisible characters, typographic punctuation --------
    'ι': 'i',  # genιtalia
    '\u200b': ' ',  # zero-width space
    '\u200e': ' ',  # left-to-right mark
    '–': '-',
    '—': '-',  # ?
    '‘': "'",
    '’': "'",
    '“': '"',
    '”': '"',
    '…': '...',
    '\u202f': ' ',  # narrow no-break space
    '′': "'",
    '€': ' EUR ',
    '℞': 'Rx',  # medical prescription
    '♥': 'love',
    '\ufeff': ' ',  # byte order mark / zero-width no-break space

    # ---- Composite keys -----------------------------------------------------
    '<3': ' ',

    # Mojibake: UTF-8 bytes that were decoded as Latin-1.
    'Ã¡': 'á',
    'â\x80¦': '...',  # UTF-8 bytes of '…' read as Latin-1
    'Ã±': 'ñ',
    'Ã³': 'ó',
    'Ã¸': 'ø',

    #     '¯\_(ツ)_/¯': 'happy',  # ?
    'Dieses Video ist nicht verfügbar.': ' ',

    '&amp;': '&',
    '&quot;': '"',

    'â\x80\x94Â': '-',  # UTF-8 bytes of '—' read as Latin-1, followed by 'Â'
    'Ã©': 'é',  # different encoding | apply before '©'!
    'Ã\xa0': 'à',  # different encoding ('Ã' + no-break space)

    # A bare 'â' is deliberately not mapped (a draft entry 'â': '-' was
    # dropped): it is the first character of each of the three keys below.
    'â\x80\x99': "'",  # UTF-8 bytes of '’' read as Latin-1
    'â\x80\x9d': '"',  # UTF-8 bytes of '”' read as Latin-1
    'â\x80\x9c': '"'   # UTF-8 bytes of '“' read as Latin-1
}

In [20]:
for item in get_titles_with_char('', news['title']):
#     print('-', bytearray(item, 'latin-1').decode('utf-8'))
#     print('-', item)
#     print()
    continue

In [21]:
for item in get_titles_with_char('', news['title']) \
                    .str.replace('', '***'):
#     print('-', item)
    continue

## Change characters

### Clickbait

In [22]:
# Order the (search, replacement) pairs by search-string length, longest
# first, so composite keys are replaced before the single characters they
# contain.  The sort is stable: equal-length keys keep their dict order.
clck_char_list = sorted(
    clck_char_dict.items(),
    key=lambda pair: len(pair[0]),
    reverse=True,
)
pprint(clck_char_list)

[('Dieses Video ist nicht verfügbar.', ' '),
 ('&quot;', '"'),
 ('&amp;', '&'),
 ('â\x80\x94Â', '-'),
 ('â\x80\x99', "'"),
 ('â\x80\x9d', '"'),
 ('â\x80\x9c', '"'),
 ('Ã©', 'é'),
 ('Ã\xa0', 'à'),
 ('$', ' USD '),
 ('%', ' percent '),
 ('~', ' '),
 ('\x80', ' '),
 ('\x93', ' '),
 ('\x94', ' '),
 ('\x98', ' '),
 ('\x99', ' '),
 ('\x9c', ' '),
 ('\x9d', ' '),
 ('\xa0', ' '),
 ('£', ' GBP '),
 ('¦', ' '),
 ('©', 'Copyright'),
 ('°', ' degrees '),
 ('¿', ' '),
 ('̃', ' '),
 ('\u200b', ' '),
 ('\u200f', ' '),
 ('–', '-'),
 ('—', '-'),
 ('‘', "'"),
 ('’', "'"),
 ('“', '"'),
 ('”', '"'),
 ('•', '-'),
 ('…', '...'),
 ('\u202a', ' '),
 ('\u202c', ' '),
 ('€', ' EUR '),
 ('™', ' '),
 ('中', ' '),
 ('义', ' '),
 ('乌', ' '),
 ('元', ' '),
 ('到', ' '),
 ('化', ' '),
 ('国', ' '),
 ('多', ' '),
 ('文', ' '),
 ('来', ' '),
 ('欢', ' '),
 ('田', ' '),
 ('的', ' '),
 ('试', ' '),
 ('迎', ' '),
 ('验', ' '),
 ('\ue801', ' '),
 ('：', ' ')]


In [23]:
# Apply the replacements in list order (longest search strings first).
# `regex=False` treats every key as a literal substring.  Without it the
# result depends on the pandas version's default for `regex`, and keys such
# as '$' or 'Dieses Video ist nicht verfügbar.' contain regex metacharacters.
for old, new in clck_char_list:
    clck['title'] = clck['title'].str.replace(old, new, regex=False)

# Show which characters are left after the clean-up.
pprint(count_chars(clck['title']))

108 characters found.

{' ': 58389,
 '!': 710,
 '"': 7578,
 '#': 656,
 '&': 262,
 "'": 15260,
 '(': 366,
 ')': 367,
 '*': 148,
 '+': 34,
 ',': 3988,
 '-': 4617,
 '.': 2699,
 '/': 133,
 '0': 4853,
 '1': 13705,
 '2': 9977,
 '3': 3878,
 '4': 2494,
 '5': 3732,
 '6': 2436,
 '7': 3525,
 '8': 1972,
 '9': 3379,
 ':': 2557,
 ';': 29,
 '=': 7,
 '?': 6047,
 '@': 33,
 'A': 33649,
 'B': 21294,
 'C': 22502,
 'D': 17743,
 'E': 11149,
 'F': 20247,
 'G': 13571,
 'H': 21286,
 'I': 22461,
 'J': 4920,
 'K': 6611,
 'L': 13539,
 'M': 19637,
 'N': 11047,
 'O': 20466,
 'P': 20246,
 'Q': 1942,
 'R': 13570,
 'S': 28206,
 'T': 43001,
 'U': 6352,
 'V': 4484,
 'W': 30966,
 'X': 177,
 'Y': 21271,
 'Z': 969,
 '[': 55,
 '\\': 2,
 ']': 55,
 '^': 1,
 '_': 4,
 'a': 55043,
 'b': 16336,
 'c': 32310,
 'd': 40732,
 'e': 57732,
 'f': 22157,
 'g': 32262,
 'h': 49339,
 'i': 54303,
 'j': 1038,
 'k': 20362,
 'l': 46534,
 'm': 29959,
 'n': 53541,
 'o': 56407,
 'p': 23659,
 'q': 769,
 'r': 54396,
 's': 55207,
 't': 53665,
 'u': 46

In [24]:
get_titles_with_char('¯\_(ツ)_/¯', clck['title'])

0    32 ¯\_(ツ)_/¯ Headlines From British Local News...
1                      How ¯\_(ツ)_/¯ Are You Actually?
dtype: object

In [25]:
clck['title'] = clck['title'].str.strip()

In [26]:
clck['len'] = clck['title'].str.len()

#### Delete 0-length titles

In [27]:
# Remove every clickbait row whose title has length 0.
empty_title_labels = clck.index[clck['len'] == 0]
clck = clck.drop(empty_title_labels)

In [28]:
clck.sort_values(by='len')

Unnamed: 0,created_utc,num_comments,score,src,title,len
36940,,,,github,O_o,3
50726,,,,cc2017,Vine,4
35024,,,,github,Lips.,5
21883,,,,github,BFWKND,6
22687,,,,cc2017,Buzzfeed,8
41012,,,,github,Taco Pie,8
37010,,,,cc2017,Odd Lots,8
40897,,,,github,Sure Jan,8
34130,,,,github,Just Tags,9
25354,,,,cc2017,Decrypted,9


### News

In [29]:
# Order the (search, replacement) pairs by search-string length, longest
# first, so composite keys are replaced before the single characters they
# contain.  The sort is stable: equal-length keys keep their dict order.
news_char_list = sorted(
    news_char_dict.items(),
    key=lambda pair: len(pair[0]),
    reverse=True,
)
news_char_list

[('Dieses Video ist nicht verfügbar.', ' '),
 ('&quot;', '"'),
 ('&amp;', '&'),
 ('â\x80\x94Â', '-'),
 ('â\x80¦', '...'),
 ('â\x80\x99', "'"),
 ('â\x80\x9d', '"'),
 ('â\x80\x9c', '"'),
 ('<3', ' '),
 ('Ã¡', 'á'),
 ('Ã±', 'ñ'),
 ('Ã³', 'ó'),
 ('Ã¸', 'ø'),
 ('Ã©', 'é'),
 ('Ã\xa0', 'à'),
 ('\x03', ' '),
 ('\t', ' '),
 ('\n', ' '),
 ('$', ' USD '),
 ('%', ' percent '),
 ('<', ' less than '),
 ('>', ' '),
 ('`', "'"),
 ('~', ' '),
 ('\x80', ' '),
 ('\x94', ' '),
 ('\x97', ' '),
 ('\x98', ' '),
 ('\x99', ' '),
 ('\xa0', ' '),
 ('£', ' GBP '),
 ('©', 'Copyright'),
 ('«', ' '),
 ('\xad', ' '),
 ('°', ' degrees '),
 ('·', ' '),
 ('º', ' degrees '),
 ('½', 'half'),
 ('Â', ' '),
 ('æ', ' e '),
 ('ι', 'i'),
 ('\u200b', ' '),
 ('\u200e', ' '),
 ('–', '-'),
 ('—', '-'),
 ('‘', "'"),
 ('’', "'"),
 ('“', '"'),
 ('”', '"'),
 ('…', '...'),
 ('\u202f', ' '),
 ('′', "'"),
 ('€', ' EUR '),
 ('℞', 'Rx'),
 ('♥', 'love'),
 ('\ufeff', ' ')]

In [30]:
# Apply the replacements in list order (longest search strings first).
# `regex=False` treats every key as a literal substring.  Without it the
# result depends on the pandas version's default for `regex`, and keys such
# as '$', '<3' or 'Dieses Video ist nicht verfügbar.' contain regex
# metacharacters.
for old, new in news_char_list:
    news['title'] = news['title'].str.replace(old, new, regex=False)

# Show which characters are left after the clean-up.
pprint(count_chars(news['title']))

131 characters found.

{' ': 58378,
 '!': 401,
 '"': 1833,
 '#': 93,
 '&': 492,
 "'": 13764,
 '(': 399,
 ')': 394,
 '*': 54,
 '+': 63,
 ',': 14238,
 '-': 8696,
 '.': 8288,
 '/': 303,
 '0': 5929,
 '1': 5812,
 '2': 4764,
 '3': 2305,
 '4': 1931,
 '5': 2656,
 '6': 1743,
 '7': 2201,
 '8': 1635,
 '9': 1707,
 ':': 8450,
 ';': 1139,
 '=': 4,
 '?': 1121,
 '@': 23,
 'A': 19069,
 'B': 14132,
 'C': 19372,
 'D': 13581,
 'E': 8456,
 'F': 12585,
 'G': 8784,
 'H': 10009,
 'I': 13085,
 'J': 5109,
 'K': 5321,
 'L': 9427,
 'M': 14862,
 'N': 10782,
 'O': 9255,
 'P': 15012,
 'Q': 912,
 'R': 11272,
 'S': 24689,
 'T': 18480,
 'U': 10162,
 'V': 4085,
 'W': 11205,
 'X': 442,
 'Y': 3367,
 'Z': 689,
 '[': 156,
 ']': 156,
 '_': 1,
 'a': 56706,
 'b': 26174,
 'c': 43536,
 'd': 46497,
 'e': 57344,
 'f': 34530,
 'g': 37837,
 'h': 42896,
 'i': 56095,
 'j': 4451,
 'k': 23288,
 'l': 50148,
 'm': 39116,
 'n': 55978,
 'o': 55966,
 'p': 35551,
 'q': 2605,
 'r': 55882,
 's': 56042,
 't': 55487,
 'u': 44124,
 'v': 22652,
 'w

In [31]:
get_titles_with_char('_', news['title'])

0    What concerns me most about r/The_Donald is th...
dtype: object

In [32]:
news['title'] = news['title'].str.strip()

In [33]:
news['len'] = news['title'].str.len()

#### Delete 0-length titles

In [34]:
# Remove every news row whose title has length 0.
# The labels must come from `news` itself.  The previous version looked them
# up in `clck`, whose empty titles had already been dropped, so this cell
# never removed anything.
news = news.drop(news.index[news['len'] == 0])

In [35]:
news.sort_values(by='len')

Unnamed: 0,created_utc,num_comments,score,src,title,len
35626,,,,github,No,2
54982,,,,cc2017,Vine,4
10111,,,,cc2017,CNNgo,5
20659,,,,github,GOLF;,5
57686,,,,cc2017,Yahoo,5
21576,,,,cc2017,Google,6
14460,,,,cc2017,Debrief,7
47007,,,,github,TENNIS;,7
22301,,,,github,HOCKEY;,7
7135,,,,cc2017,BBC Two,7


## Delete unnecessary rows

### Clickbait

In [36]:
clck_del_char = {
    '\\',
    '_',
    '¯',
    'ツ'
}

In [37]:
# Collect the labels of all rows whose title contains at least one of the
# characters listed in `clck_del_char` (one pass over the titles).
contains_del_char = clck['title'].map(
    lambda title: any(char in title for char in clck_del_char)
)

indices = sorted(clck.index[contains_del_char.values])
pprint(indices)

[15931, 31917, 32920, 36940]


In [38]:
for i in indices:
    print('-', clck['title'][i])

- 32 ¯\_(ツ)_/¯ Headlines From British Local Newspapers
- How ¯\_(ツ)_/¯ Are You Actually?
- In light of recent events, it is clear the_donald is not welcome on Reddit. Therefore, we have decided
- O_o


In [39]:
clck.shape[0]

58394

In [40]:
clck = clck.drop(labels=indices, axis=0)

In [41]:
clck

Unnamed: 0,created_utc,num_comments,score,src,title,len
0,,,,github,"""1 Indian + 1 Indian = Unrelatable"": Televisio...",64
1,,,,github,"""10 Best Foods to Eat When Youre Sick"" by Cath...",59
2,,,,github,"""100 Best Jobs in America""",26
3,,,,github,"""22 Jump Street"" Directors Call Jonah Hill's H...",80
4,,,,github,"""22 Jump Street"" Is One Of The Most Self-Aware...",64
5,,,,github,"""25 Cities Where Your Paycheck Stretches The F...",54
6,,,,github,"""45 Unbelievable Behind-The-Scenes Stories Fro...",83
7,,,,github,"""69 Love Songs"" Ranked By How Much They Make Y...",59
8,,,,github,"""A Most Violent Year"" Pulls A Reverse ""Godfather""",49
9,,,,github,"""A Potato Flew Around My Room"" Is The World's ...",64


### News

In [42]:
news_del_char = {
    '{',
    '}',
    'Ă'
}

In [43]:
# Collect the labels of all rows whose title contains at least one of the
# characters listed in `news_del_char` (one pass over the titles).
contains_del_char = news['title'].map(
    lambda title: any(char in title for char in news_del_char)
)

indices = sorted(news.index[contains_del_char.values])
pprint(indices)

[41821, 58133]


In [44]:
for i in indices:
    print('-', news['title'][i])

- Rival gangs Crips, Bloods and EsĂŠs declare peace in Los Angeles
- {{ videoEntry.nbcsports USD titleSEO || videoEntry.title }}


In [45]:
news.shape[0]

58396

In [46]:
news = news.drop(labels=indices, axis=0)

In [47]:
news

Unnamed: 0,created_utc,num_comments,score,src,title,len
0,1.319750e+09,120.0,1311.0,reddit,'I Wish I Had Gone Out With Him Last Night': ...,274
1,1.346819e+09,387.0,1507.0,reddit,Anonymous group has allegedly hacked Romney ta...,88
2,1.298748e+09,375.0,1217.0,reddit,POLICE:State Capitol of Wisconsin: 'We have be...,226
3,1.352131e+09,81.0,1592.0,reddit,The Electronic Frontier Foundation (EFF) filed...,233
4,,,,cc2017,"!Sdrawkcab: Missy Elliott, the Beatles and the...",70
5,1.420860e+09,280.0,1964.0,reddit,""" No more bullshit"" Policy declared by new Dis...",70
6,1.450793e+09,7281.0,5665.0,reddit,"""... in our quest to be tolerant of everything...",172
7,,,,github,""".asia"" domain applications near 300,000 on op...",67
8,1.474300e+09,840.0,2083.0,reddit,"""60 percent of NBA players file for bankruptc...",133
9,,,,github,"""7th Heaven"" television series comes to an end",46


## Delete items to balance the datasets

In [48]:
diff = clck.shape[0] - news.shape[0]
diff

-4

In [49]:
# Re-balance the two classes after the deletions above.
if diff < 0:
    # `news` is larger: drop the |diff| lowest-scored reddit titles.
    news = news.drop(
        list(news[news['src'] == 'reddit'].sort_values(by='score')[:abs(diff)].index)
    )
elif diff > 0:
    # `clck` is larger: drop |diff| randomly chosen rows.
    # Sample from the actual index labels.  Rows have been dropped since the
    # index was last renumbered, so positions 0..n-1 no longer match the
    # labels and could name rows that do not exist (KeyError).
    # NOTE(review): the choice is unseeded, so this branch is not reproducible.
    clck = clck.drop(
        np.random.choice(clck.index, size=abs(diff), replace=False)
    )

In [50]:
clck.shape == news.shape  # check if the datasets are balanced

True

## Output data to csv format

In [51]:
# Write only the cleaned titles, one CSV file per class, into `data_out`.
# NOTE(review): index=None presumably behaves like index=False (row labels
# are not written) -- confirm for the pandas version in use.
clck['title'].to_csv(f'{data_out}/clck.csv', index=None)
news['title'].to_csv(f'{data_out}/news.csv', index=None)

In [52]:
# Write the complete frames (all columns, including the derived 'len'),
# one CSV file per class, into `data_out`.
# NOTE(review): index=None presumably behaves like index=False (row labels
# are not written) -- confirm for the pandas version in use.
clck.to_csv(f'{data_out}/clck_full.csv', index=None)
news.to_csv(f'{data_out}/news_full.csv', index=None)