## Preprocess the data
- Tokenize
- Get rid of punctuations... maybe keep hashtags, emoticons?
- Remove numbers, links, stopwords
- Convert uppercase to lowercase

In [77]:
import os
import re
import json
import pprint

from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import *

tweet_tok = TweetTokenizer()
port_stem = PorterStemmer()

sample_comments_path = os.path.join("..","data","spring_2017",
                               "boku_no_hero_academia_2nd_season",
                               "ep_1.json")

In [25]:
discussion_thread = json.load(open(sample_comments_path))

In [70]:
# Will need to find a better stop word removal method
# Use this for now
stop_words = [
    "a", "above", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "been", "but", "by",
    "can", "could",
    "did", "do", "does",
    "each",
    "few", "for", "from",
    "had", "has", "have", "he", "her", "here", "him","himself", "his", "how",
    "i", "if", "in", "is", "it", "its", "itself",
    "just",
    "let",
    "me", "my", "myself",
    "no", "nor",
    "of", "off", "on", "once", "only", "or", "our", "ourselves", "out",
    "she", "so", "such",
    "than", "that", "the", "their", "them", "then", "they", "this", "those", "to", "too",
    "under", "up",
    "very",
    "was", "we", "what", "when", "where", "who", "why", "with", "would",
    "you", "your", "yourself"    
]

# How to exploit emoticons? For now discard all non-characters
symbols = ["!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "-", "=",
           "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "_", "+",
           "<", ">", ",", ".", "\"", "'", "?", "/", "\\", ";", ":","~",
           "`"]

# How to detect if the token is a combination of non-characters?
# Want to remove non-alpha characters
# https://stackoverflow.com/questions/1276764/stripping-everything-but-alphanumeric-chars-from-a-string-in-python
# Note: See 1st answer, 2nd comment
non_alpha_pattern = "[^a-zA-Z]+"

# How to detect if it's a link?
# Don't delete link... make the link a feature?
link_pattern = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

In [78]:
discussion_links = []
preprocessed_comments = []
for comment in discussion_thread["comments"]:
    tokenized_comment = tweet_tok.tokenize(comment["text"])

    preprocessed_comment = []
    for tok in tokenized_comment:
        if re.match(link_pattern, tok):
            discussion_links.append(tok)
        else:
            if len(tok) <= 2:
                continue
            elif tok in stop_words:
                continue
            else:
                preprocessed_comment.append(tok.lower())

    preprocessed_comment = [port_stem.stem(tok) for tok in preprocessed_comment]
    preprocessed_comment = [re.sub(non_alpha_pattern, "", tok) for tok in preprocessed_comment]

    clean_comment = []
    for tok in preprocessed_comment:
        if len(tok) <= 2:
            continue
        elif tok in stop_words:
            continue
        else:
            clean_comment.append(tok)

    preprocessed_comments.append(clean_comment)


pprint.pprint(preprocessed_comments)
# pprint.pprint(discussion_links)
    

[['midnight',
  'fangirl',
  'over',
  'hero',
  'name',
  'ador',
  'best',
  'frog',
  'now',
  'best',
  'froppi'],
 ['froppi', 'life'],
 ['frog', 'girl', 'best', 'girl', 'hail', 'froppi'],
 ['fropsday', 'dude'],
 ['best', 'frog', 'alway', 'froppi', 'taken', 'until', 'now', 'rest', 'find'],
 ['get', 'sloppi', 'froppi'],
 [],
 ['youv', 'wait', 'month', 'post', 'havent'],
 ['more', 'like', 'save', 'forgot', 'about', 'until', 'now'],
 ['yiss', 'mutha', 'fuckin', 'froppi'],
 ['wait',
  'midoriya',
  'name',
  'right',
  'deku',
  'probabl',
  'best',
  'learn',
  'use',
  'quirk'],
 ['expect', 'name', 'plu', 'ultra'],
 ['think', 'some', 'might'],
 ['alt', 'right', 'tho'],
 ['dont',
  'blame',
  'uraraka',
  'call',
  'turd',
  'blossom',
  'becaus',
  'thought',
  'cute',
  'tell',
  'agenc',
  'make',
  'poster',
  'size',
  'build',
  'turd',
  'blossom'],
 ['right', 'except', 'hed', 'say', 'light'],
 [],
 ['honestli', 'expect', 'name', 'one'],
 ['hero',
  'name',
  'might',
  'suitab

  'white',
  'hair',
  'holi',
  'shit',
  'horikoshi',
  'name',
  'after',
  'clint',
  'eastwood',
  'gran',
  'torino'],
 ['wrong',
  'didnt',
  'watch',
  'star',
  'war',
  'read',
  'gran',
  'torino',
  'thought',
  'gran',
  'tesoro',
  'one',
  'piec',
  'gold'],
 ['think',
  'gran',
  'turismo',
  'didnt',
  'know',
  'gran',
  'torino',
  'nice',
  'catch'],
 ['horikoshi',
  'said',
  'inspir',
  'yoda',
  'big',
  'star',
  'war',
  'fan',
  'lot',
  'star',
  'war',
  'easter',
  'egg',
  'boku',
  'hero'],
 ['further', 'seri', 'there', 'boondock', 'saint', 'refer', 'most', 'like'],
 ['see',
  'get',
  'lot',
  'differ',
  'respons',
  'right',
  'horikoshi',
  'absolut',
  'master',
  'multifacet',
  'name'],
 ['also', 'author', 'base', 'charact', 'yoda', 'star', 'war'],
 ['think', 'master', 'korin', 'yoda', 'work'],
 ['also',
  'refer',
  'car',
  'which',
  'also',
  'appear',
  'movi',
  'ford',
  'gran',
  'torino',
  'use',
  'realli',
  'famou',
  'car',
  'usa',
 

  'becaus',
  'some',
  'anim',
  'year',
  'old',
  'sound',
  'like',
  'theyr'],
 ['gener',
  'start',
  'new',
  'show',
  'there',
  'dub',
  'avail',
  'ill',
  'watch',
  'both',
  'dub',
  'sub',
  'first',
  'coupl',
  'episod',
  'decid',
  'which',
  'one',
  'ill',
  'continu',
  'watch',
  'dub',
  'garbag',
  'pretti',
  'easi',
  'tell',
  'within',
  'coupl',
  'episod',
  'definit',
  'nicer',
  'abl',
  'watch',
  'without',
  'eye',
  'glu',
  'bottom',
  'screen',
  'sinc',
  'often',
  'watch',
  'anim',
  'while',
  'other',
  'thing',
  'far',
  'mha',
  'one',
  'actual',
  'gone',
  'back',
  'watch',
  'both',
  'version',
  'becaus',
  'much',
  'enjoy'],
 ['love', 'fmab'],
 ['enjoy',
  'fma',
  'not',
  'much',
  'mha',
  'watch',
  'fma',
  'dub',
  'mayb',
  'ill',
  'watch',
  'sub',
  'ever',
  'mood',
  'rewatch',
  'though',
  'gotta',
  'get',
  'through',
  'backlog',
  'first'],
 ['nice', 'think', 'didnt', 'watch', 'dub', 'mention', 'way'],
 ['yeah'

 ['dont', 'know', 'look', 'around', 'late', 'anim', 'commun', 'littl', 'weird'],
 ['nonsens', 'not', 'weird'],
 ['usernam', 'usernam', 'not', 'check'],
 ['notmymegumin'],
 ['nah', 'got', 'dynamight'],
 ['mayb', 'hiroshima', 'nagasaki', 'hide'],
 ['victorbomb'],
 ['shit', 'actual', 'pretti', 'good'],
 ['more', 'partial', 'bright'],
 ['haha', 'like', 'one'],
 ['brite', 'need', 'new', 'costum'],
 ['color', 'want', 'paint', 'world', 'blood', 'red'],
 ['should', 'name', 'crimson', 'demon', 'honour', 'megumin', 'meguminthumbsup'],
 ['actual',
  'friggin',
  'heresi',
  'megumin',
  'pov',
  'bakug',
  'use',
  'deton',
  'magic',
  'instead',
  'explos',
  'magic'],
 ['still', 'boom', 'end'],
 ['nope', 'sparki', 'sparki', 'boom', 'man'],
 ['think', 'ive', 'heard', 'chitogheh'],
 ['sokka', 'gloriou', 'bastard'],
 ['best', 'best'],
 ['sokka', 'boomerang', 'firebend', 'mind', 'phrase', 'weird'],
 ['gah', 'cursor', 'omg', 'remov', 'cursor', 'freak'],
 ['poor', 'soul', 'whomst', 'never', 'seen', 

  'preveiw',
  'might',
  'teacher',
  'mean',
  'one',
  'also',
  'definit',
  'same',
  'voic',
  'shamisen',
  'haruhui',
  'agasa',
  'case',
  'close'],
 ['tokoyami',
  'boy',
  'actual',
  'gain',
  'dark',
  'power',
  'never',
  'stop',
  'chuunibyo'],
 ['ask',
  'same',
  'question',
  'tokoyami',
  'megumin',
  'still',
  'chuunibyo',
  'power',
  'claim'],
 ['tsukuyomi',
  'mean',
  'guess',
  'portmanteau',
  'name',
  'word',
  'about',
  'dark',
  'shadow',
  'night'],
 ['tsukuyomi',
  'moon',
  'god',
  'shinto',
  'religion',
  'which',
  'appropri',
  'sinc',
  'tokoyami',
  'power',
  'stronger',
  'night'],
 ['amaz', 'simpl', 'geniu', 'best', 'bird', 'best'],
 ['name',
  'deeper',
  'mean',
  'horikoshi',
  'clever',
  'work',
  'pun',
  'into',
  'name',
  'both',
  'hero',
  'name',
  'character',
  'birth',
  'name',
  'look',
  'pretti',
  'much',
  'character',
  'name',
  'find',
  'mean',
  'tokoyami',
  'instanc',
  'mean',
  'etern',
  'dark'],
 ['man',
  '

  'hope',
  'keep',
  'qualiti'],
 ['might', 'say', 'hyperact', 'child'],
 ['mama', 'midoriya', 'incred', 'cute'],
 ['uchiha', 'iida'],
 ['wasnt', 'onli', 'one', 'got', 'sasuk', 'vibe'],
 ['name', 'definit', 'shouldv', 'mangeky', 'sharingan'],
 ['life', 'goal', 'kill', 'certain', 'man'],
 ['again'],
 ['got', 'like', 'weirdest', 'boner'],
 ['got', 'correctest', 'boner'],
 ['want', 'put', 'dick', 'insid'],
 ['know', 'believ'],
 ['except', 'alien', 'put', 'dick', 'insid'],
 ['long',
  'weird',
  'huge',
  'alien',
  'horsecock',
  'onli',
  'even',
  'more',
  'erect'],
 ['same'],
 ['love',
  'most',
  'class',
  'wasnt',
  'prepar',
  'attent',
  'classic',
  'iida',
  'punctual',
  'think',
  'hori',
  'intent',
  'throw',
  'shipper',
  'bone',
  'exhibit',
  'exhibit',
  'aka',
  'otp',
  'sero',
  'gonna',
  'perman',
  'trigger',
  'everytim',
  'hear',
  'dont',
  'worri',
  'about',
  'surpris',
  'came',
  'asui',
  'didnt',
  'realli',
  'down',
  'super',
  'jokey',
  'person',

  'give',
  'neg',
  'fuck',
  'shota',
  'god',
  'everyth',
  'give',
  'anxieti',
  'tamaki'],
 ['pull', 'word', 'right', 'mouth'],
 ['yeah', 'actual', 'confus', 'second'],
 ['wasnt', 'one'],
 ['nice',
  'see',
  'charact',
  'interact',
  'like',
  'regular',
  'classmat',
  'again',
  'after',
  'awesom',
  'tournament',
  'arc',
  'also',
  'great',
  'charact',
  'like',
  'kirishima',
  'taken',
  'lot',
  'more',
  'serious',
  'befor',
  'hope',
  'get',
  'see',
  'character',
  'internship',
  'progess',
  'befor',
  'realli',
  'scari',
  'shit',
  'gonna',
  'happen',
  'next',
  'arc',
  'mention',
  'previou',
  'episod'],
 ['mayb',
  'read',
  'into',
  'thing',
  'much',
  'found',
  'way',
  'bakugo',
  'hand',
  'whiteboard',
  'back',
  'deku',
  'strang',
  'meaning',
  'total',
  'expect',
  'someth',
  'ridicul',
  'like',
  'throw',
  'board',
  'back',
  'hit',
  'deku',
  'face',
  'instead',
  'calmli',
  'reach',
  'board',
  'back',
  'even',
  'wait',
  '

  'watch',
  'read',
  'through',
  'manga',
  'caught',
  'episod',
  'which',
  'think',
  'episod',
  'season',
  'time',
  'far',
  'manga',
  'actual',
  'read',
  'way',
  'through',
  'now',
  'follow',
  'everi',
  'new',
  'chapter'],
 ['good',
  'alway',
  'awesom',
  'hear',
  'about',
  'newcom',
  'got',
  'into',
  'anim',
  'first',
  'place',
  'there',
  'someth',
  'special',
  'about',
  'first',
  'anim',
  'watch',
  'rare',
  'experi',
  'again',
  'enjoy',
  'everi',
  'second',
  'bnha',
  'perfect',
  'entri',
  'seri',
  'imo',
  'classic',
  'shounen',
  'well',
  'thought',
  'stori',
  'balanc',
  'charact',
  'power',
  'slowli',
  'matur',
  'along',
  'way',
  'becom',
  'more',
  'more',
  'intric',
  'goe',
  'basic',
  'take',
  'good',
  'trope',
  'execut',
  'almost',
  'flawlessli',
  'while',
  'also',
  'own',
  'twist',
  'genr',
  'there',
  'horikoshi',
  'truli',
  'know',
  'great',
  'writer',
  'artist',
  'anoth',
  'anim',
  'except',
 

  'now',
  'though'],
 ['episod', 'season', 'episod', 'total'],
 ['shit', 'there', 'more', 'episod', 'hype'],
 ['prepar', 'hero', 'killer', 'arc', 'mayb', 'end', 'term', 'test', 'arc'],
 ['iida', 'deep', 'end', 'seem'],
 ['keep', 'iida', 'safe'],
 ['love',
  'gran',
  'torino',
  'arent',
  'also',
  'feel',
  'mha',
  'get',
  'even',
  'more',
  'amaz'],
 ['yep', 'fasten', 'hero', 'belt', 'bumpi', 'ride'],
 ['babi',
  'deku',
  'might',
  'onesi',
  'froppi',
  'cutest',
  'name',
  'bakug',
  'angri',
  'oddli',
  'endear',
  'doe',
  'ear',
  'jack',
  'girl',
  'lighten',
  'guy',
  'ship'],
 ['froppi', 'name', 'herself', 'froppi', 'heart', 'cant', 'take'],
 ['deku', 'forev', 'known', 'scrub'],
 ['sugarman',
  'hero',
  'dont',
  'deserv',
  'need',
  'sugarman',
  'wont',
  'bring',
  'back',
  'colour',
  'dream',
  'cant',
  'wait',
  'next',
  'week',
  'see',
  'clint',
  'eastwood',
  'hero',
  'academia',
  'season',
  'cour',
  'preview',
  'week',
  'break',
  'after',
  

 ['awesom',
  'episod',
  'wish',
  'there',
  'enough',
  'materi',
  'get',
  'least',
  'episod'],
 ['urav',
  'realli',
  'awesom',
  'name',
  'made',
  'hype',
  'dont',
  'know',
  'not',
  'even',
  'teacher',
  'someth',
  'stop',
  'iida',
  'mean',
  'pretti',
  'obviou',
  'after',
  'hero',
  'killer',
  'anoth',
  'note',
  'ill',
  'realli',
  'miss',
  'realli',
  'got',
  'use',
  'hope',
  'new',
  'one',
  'will',
  'good',
  'well',
  'plu',
  'ultra'],
 ['seem',
  'like',
  'way',
  'loop',
  'suppos',
  'air',
  'third',
  'season',
  'start',
  'next',
  'week',
  'there',
  'announc',
  'date',
  'suppos',
  'air',
  'still',
  'part',
  'second',
  'season'],
 ['nvm', 'got', 'answer', 'anoth', 'post'],
 ['thi',
  'show',
  'stereoyp',
  'shoen',
  'anim',
  'well',
  'ochaco',
  'preciou',
  'must',
  'protect',
  'midoriya',
  'deku',
  'becaus',
  'made',
  'heart',
  'burst'],
 ['chapter'],
 ['assum', 'watch', 'snippet', 'past', 'pick', 'right'],
 ['doe', 'a

In [82]:
x = "abc.json"
x.split(".")[0]

'abc'