# Loading the Data

In [141]:
import pandas as pd
import numpy as np
import string
import re

from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [142]:
# You may have to do this to get this project to run
# nltk.download()

In [143]:
train = pd.read_csv('train.csv', names=['target', 'text', 'description'])
test = pd.read_csv('test.csv', names=['target', 'text', 'description'])

train.head()

Unnamed: 0,target,text,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [144]:
def preprocess(data: pd.DataFrame, col: str, inplace: bool = False) -> pd.DataFrame:
    """ Performs the following actions:
            1. Removes all punctionation
            2. Removes stop words and common words
            3. Removes symbols
            4. Removes URLs
            5. Remove numbers
    """
    if not inplace:
        frame = data.copy()
    else:
        frame = data

    text = frame[col]
    text = _remove_url(text)
    text = _remove_non_chars(text)
    text = _remove_common_words(text)
    frame[col] = text
    return frame
    
def _remove_url(column: pd.Series) -> pd.Series:
    """ Removes all URLs from the data """
    url_match = re.compile("https?:\/\/\S+")
    return column.apply(lambda x: re.sub(url_match, " ", x))
    
def _remove_non_chars(column: pd.Series) -> pd.Series:
    """ Removes all non-characters from the string, including punctionation and numbers """
    char_match = re.compile("[^a-z\s\']+")
    return column.apply(lambda x: re.sub(char_match, " ", x.lower()).replace("'", ""))

def _remove_common_words(column: pd.Series) -> pd.Series:
    """ Removes the most common words that appear in the documents """
    vocab = set()
    [vocab.update(v.split()) for v in column]
    vector = CountVectorizer(max_df=0.95)  
    x = vector.fit_transform(column)
    common = vocab - set(vector.get_feature_names())  # Get words that appear more than 95% of the time
    if vocab:
        return _remove_stopwords(column, common)
    else:
        return _remove_stopwords(column)

def _remove_stopwords(column: pd.Series, common: set = None) -> pd.Series:
    """ Removes stopwords """
    words = set(stopwords.words('english'))
    if common:
        words.update(common)
    
    return column.apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in words]))

In [145]:
train = preprocess(train, col='text')
train.head()

Unnamed: 0,target,text,description
0,3,wall st bears claw back black reuters,"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,carlyle looks toward commercial aerospace reuters,Reuters - Private investment firm Carlyle Grou...
2,3,oil economy cloud stocks outlook reuters,Reuters - Soaring crude prices plus worries\ab...
3,3,iraq halts oil exports main southern pipeline ...,Reuters - Authorities have halted oil export\f...
4,3,oil prices soar time record posing new menace ...,"AFP - Tearaway world oil prices, toppling reco..."


In [146]:
test = preprocess(test, col='text')
test.head()

Unnamed: 0,target,text,description
0,3,fears pension talks,Unions representing workers at Turner Newall...
1,4,race second private team sets launch date huma...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,ky company wins grant study peptides ap,AP - A company founded by a chemistry research...
3,4,prediction unit helps forecast wildfires ap,AP - It's barely dawn when Mike Fitzpatrick st...
4,4,calif aims limit farm related smog ap,AP - Southern California's smog-fighting agenc...


In [168]:
def get_top_per_class(data: pd.DataFrame, group: str, col: str, per: int = 1100) -> set:
    """ Gets the top per words from each class and returns the total vocab, in no particular order """
    vocab = set()
    for group, frame in train.groupby(group):
        class_vocab = []
        [class_vocab.extend(word_tokenize(x)) for x in frame[col]]
        freq = FreqDist(class_vocab)
        vocab.update(sorted(freq, key=freq.get)[:per])
    return vocab

In [169]:
top = get_top_per_class(train, 'target', 'text')

(4271,
 {'illness',
  'requirements',
  'investment',
  'hugo',
  'reckless',
  'mattos',
  'fcast',
  'turnaround',
  'tramps',
  'abia',
  'elevens',
  'sunspot',
  'patronage',
  'costing',
  'tutu',
  'vessels',
  'prebiotic',
  'nevada',
  'wyoming',
  'giles',
  'miraculous',
  'strained',
  'throughout',
  'ceremoniously',
  'quitter',
  'okd',
  'expodes',
  'harmison',
  'kayaking',
  'soho',
  'weeklong',
  'levin',
  'timetable',
  'cbc',
  'salad',
  'flutie',
  'adams',
  'reforma',
  'mizuki',
  'irrigated',
  'gulfs',
  'ohios',
  'koalas',
  'berkleigh',
  'dubs',
  'grizzlies',
  'resurgence',
  'hurls',
  'travers',
  'curved',
  'regex',
  'hidayat',
  'seeds',
  'frown',
  'tbilisi',
  'leaped',
  'fragmented',
  'pawnshop',
  'necessarily',
  'powersharing',
  'definite',
  'isolation',
  'modestly',
  'capps',
  'chefs',
  'zvidauri',
  'hug',
  'kilborn',
  'kerri',
  'mania',
  'achievements',
  'dirty',
  'estado',
  'exceeds',
  'zahringer',
  'confront',
  'g