In [6]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import glob
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from fit_generator import label_from_id
import scipy
import util

In [8]:
# Directory whose entries are listed into txt_files. The trailing slash matters:
# the glob pattern is built by plain string concatenation (source_path + '*').
source_path = '../converted/'

In [9]:
# List every entry under source_path.
# glob.glob() returns matches in arbitrary, filesystem-dependent order. The row
# order of the document-term matrix and of the label matrix both follow
# txt_files, and neither saved artifact stores the file list, so sort to make the
# row <-> file mapping reproducible across runs and machines.
txt_files = sorted(glob.glob(source_path + '*'))
len(txt_files)

104785

In [10]:
# Sanity-check the project tokenizer/stemmer on a sample sentence containing
# numbers, a hyphenated compound and repeated words.
util.tokenize_and_stem("to be or not to be that 1234 2017 is the question skiing-results question question")

['not', 'that', 'the', 'question', 'ski', 'result', 'question', 'question']

In [7]:
# Bag-of-words vectorizer over stemmed unigrams.
#   * input='filename': fit/transform receive file paths and read the files themselves.
#   * tokenizer: the project's tokenize_and_stem helper replaces the default tokenizer.
#   * min_df / max_df: drop terms found in fewer than 3 documents or in more than
#     90% of documents.
#   * max_features is left unset, so the vocabulary size is not capped
#     (a cap of 10000 was tried earlier and disabled).
count_vectorizer_with_stem = CountVectorizer(
    input='filename',
    tokenizer=util.tokenize_and_stem,
    stop_words='english',
    ngram_range=(1, 1),
    min_df=3,
    max_df=0.90,
)

In [8]:
# Learn the vocabulary and build the document-term count matrix in one call.
# The vectorizer is configured with input='filename', so every path in txt_files
# is read from disk here; rows of doc_term follow the order of txt_files.
doc_term = count_vectorizer_with_stem.fit_transform(txt_files)

In [9]:
# Number of distinct terms kept in the fitted vocabulary.
len(count_vectorizer_with_stem.vocabulary_)

60487

In [10]:
# Fitted mapping of term -> column index in doc_term.
# NOTE(review): this displays the entire mapping (tens of thousands of entries);
# consider showing only a small sample to keep the notebook readable.
count_vectorizer_with_stem.vocabulary_

{'depp': 13745,
 'true': 55105,
 'rebel': 44271,
 'soul': 50485,
 'cannes': 8367,
 'director': 14378,
 'debut': 13188,
 'hollywood': 23929,
 'johnny': 27341,
 'step': 51330,
 'camera': 8248,
 'film': 18691,
 'share': 48772,
 'act': 480,
 'honour': 24042,
 'marlon': 32839,
 'brando': 7062,
 'love': 31502,
 'story': 51550,
 'brave': 7100,
 'competition': 10958,
 'festival': 18508,
 'saturday': 47375,
 'raphael': 43965,
 'native': 36540,
 'american': 1896,
 'determine': 13932,
 'pull': 43143,
 'family': 18034,
 'hovel': 24236,
 'california': 8159,
 'rubbish': 46378,
 'dump': 15525,
 'maker': 32230,
 'underground': 55899,
 'snuff': 50125,
 'opportunity': 38670,
 'price': 42483,
 'life': 30878,
 'promise': 42819,
 'die': 14175,
 'write': 59335,
 'mind': 34531,
 'dare': 12963,
 'speak': 50642,
 'role': 45999,
 'friend': 19786,
 'mentor': 33889,
 'don': 15004,
 'juan': 27502,
 'volunteer': 57865,
 'day': 13084,
 'shoot': 49079,
 'make': 32227,
 'lot': 31453,
 'money': 35188,
 'catch': 8824,
 

In [11]:
# Number of label columns; used as the second dimension of the label matrix Y.
NUM_CATEGORIES = 5
# Dimensions of the document-term matrix: one row per file, one column per term.
NUM_ARTICLES, NUM_WORDS = doc_term.shape

In [12]:
# Echo the vectorizer's effective settings for the record.
# NOTE(review): the saved output lists the tokenizer as `__main__.tokenize_and_stem`,
# while the constructor cell passes `util.tokenize_and_stem` — the output appears to
# come from an earlier kernel state; re-run from a fresh kernel to confirm.
count_vectorizer_with_stem.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'filename',
 'lowercase': True,
 'max_df': 0.9,
 'max_features': None,
 'min_df': 3,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': <function __main__.tokenize_and_stem>,
 'vocabulary': None}

In [13]:
# Column sums: total number of occurrences of each term over all documents.
term_frec = doc_term.sum(axis=0)
# Column indices ordered from the most frequent term to the least frequent one
# (ascending argsort of the single row, read back to front).
frec_indexes = np.asarray(term_frec.argsort())[0, ::-1]

In [14]:
# The 1000 most frequent terms, most frequent first.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new accessor when it exists and fall
# back to the old one so the cell runs on either version.
if hasattr(count_vectorizer_with_stem, 'get_feature_names_out'):
    feature_names = count_vectorizer_with_stem.get_feature_names_out()
else:
    feature_names = count_vectorizer_with_stem.get_feature_names()
# list(...) normalises both return types (list / object ndarray) to a plain list of
# str, so the resulting array has a unicode dtype on every version. Slicing the
# index array first avoids materialising a full-vocabulary copy just to keep 1000.
frecuent_words = np.array(list(feature_names))[frec_indexes[:1000]]
frecuent_words

array(['say', 'year', 'percent', 'government', 'pct', 'bank', 'new',
       'million', 'state', 'minister', 'party', 'month', 'billion',
       'price', 'make', 'tell', 'tax', 'trade', 'win', 'time', 'official',
       'market', 'economic', 'president', 'week', 'country', 'plan',
       'meet', 'election', 'union', 'expect', 'budget', 'day', 'rise',
       'rate', 'lead', 'end', 'bond', 'foreign', 'leader', 'world',
       'issue', 'people', 'european', 'report', 'wednesday', 'second',
       'growth', 'change', 'central', 'house', 'late', 'tuesday', 'rat',
       'add', 'good', 'thursday', 'point', 'high', 'national', 'finance',
       'march', 'early', 'sale', 'rule', 'friday', 'vote', 'cut', 'monday',
       'group', 'right', 'economy', 'policy', 'come', 'newsroom',
       'include', 'work', 'prime', 'bln', 'company', 'inflation',
       'political', 'leave', 'hold', 'low', 'start', 'set', 'unite',
       'labour', 'increase', 'april', 'want', 'member', 'talk', 'fund',
       'curre

In [15]:
# Terms the fitted vectorizer discarded; per scikit-learn, these are the terms cut
# by the max_df / min_df (and max_features) filters.
# NOTE(review): this displays the whole set, which is very large; consider
# showing len(...) or a small sample instead.
count_vectorizer_with_stem.stop_words_

{'tootie',
 'marcoslm',
 'stengths',
 'nzgz',
 'frx',
 'yesdenmark',
 'laidley',
 'janosik',
 'regnier',
 'exaltation',
 'commissionspecial',
 'kraiem',
 'stinnett',
 'santam',
 'expansionsemi',
 'olympicism',
 'ifri',
 'kva',
 'iciec',
 'monthinterbank',
 'arafam',
 'futuremacquarie',
 'flatness',
 'adhoc',
 'barcelone',
 'equipmnent',
 'pervis',
 'catchcries',
 'repnote',
 'williems',
 'montville',
 'alderbrook',
 'sociales',
 'dokata',
 'mondriaanhuis',
 'missouriverbal',
 'grouillard',
 'treanor',
 'tupolov',
 'assaultkoen',
 'betico',
 'superfluously',
 'districtorange',
 'playdown',
 'kennydalglish',
 'candelaria',
 'elkinsf',
 'bernegger',
 'ekalabo',
 'henegouwen',
 'fmaily',
 'valiently',
 'featherbed',
 'tarcisio',
 'rossetlarsson',
 'massimilian',
 'lundestad',
 'moseneke',
 'modulate',
 'overzealously',
 'travaill',
 'hockeyroo',
 'boyolali',
 'mihalis',
 'exner',
 'terriffic',
 'bisguier',
 'toget',
 'centraliastate',
 'bondsbefore',
 'vicentepromoted',
 'projectscompetiti

In [16]:
# Build the label matrix: row i holds whatever label_from_id returns for
# txt_files[i], so Y lines up row-for-row with doc_term. The result is saved to
# 'y.npy' in the working directory.
#
# np.empty() leaves memory uninitialised, and the loop below is driven by
# txt_files rather than NUM_ARTICLES. If the two disagree (e.g. cells were re-run
# out of order) the saved array would contain garbage rows or the loop would
# overflow the array, so check explicitly before doing any work.
if len(txt_files) != NUM_ARTICLES:
    raise ValueError(
        'txt_files has %d entries but doc_term has %d rows; '
        're-run the vectorizer cells before building labels'
        % (len(txt_files), NUM_ARTICLES)
    )
Y = np.empty((NUM_ARTICLES, NUM_CATEGORIES))
for idx, file in enumerate(txt_files):
    Y[idx] = label_from_id(file)
np.save('y.npy', Y)

In [22]:
# Persist the document-term matrix in Matrix Market format.
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to load
# scipy.io, so without this the call only works if some other library happened to
# import it first.
import scipy.io
scipy.io.mmwrite('countvect-articles', doc_term)

In [19]:
# Display the matrix summary (shape, dtype, stored elements, sparse format).
doc_term

<104785x60487 sparse matrix of type '<class 'numpy.int64'>'
	with 10354788 stored elements in Compressed Sparse Row format>