In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
import glob
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from fit_generator import label_from_id
import scipy
import util

In [3]:
source_path = '../converted/'

In [4]:
txt_files = glob.glob(source_path + '199*/*')
len(txt_files)

202399

In [5]:
util.tokenize_and_stem("to be or not to be that 1234 2017 is the question skiing-results question question")

['not', 'that', 'the', 'question', 'ski', 'result', 'question', 'question']

In [6]:
count_vectorizer_with_stem = CountVectorizer(
    input='filename',
    stop_words='english',
    tokenizer=util.tokenize_and_stem,
    ngram_range=(1, 1),
#     max_features=10000,
    max_df=0.90,
    min_df=3
)

In [7]:
doc_term = count_vectorizer_with_stem.fit_transform(txt_files)

In [8]:
len(count_vectorizer_with_stem.vocabulary_)

84168

In [9]:
count_vectorizer_with_stem.vocabulary_

{'lukashenko': 43822,
 'stir': 71505,
 'crowd': 17221,
 'talk': 73476,
 'chernobyl': 13486,
 'president': 59040,
 'alexander': 2133,
 'face': 24683,
 'chorus': 13847,
 'criticism': 17128,
 'abroad': 371,
 'turn': 77045,
 'people': 56530,
 'ecstatic': 22101,
 'tuesday': 76890,
 'exploit': 24481,
 'painful': 55063,
 'memory': 46933,
 'belarus': 7451,
 'world': 82404,
 'war': 80936,
 'nuclear': 52629,
 'disaster': 20113,
 'unite': 78213,
 'state': 71115,
 'european': 24052,
 'official': 53191,
 'russia': 64700,
 'ukraine': 77384,
 'express': 24513,
 'concern': 15514,
 'constitutional': 15909,
 'row': 64336,
 'pit': 57616,
 'parliament': 55565,
 'institution': 35717,
 'year': 82953,
 'old': 53491,
 'leader': 41955,
 'support': 72603,
 'dear': 18366,
 'referendum': 62019,
 'november': 52473,
 'vote': 80552,
 'live': 43119,
 'future': 27632,
 'child': 13644,
 'say': 66031,
 'seek': 67008,
 'approval': 3896,
 'new': 51398,
 'draft': 21159,
 'extend': 24535,
 'term': 74342,
 'tighten': 75236,


In [10]:
(NUM_ARTICLES, NUM_WORDS) = doc_term.shape

In [11]:
count_vectorizer_with_stem.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'filename',
 'lowercase': True,
 'max_df': 0.9,
 'max_features': None,
 'min_df': 3,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': <function util.tokenize_and_stem>,
 'vocabulary': None}

In [12]:
term_frec = doc_term.sum(axis=0)
frec_indexes = np.asarray(term_frec.argsort())[0][::-1]

In [13]:
frecuent_words = np.array(count_vectorizer_with_stem.get_feature_names())[frec_indexes][:1000]
frecuent_words

array(['say', 'year', 'percent', 'government', 'pct', 'bank', 'new',
       'million', 'state', 'minister', 'month', 'party', 'price',
       'billion', 'make', 'tell', 'trade', 'official', 'time', 'tax',
       'win', 'president', 'economic', 'week', 'market', 'country', 'meet',
       'union', 'plan', 'election', 'expect', 'rate', 'budget', 'day',
       'rise', 'end', 'bond', 'lead', 'foreign', 'issue', 'world',
       'people', 'leader', 'wednesday', 'second', 'growth', 'change',
       'report', 'tuesday', 'rat', 'european', 'central', 'late', 'house',
       'good', 'early', 'thursday', 'national', 'high', 'sale', 'add',
       'cut', 'point', 'vote', 'rule', 'bln', 'finance', 'economy',
       'friday', 'monday', 'group', 'newsroom', 'inflation', 'include',
       'come', 'work', 'company', 'policy', 'right', 'prime', 'low',
       'hold', 'political', 'member', 'increase', 'talk', 'set', 'leave',
       'unite', 'start', 'international', 'currency', 'want', 'deficit',
       'f

In [14]:
count_vectorizer_with_stem.stop_words_

{'confederaiton',
 'larco',
 'commnunity',
 'vastrup',
 'buiterprofessor',
 'mitteleuropa',
 'destablised',
 'crownstournament',
 'ardmorelast',
 'bereturned',
 'windesmond',
 'meddlesome',
 'zippo',
 'malcolmson',
 'haywards',
 'sorayuth',
 'andequipping',
 'despuite',
 'optimitic',
 'hinchev',
 'giove',
 'brookvale',
 'acceptbids',
 'pedagogy',
 'despitehaving',
 'yapusham',
 'pelowski',
 'underhandedness',
 'hvidsteen',
 'scheduledsunday',
 'masachusettsre',
 'brainware',
 'dgiv',
 'lesuirely',
 'montmelo',
 'maturityat',
 'telpek',
 'jea',
 'musyoki',
 'antigovernment',
 'lithuanialatvia',
 'turlough',
 'tesobono',
 'abortedtransmission',
 'kliore',
 'trompe',
 'chilblain',
 'mohammedodds',
 'dybkj',
 'lelouis',
 'radosevic',
 'standardisedinternational',
 'saiddoohan',
 'tighther',
 'forecastcommissioned',
 'nympheas',
 'pramilla',
 'ady',
 'quaterly',
 'thanr',
 'lacq',
 'khemani',
 'wagi',
 'rabbitohs',
 'volvos',
 'caminata',
 'statistik',
 'courtay',
 'ccourse',
 'nedzad',
 'e

In [15]:
scipy.io.mmwrite('countvect-articles', doc_term)

In [16]:
doc_term

<202399x84168 sparse matrix of type '<class 'numpy.int64'>'
	with 19996058 stored elements in Compressed Sparse Row format>

In [17]:
(NUM_ARTICLES,NUM_WORDS)

(202399, 84168)

In [18]:
count_vectorizer_with_stem.get_feature_names()

['________________________________________________________________',
 'aaa',
 'aaaa',
 'aaaaa',
 'aaaambac',
 'aaabaa',
 'aaaconnie',
 'aaadated',
 'aaadelivery',
 'aaadtd',
 'aaadue',
 'aaafgic',
 'aaafnma',
 'aaafsa',
 'aaainsurance',
 'aaalong',
 'aaamaturity',
 'aaambia',
 'aaano',
 'aaanotes',
 'aaapermanent',
 'aaasale',
 'aaaund',
 'aaaunder',
 'aaaunderlying',
 'aabpara',
 'aaccess',
 'aachen',
 'aaconfirmed',
 'aad',
 'aadated',
 'aadelivery',
 'aadue',
 'aaeu',
 'aafgic',
 'aafitch',
 'aafli',
 'aafsa',
 'aage',
 'aah',
 'aai',
 'aajkal',
 'aal',
 'aalame',
 'aalborg',
 'aalcohol',
 'aalst',
 'aalto',
 'aaltonen',
 'aam',
 'aama',
 'aamaturity',
 'aambac',
 'aambia',
 'aamerican',
 'aami',
 'aamir',
 'aamodt',
 'aamp',
 'aamulehti',
 'aan',
 'aand',
 'aandewiel',
 'aanz',
 'aapo',
 'aapr',
 'aaqib',
 'aarau',
 'aardvark',
 'aare',
 'aarebrot',
 'aarej',
 'aarhus',
 'aaron',
 'aarp',
 'aart',
 'aartsen',
 'aasale',
 'aase',
 'aashish',
 'aat',
 'aattentionthe',
 'aau',
 'aaund

In [19]:
from sklearn.externals import joblib

In [20]:
joblib.dump(count_vectorizer_with_stem, 'count_vectorizer_with_stem.pkl') 

['count_vectorizer_with_stem.pkl']