In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Relative path to the training CSV.
training_data_filename = "../raw_data/fulltrain.csv"

# Column names are supplied explicitly, so the first line of the file is
# read as data (header=None) rather than as a header row.
df_train = pd.read_csv(
    training_data_filename,
    header=None,
    names=["label", "document"],
)

In [15]:
# Features come from the "document" column, targets from the "label" column.
X_train, y_train = df_train["document"], df_train["label"]

### TF-IDF
Each row is a vector of length = vocab size
* ngram_range=(min, max) e.g. (1, 3) would use unigrams, bigrams and trigrams
* lowercase=True by default
* uses word ngrams by default
* stop_words=None by default, but 'english' can be selected (WARNING: 'english' has some unresolved issues)
* max_df=<float in range [0.0, 1.0]> ignores words with document frequency higher than max_df (setting it to a value in the range (0.7, 1.0) will automatically filter out stopwords based on document frequency)
* min_df is the opposite of max_df: it ignores words with document frequency lower than min_df
* max_features builds the vocabulary from only the top max_features terms, ordered by term frequency across the corpus

In [32]:
# Unigram-only TF-IDF: terms whose document frequency exceeds 0.7 are
# ignored, and the vocabulary is capped at 500 terms.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=0.7,
    max_features=500,
)

# Fit the vectorizer on the training documents and vectorize them in the
# same call.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [33]:
# Sanity check: (number of documents, number of TF-IDF features).
X_train_tfidf.shape

(48854, 500)

In [34]:
# Show the terms kept in the fitted vocabulary.
tfidf_vectorizer.get_feature_names_out()

array(['000', '10', '11', '20', '30', 'able', 'about', 'according',
       'across', 'act', 'action', 'actually', 'added', 'administration',
       'after', 'again', 'against', 'ago', 'air', 'al', 'all', 'almost',
       'along', 'already', 'also', 'always', 'am', 'america', 'american',
       'americans', 'among', 'an', 'another', 'anti', 'any', 'anyone',
       'anything', 'appeared', 'are', 'area', 'around', 'article',
       'articles', 'as', 'asked', 'at', 'attack', 'available', 'away',
       'back', 'bank', 'banks', 'based', 'be', 'because', 'become',
       'been', 'before', 'behind', 'being', 'believe', 'best', 'better',
       'between', 'big', 'bill', 'billion', 'book', 'both', 'business',
       'but', 'by', 'call', 'called', 'came', 'campaign', 'can', 'cancer',
       'cant', 'care', 'case', 'cause', 'center', 'central', 'change',
       'children', 'china', 'citizens', 'city', 'claims', 'clear',
       'clinton', 'com', 'come', 'comes', 'coming', 'companies',
       'comp