In [1]:
import numpy as np
import matplotlib.patches as pat
import matplotlib.pyplot as plt
import pandas as pd
import string
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
# Input CSV to process and the .npz file that receives the resulting matrix.
FILE_IN, FILE_OUT = 'data/test.csv', 'data/test_tfidf.npz'

In [3]:
# Read the CSV named by FILE_IN and hold it as a plain NumPy array.
data = np.array(pd.read_csv(FILE_IN).values)

# Split by column position: the first two columns (id & text) go to X,
# every remaining column (classes) goes to Y.
X, Y = data[:, :2], data[:, 2:]

In [4]:
# Token normalisers: a Porter stemmer and a WordNet lemmatiser.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

# Translation table that maps every punctuation character except the
# apostrophe, plus every decimal digit, to a single space.
punctuations = string.punctuation.replace('\'','') + "0123456789"
outtab = " " * len(punctuations)
trantab = str.maketrans(punctuations, outtab)

# Download the 'wordnet' NLTK package; the call's return value is the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def clean(s, max_word_len=50):
    """Normalise one raw text into a space-separated string of stemmed tokens.

    The text is lower-cased, passed through the ``trantab`` translation
    table and split on whitespace.  Tokens whose length is ``max_word_len``
    or more are discarded; each remaining token is lemmatised with
    ``lemmatiser`` and the result is stemmed with ``stemmer``.

    Parameters
    ----------
    s : str
        Raw text.  Any non-string value (for example the float NaN that
        pandas produces for an empty CSV field) is treated as an empty
        document instead of raising ``AttributeError``.
    max_word_len : int, optional
        Exclusive upper bound on the length of a token that is kept.
        Defaults to 50.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` when the input
        is not a string or no token survives.
    """
    if not isinstance(s, str):
        # Missing text has no .lower(); returning "" lets the caller keep
        # exactly one output entry per input entry.
        return ""
    tokens = s.lower().translate(trantab).split()
    # Drop over-long tokens before the comparatively expensive stemming step.
    kept = [w for w in tokens if len(w) < max_word_len]
    stems = [stemmer.stem(lemmatiser.lemmatize(w)) for w in kept]
    return " ".join(stems)

print('Cleaning the data...')
# Apply clean to every entry of column 1 of X, keeping the original order.
XC = list(map(clean, X[:,1]))

Cleaning the data...


In [6]:
# Build the TF-IDF vectoriser with every non-default option passed up front.
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

tokenizer = TreebankWordTokenizer()
vect = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,  # tokenise with TreebankWordTokenizer
    stop_words='english',          # remove English stop words
    ngram_range=(1, 2),            # include 1-grams and 2-grams
    max_df=0.5,                    # ignore terms in more than 50% of the documents
    min_df=1,                      # keep terms that appear in at least 1 document
)

# Echo the configured estimator as the cell output.
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method TreebankWordTokenizer.tokenize of <nltk.tokenize.treebank.TreebankWordTokenizer object at 0x7efd766edba8>>,
        use_idf=True, vocabulary=None)

In [7]:
# Fit the vectoriser on the cleaned texts and transform them in the same call.
# NOTE(review): the vocabulary is learned from the texts read from FILE_IN; if these
# features feed a model fitted on a different file, confirm the two feature spaces
# are meant to match.
vect_out = vect.fit_transform(XC)

In [8]:
# Persist the TF-IDF matrix to FILE_OUT in SciPy's .npz format.
# The submodule is imported explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded on every SciPy release, so the attribute lookup
# would only succeed if some other import had already pulled it in.
import scipy.sparse
scipy.sparse.save_npz(FILE_OUT, vect_out)