In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# If you haven't already done so, execute:
#import nltk
#nltk.download('punkt')

In [None]:
<h2>Count Vectorizer with Stemming Function</h2>

According to the source code, "if analyzer is used, only the decoder argument is used, as the analyzer is intended to replace the preprocessor, tokenizer, and ngrams steps."

Here, we will use a custom function for the analyzer parameter instead of using 'word'. This causes the token_pattern parameter to be ignored (as per the documentation). The same is true for stop_words.

In [None]:
# Build a CountVectorizer subclass that stems every token it produces.
# Used stack overflow post as guide:
# https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

stemmer = PorterStemmer()


class CountVectorizerStemmed(CountVectorizer):
    """CountVectorizer whose analyzer Porter-stems each token.

    CountVectorizer obtains its analyzer by calling ``build_analyzer()``, so
    the stemming step has to live in an override with exactly that name; a
    method with any other name is never invoked during ``fit``/``transform``.
    """

    def build_analyzer(self):
        """Return a callable that maps a raw document to a list of stemmed tokens."""
        # The parent analyzer is built from this instance's own parameters, so
        # its output is what gets stemmed.
        analyzer = super(CountVectorizerStemmed, self).build_analyzer()
        return lambda text: [stemmer.stem(w) for w in analyzer(text)]

    # Backward-compatible alias for the method's original name.
    custom_analyzer = build_analyzer

# Fit the subclassed vectorizer on the job descriptions.
# token_pattern only matches runs of ASCII letters.
cv_stem = CountVectorizerStemmed(
    stop_words="english",
    lowercase=True,
    min_df=2,
    token_pattern=r"[a-zA-Z]+",
)
vector_stem = cv_stem.fit_transform(df_data["jobdescription"])

Including token_pattern parameter here did result in the removal of numbers from features.

In [None]:
# Dimensions of the matrix returned by fit_transform above.
vector_stem.shape

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert the slice to a plain list for printing.
print(cv_stem.get_feature_names_out()[:20].tolist())

In [None]:
# Dense view of the count matrix, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
counts_stem = pd.DataFrame(vector_stem.toarray(), columns=cv_stem.get_feature_names_out())
counts_stem.head()

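It is clear from the feature names above that, when this cell was originally run, none of the tokens were stemmed. `CountVectorizer` only ever calls the method named `build_analyzer`, so stemming logic placed in a method with any other name is never invoked.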

In [None]:
# Token pattern as a plain regex string: runs of ASCII letters only.
# (re.search() needs a string to search and raised TypeError here; a pattern
# string is the form CountVectorizer's token_pattern parameter takes.)
token_pattern = r"[a-zA-Z]+"

# list of punctuation
punc = string.punctuation

# try combining stopwords and punctuation together
user_defined_stop_words = ['st', 'rd', 'hong', 'kong']
i = nltk.corpus.stopwords.words('english')   # NLTK English stop words
j = list(punc) + user_defined_stop_words     # punctuation marks + custom tokens

# NOTE: from here on `stopwords` is a plain set, not the nltk.corpus.stopwords
# reader, so it has no .words() method.
stopwords = set(i).union(j)

# or try this: NLTK's English stop-word list on its own. Fetched through
# nltk.corpus because `stopwords` was rebound to a set just above.
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
# Second attempt: hand CountVectorizer a plain function as its analyzer.

stemmer = PorterStemmer()
# Analyzer of a CountVectorizer created with default parameters.
analyzer = CountVectorizer().build_analyzer()


def stemmed_words(doc):
    """Return a lazy iterator over the Porter stem of each token `analyzer` finds in `doc`."""
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

# NOTE: because `analyzer` is a callable, CountVectorizer does not use
# stop_words, lowercase or token_pattern here; tokenization is decided
# entirely by `stemmed_words`. The arguments are kept as in the original
# experiment.
cv_stem2 = CountVectorizer(analyzer=stemmed_words,
                           stop_words='english',
                           lowercase=True,
                           min_df=2,
                           token_pattern=r"[a-zA-Z]+")
count_vector2 = cv_stem2.fit_transform(df_data['jobdescription'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert the slice to a plain list for printing.
print(cv_stem2.get_feature_names_out()[:20].tolist())

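The code above is noticeably slow. Note that it uses a generator expression rather than a lambda; the cost most likely comes from running the Porter stemmer on every token of every document in pure Python.
Including the token_pattern parameter here did not restrict the features to words only (numbers are still there): when analyzer is a callable, token_pattern is ignored, and the tokens come from the default-configured analyzer used inside `stemmed_words`.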

In [None]:
# Dense view of the count matrix, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
counts_stem2 = pd.DataFrame(count_vector2.toarray(), columns=cv_stem2.get_feature_names_out())
counts_stem2.head()