# Setup

In [46]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
import glob
from nltk.stem import WordNetLemmatizer

# Data

In [47]:
path =r'./cleaned_data/' # use your path
allFiles = glob.glob(path + "/*.csv")
df = pd.DataFrame()

list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None)
    list_.append(df)

df = pd.concat(list_)
df.reset_index(inplace=True, drop=True)

In [48]:
df.head()

Unnamed: 0,timestamp,category_id,topic_id,topic_title,message_number,message_author,message_text,quoteheader
0,1514379921,57,2653937,16.5K will be the major resistance for BTC ?,1,AICoin_Official,"As you can see, price started a correction imm...",
1,1514380918,57,2653937,16.5K will be the major resistance for BTC ?,2,nokati,And&nbsp; who is this Kuang Ren with such a wi...,[]
2,1514381025,57,2653937,16.5K will be the major resistance for BTC ?,3,fabiorem,"Yes, theres a lot of resistance at this level,...",[]
3,1514382464,57,2653937,16.5K will be the major resistance for BTC ?,4,heringasem,With the current price fluctuations the price ...,[]
4,1514384271,57,2653937,16.5K will be the major resistance for BTC ?,5,CuDoCuDau,You should buy bitcoin,"['<div class=""quoteheader""><a href=""https://bi..."


In [49]:
df.tail()

Unnamed: 0,timestamp,category_id,topic_id,topic_title,message_number,message_author,message_text,quoteheader
41323,1411050454,57,787222,.,2,Hfertig,And back down to 8365 without a short squ...,"['<div class=""quoteheader""><a href=""https://bi..."
41324,1411050918,57,787222,.,3,klee,So what is their next move? Up?,[]
41325,1411051112,57,787222,.,4,Hfertig,Difficult to say. Selling pressure is sti...,"['<div class=""quoteheader""><a href=""https://bi..."
41326,1411075852,57,787222,.,5,molecular,doesn't the same go for the $20 millions ...,"['<div class=""quoteheader""><a href=""https://bi..."
41327,1411075887,57,787222,.,6,grappa_barricata,"If i may add something, a sizable part of the ...",[]


---

# Message Concatenating:

In [50]:
grouped_topics = df.groupby(df.topic_title).groups

In [51]:
dict_topic = {}
for tit in tqdm(grouped_topics.keys()):
    sentence = (df.message_text[grouped_topics[tit]].values)
    dict_topic.update({tit: ''.join(str(v) for v in sentence) })

100%|██████████| 1100/1100 [00:00<00:00, 1849.02it/s]


In [52]:
list_message = []
for i in dict_topic.keys():
    list_message.append(dict_topic[i])

-------------

# LDA

In [53]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  #match any word characters until it reaches a non-word character, like a space.

In [54]:
raw = list_message[0].lower()
tokens = tokenizer.tokenize(raw)  

# Stop words

In [55]:
from stop_words import get_stop_words

# create English stop words list
en_stop = get_stop_words('en')

In [56]:
# remove stop words from tokens
stopped_tokens = [i for i in tokens if not i in en_stop]
print(stopped_tokens)

['even', 'saw', 'much', 'impact', 'upcoming', 'fork', 'predicted', '6', '7k', 'easily', 'end', 'year', 'nbsp', 'much', 'news', 'btc', 'lot', 'adoption', 'now', 'fork', 'getting', 'closer', 'everyone', 'dumping', 'alt', 'coins', 'buy', 'btc', 'get', 'free', 'nbsp', 'btg', 'coins', 'nbsp', 'created', 'selling', 'alt', 'coin', 'buying', 'frenzy', 'btc', 'dumped', 'decent', 'amount', 'alt', 'coins', 'last', 'week', 'dropped', '25', '60', '7', 'days', 'across', 'board', 'nbsp', 'sure', 'makes', 'great', 'buying', 'opportunity', 'alt', 'coins', 'nbsp', 'get', 'alts', 'cheaper', 'added', 'value', 'btc', 'buy', 'nbsp', 'btc', '1470', 'since', 'last', 'week', 'nbsp', 'o', 'nbsp', 'nbsp', 'smashed', '5k', 'barrier', 'like', 'didn', 't', 'exist', 'dont', 'see', '6k', 'barrier', 'either', 'kind', 'frenzy', '7k', 'likely', 'wouldn', 't', 'surprised', 'see', 'touch', '8k', 'btc', 'gold', 'thar', 'hills', 'nbsp', 'nbsp', 'd', 'nbsp', '1470', '1', 'week', 'huge', 'massive', 'next', 'step', '7000']


# Our words

In [123]:
remove_words = ['will','s' , 'nbsp', 't', 'com', 'http', 'amp', '1xnk8bc', 'href', 'oto', 'www' ,'m', 'import', 'li', 'b', 'style' , 'font', 'span','hr', 'd' , 'jpg','p']
stopped_tokens = [i for i in stopped_tokens if not i in remove_words]
print(stopped_tokens)

['imageshack', 'us', 'img833', '4225', 'moonan', 'imageshack', 'us', 'img833', '4225', 'moonan', 'imageshack', 'us', 'photo', 'images', '833', 'moonan', 'imageshack', 'us', 'img197', '347', 'cycles', 'imageshack', 'us', 'img197', '347', 'cycles', 'imageshack', 'us', 'photo', 'images', '197', 'cycles', 'imageshack', 'us', 'img685', '9515', 'deadmoonv', 'imageshack', 'us', 'img685', '9515', 'deadmoonv', 'imageshack', 'us', 'photo', 'images', '685', 'deadmoonv', 'dunno', 'let', 'know', 'still', 'didnt', 'right', '111', 'guess', 'going', 'fc07', 'deviantart', 'net', 'fs50', '2009', '257', '9', 'angel_and_devil_love_by_kil4', 'fc07', 'deviantart', 'net', 'fs50', '2009', '257', '9', 'angel_and_devil_love_by_kil4', 'can', 'fall', 'love', 'cheap', 'coinwhat', 'software', 'interface', 'https', 'bitcointalk', 'org', 'index', 'php', 'topic', '6019', '0', 'imageshack', 'us', 'img717', '1008', 'bearwedge', 'imageshack', 'us', 'img717', '1008', 'bearwedge', 'imageshack', 'us', 'photo', 'images', '71

# Stemming

In [60]:
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [69]:
# stem token
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

print(stemmed_tokens)

['even', 'saw', 'much', 'impact', 'upcom', 'fork', 'predict', '6', '7k', 'easili', 'end', 'year', 'much', 'news', 'btc', 'lot', 'adopt', 'now', 'fork', 'get', 'closer', 'everyon', 'dump', 'alt', 'coin', 'buy', 'btc', 'get', 'free', 'btg', 'coin', 'creat', 'sell', 'alt', 'coin', 'buy', 'frenzi', 'btc', 'dump', 'decent', 'amount', 'alt', 'coin', 'last', 'week', 'drop', '25', '60', '7', 'day', 'across', 'board', 'sure', 'make', 'great', 'buy', 'opportun', 'alt', 'coin', 'get', 'alt', 'cheaper', 'ad', 'valu', 'btc', 'buy', 'btc', '1470', 'sinc', 'last', 'week', 'o', 'smash', '5k', 'barrier', 'like', 'didn', 'exist', 'dont', 'see', '6k', 'barrier', 'either', 'kind', 'frenzi', '7k', 'like', 'wouldn', 'surpris', 'see', 'touch', '8k', 'btc', 'gold', 'thar', 'hill', 'd', '1470', '1', 'week', 'huge', 'massiv', 'next', 'step', '7000']


# tokenizing, removing stop words, stemmming

In [100]:
from collections import Counter

In [124]:
texts = []

remove_words = ['will','s' , 'nbsp', 't', 'com', 'http', 'amp', '1xnk8bc', 'href', 'oto', 'www',
                "m", "import",  "import", "http", "li", "b", "style" , "font", "span","hr", "d" , "jpg", "png" ]
map_words = {
    'btc': 'bitcoin',
    'better': 'good'
}
for msg in tqdm(range(len(list_message))):
    raw = list_message[msg].lower()
    tokens = tokenizer.tokenize(raw)
    
    stopped_tokens = [i for i in tokens if not i in en_stop]
    stopped_tokens = [i for i in stopped_tokens if not i in remove_words]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    stemmed_tokens = [i for i in stemmed_tokens if not i in remove_words]
    stemmed_tokens = [i for i in stemmed_tokens if not i in en_stop]
    d = pd.DataFrame({'z': stemmed_tokens})
    stemmed_tokens = d.replace(map_words)['z'].tolist()
    texts.append(stemmed_tokens)

100%|██████████| 1100/1100 [00:34<00:00, 31.79it/s]


In [112]:
map_words = {
    'us': 'bitcoin',
    'better': 'good'
}
d = pd.DataFrame({'z': stemmed_tokens})
d.replace(map_words)

Unnamed: 0,z
0,imageshack
1,bitcoin
2,img833
3,4225
4,moonan
5,imageshack
6,bitcoin
7,img833
8,4225
9,moonan


In [114]:
print(d)

              z
0    imageshack
1            us
2        img833
3          4225
4        moonan
5    imageshack
6            us
7        img833
8          4225
9        moonan
10   imageshack
11           us
12        photo
13         imag
14          833
15       moonan
16   imageshack
17           us
18       img197
19          347
20         cycl
21   imageshack
22           us
23       img197
24          347
25         cycl
26   imageshack
27           us
28        photo
29         imag
..          ...
89           us
90       img717
91         1008
92     bearwedg
93   imageshack
94           us
95       img717
96         1008
97     bearwedg
98   imageshack
99           us
100       photo
101        imag
102         717
103    bearwedg
104      upload
105  imageshack
106          us
107  imageshack
108          us
109         hmm
110        look
111         bad
112       still
113        room
114           1
115           2
116       ralli
117       break
118      though

[119 ro

In [88]:
stemmed_tokens.head()

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
dtype: object

In [116]:
for i in (range(len(texts))):
    print(Counter(texts[i]).most_common(7))
    print('------------'+str(i))

[('bitcoin', 6), ('alt', 5), ('coin', 5), ('buy', 4), ('get', 3), ('week', 3), ('much', 2)]
------------0
[('bitcoin', 79), ('china', 46), ('chines', 41), ('price', 26), ('exchang', 22), ('news', 21), ('peopl', 21)]
------------1
[('bitcoin', 4), ('buy', 4), ('1', 4), ('still', 3), ('wait', 2), ('000', 2), ('sit', 2)]
------------2
[('1', 3), ('real', 3), ('time', 3), ('million', 2), ('like', 2), ('day', 2), ('exchang', 2)]
------------3
[('bitcoin', 8), ('level', 4), ('hit', 3), ('go', 3), ('wil', 2), ('12k', 2), ('today', 2)]
------------4
[('html', 2), ('btctrade', 1), ('gonggao', 1), ('0410', 1), ('reddit', 1), ('r', 1), ('bitcoin', 1)]
------------5
[('bitcoin', 12), ('bear', 10), ('one', 7), ('time', 6), ('buy', 6), ('think', 6), ('way', 5)]
------------6
[('happen', 19), ('fork', 18), ('bitcoin', 14), ('hard', 11), ('1', 11), ('novemb', 10), ('go', 9)]
------------7
[('sinc', 3), ('bitcoin', 2), ('000', 2), ('reach', 2), ('five', 2), ('time', 2), ('around', 2)]
------------8
[('

------------150
[('can', 2), ('ath', 2), ('see', 2), ('day', 2), ('hold', 1), ('1000', 1), ('predict', 1)]
------------151
[('bitcoin', 181), ('price', 90), ('year', 60), ('think', 45), ('2024', 44), ('happen', 44), ('reach', 39)]
------------152
[('price', 40), ('1000', 32), ('bitcoin', 31), ('new', 27), ('floor', 25), ('can', 17), ('go', 14)]
------------153
[('price', 39), ('bitcoin', 34), ('difficulti', 22), ('miner', 17), ('mine', 14), ('hashrat', 12), ('increas', 9)]
------------154
[('price', 26), ('dump', 21), ('bitcoin', 21), ('order', 14), ('just', 12), ('whale', 10), ('see', 10)]
------------155
[('bitcoin', 162), ('price', 160), ('year', 82), ('10k', 56), ('can', 50), ('end', 48), ('think', 46)]
------------156
[('bitcoin', 145), ('price', 76), ('decemb', 48), ('reach', 43), ('10k', 38), ('now', 35), ('year', 31)]
------------157
[('bitcoin', 43), ('price', 26), ('possibl', 23), ('usd', 22), ('year', 21), ('2016', 19), ('think', 16)]
------------158
[('bear', 5), ('hack', 4

[('chart', 2), ('display', 2), ('hello', 1), ('like', 1), ('bitcoinchart', 1), ('wonder', 1), ('global', 1)]
------------807
[('go', 2), ('lower', 2), ('back', 2), ('surviv', 2), ('u', 2), ('buy', 2), ('now', 2)]
------------808
[('bitcoin', 58), ('app', 25), ('peopl', 22), ('like', 21), ('use', 18), ('can', 16), ('fee', 16)]
------------809
[('buy', 11), ('bitcoin', 10), ('friend', 8), ('money', 8), ('now', 7), ('like', 6), ('100', 6)]
------------810
[('bitcoin', 35), ('coin', 25), ('alt', 21), ('can', 13), ('thing', 9), ('buy', 9), ('transact', 9)]
------------811
[('bitcoin', 15), ('market', 10), ('price', 9), ('get', 8), ('time', 8), ('week', 7), ('50', 7)]
------------812
[('999', 1053), ('predict', 36), ('entri', 28), ('valu', 27), ('bitcoin', 24), ('1', 24), ('win', 24)]
------------813
[('big', 6), ('guy', 5), ('short', 3), ('bitcoin', 3), ('300', 2), ('way', 2), ('mani', 2)]
------------814
[('price', 20), ('halv', 14), ('bitcoin', 12), ('mine', 11), ('coin', 11), ('go', 10),

# LDA 

In [102]:
import gensim

In [103]:
import datetime

In [125]:
from gensim import corpora, models

a = datetime.datetime.now()

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

b = datetime.datetime.now()
c = b - a

In [40]:
print(c)

0:00:19.580105


In [126]:
a = datetime.datetime.now()
damodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=20)
b = datetime.datetime.now()
c = b - a

In [127]:
print( c)

0:05:19.709480


In [129]:
from sklearn.externals import joblib
joblib.dump(damodel, 'LDA_model.pkl') 

['LDA_model.pkl']

In [130]:
damodel = joblib.load('LDA_model.pkl') 

In [47]:
["m", "import",  "import", "http", "li", "b", "style" , "font", "span","hr", "d" , "jpg", "png" ]

['m',
 'import',
 'import',
 'http',
 'li',
 'b',
 'style',
 'font',
 'span',
 'hr',
 'd',
 'jpg',
 'png']

In [131]:
for i in damodel.print_topics(num_topics=50, num_words=50):
    print(i)
    print('-----------------')

(0, '0.011*"height" + 0.011*"3em" + 0.011*"line" + 0.010*"size" + 0.009*"1" + 0.007*"crash" + 0.007*"imgur" + 0.006*"24pt" + 0.006*"9660" + 0.004*"bitcoin" + 0.004*"140" + 0.004*"buy" + 0.004*"org" + 0.004*"see" + 0.003*"go" + 0.003*"back" + 0.003*"big" + 0.003*"next" + 0.003*"post" + 0.003*"now" + 0.003*"manag" + 0.003*"gif" + 0.003*"135" + 0.003*"trade" + 0.003*"can" + 0.003*"net" + 0.003*"peopl" + 0.003*"manipul" + 0.003*"qsn" + 0.003*"don" + 0.003*"thread" + 0.003*"time" + 0.003*"tumblr" + 0.002*"120" + 0.002*"guy" + 0.002*"pleas" + 0.002*"media" + 0.002*"mtgox" + 0.002*"play" + 0.002*"like" + 0.002*"coin" + 0.002*"quantsig" + 0.002*"bot" + 0.002*"one" + 0.002*"gox" + 0.002*"ddo" + 0.002*"forum" + 0.002*"just" + 0.002*"bear" + 0.002*"2"')
-----------------
(1, '0.017*"bitcoin" + 0.015*"bitcoinica" + 0.012*"short" + 0.010*"can" + 0.008*"just" + 0.008*"use" + 0.007*"don" + 0.007*"go" + 0.007*"posit" + 0.007*"trade" + 0.007*"long" + 0.007*"market" + 0.006*"like" + 0.006*"usd" + 0.006*

In [132]:
from itertools import chain
# Assigns the topics to the documents in corpus
lda_corpus = damodel[corpus]

# Find the threshold, let's set the threshold to be 1/#clusters,
# To prove that the threshold is sane, we average the sum of all probabilities:
scores = list(chain(*[[score for topic_id,score in topic] for topic in [doc for doc in lda_corpus]]))
threshold = sum(scores)/len(scores)
print (threshold)
print()

cluster1 = [j for i,j in zip(lda_corpus,texts) if i[0][1] > threshold]
# cluster2 = [j for i,j in zip(lda_corpus,texts) if i[1][1] > threshold]
# cluster3 = [j for i,j in zip(lda_corpus,texts) if i[2][1] > threshold]
# cluster4 = [j for i,j in zip(lda_corpus,texts) if i[3][1] > threshold]
# cluster5 = [j for i,j in zip(lda_corpus,texts) if i[4][1] > threshold]

print (cluster1)
# print( cluster2)
# print( cluster3)
# print( cluster4)
# print( cluster5)

0.2743334400563918



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [30]:
print(damodel.print_topics(num_topics=10, num_words=20))

[(0, '0.040*"bitcoin" + 0.012*"peopl" + 0.011*"can" + 0.009*"btc" + 0.008*"use" + 0.008*"like" + 0.007*"money" + 0.007*"just" + 0.007*"think" + 0.006*"get" + 0.006*"coin" + 0.006*"one" + 0.006*"currenc" + 0.006*"make" + 0.006*"don" + 0.005*"valu" + 0.005*"price" + 0.005*"market" + 0.005*"exchang" + 0.005*"go"'), (1, '0.042*"0" + 0.042*"span" + 0.039*"32" + 0.034*"1" + 0.029*"btc" + 0.024*"style" + 0.021*"font" + 0.019*"b" + 0.018*"import" + 0.018*"3em" + 0.018*"height" + 0.018*"size" + 0.018*"line" + 0.016*"36" + 0.012*"http" + 0.012*"li" + 0.011*"12pt" + 0.010*"whaleclub" + 0.009*"co" + 0.009*"hr"'), (2, '0.007*"1" + 0.007*"d" + 0.007*"0" + 0.007*"jpg" + 0.007*"png" + 0.006*"http" + 0.006*"org" + 0.005*"like" + 0.005*"chart" + 0.005*"m" + 0.005*"b" + 0.004*"2" + 0.004*"one" + 0.004*"post" + 0.004*"just" + 0.004*"imgur" + 0.004*"can" + 0.004*"look" + 0.004*"5" + 0.004*"use"'), (3, '0.056*"bitcoin" + 0.050*"price" + 0.015*"think" + 0.014*"year" + 0.014*"can" + 0.014*"now" + 0.013*"go" +

In [134]:
for x in damodel.print_topics(num_topics=5, num_words=15):
    print(x)

(8, '0.094*"999" + 0.073*"9617" + 0.042*"9608" + 0.030*"9604" + 0.030*"9600" + 0.019*"bitcoin" + 0.018*"9616" + 0.018*"9612" + 0.014*"9618" + 0.006*"peopl" + 0.006*"price" + 0.005*"9619" + 0.005*"use" + 0.005*"year" + 0.005*"like"')
(13, '0.069*"o" + 0.036*"bitcoin" + 0.010*"1" + 0.008*"dollar" + 0.007*"million" + 0.006*"like" + 0.006*"can" + 0.005*"world" + 0.005*"money" + 0.005*"peopl" + 0.005*"000" + 0.005*"energi" + 0.005*"bank" + 0.005*"valu" + 0.004*"just"')
(7, '0.040*"bitcoin" + 0.015*"1" + 0.011*"valu" + 0.010*"price" + 0.009*"can" + 0.008*"peopl" + 0.008*"currenc" + 0.007*"million" + 0.007*"like" + 0.007*"money" + 0.007*"use" + 0.006*"just" + 0.006*"think" + 0.006*"market" + 0.005*"one"')
(6, '0.009*"thread" + 0.008*"org" + 0.008*"trade" + 0.008*"post" + 0.007*"bitcoin" + 0.006*"like" + 0.005*"look" + 0.005*"1" + 0.005*"just" + 0.005*"make" + 0.005*"don" + 0.004*"can" + 0.004*"postimg" + 0.004*"time" + 0.004*"know"')
(28, '0.028*"analysi" + 0.016*"english" + 0.016*"index" + 0

In [136]:
damodel.get_document_topics(corpus[1])

[(19, 0.9940147)]

-----------------

In [34]:
list_message[0]

'Even before I saw how much impact the upcoming fork was having I predicted 6-7K easily by end of year.&nbsp; Much more news about BTC and a lot more adoption.      Now with Fork getting closer everyone is dumping alt coins to buy BTC and get the "free"&nbsp; BTG coins.&nbsp; Has created a selling of alt coin and buying frenzy of BTC.               I dumped a decent amount into alt coins in the last week as they had dropped 25-60% in about 7 days across the board.&nbsp; Sure makes for a great buying opportunity of alt coins.&nbsp;      You get alts for cheaper and added value of your BTC to buy them with.&nbsp;           BTC is up $1470 since last week&nbsp; :o&nbsp; &nbsp;Smashed through 5K barrier like it didn\'t exist. I dont see 6K being a barrier either with this kind of frenzy. 7K is likely and I wouldn\'t be surprised at all to see it touch 8K. There is BTC Gold in them thar hills........&nbsp; &nbsp;;D                    &nbsp; 1470 in 1 week is huge, massive.     Next step is 

In [59]:
texts = []
for msg in range(len(list_message)):
    # Convert the document into lowercase
    raw = list_message[msg].lower()
    # Tokenize the document: tokens
    tokens = tokenizer.tokenize(raw)
    
    stopped_tokens = [i for i in tokens if not i in en_stop]
    stopped_tokens = [i for i in stopped_tokens if not i in remove_words]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(stemmed_tokens)
    
    

In [None]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles)

# Select the id for "computer": computer_id
computer_id = dictionary.token2id.get("computer")

# Use computer_id with the dictionary to print the word
print(dictionary.get(computer_id))

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus[4][:10])
corpus[0]

In [None]:
# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
    
# Create the defaultdict: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# Create a sorted list from the defaultdict: sorted_word_count 
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True) 

# Print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)


In [4]:
wnl = WordNetLemmatizer()

In [139]:
print (wnl.lemmatize('go', 'n'))

go


In [45]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
 
ps = PorterStemmer()
 
sentence = "gaming, the gamers play games better slower"
words = word_tokenize(sentence)
 
for word in words:
    print(word + ":" + ps.stem(word))

gaming:game
,:,
the:the
gamers:gamer
play:play
games:game
better:better
slower:slower


In [44]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
print(stemmer.stem("slower"))

slower
