In [45]:
  >>> import nltk
  >>> nltk.download('punkt')
  >>> nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\q4116\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\q4116\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
data = pd.read_csv("./data/dataset.csv", encoding="utf_8")
print(data["Body"])

0      While answering a few of EdChum's questions I ...
1      Being newly created we have zero feeds appeari...
2      It looks like filter coffee has another, diffe...
3      The chatroom name is so bland. "Coffee." Look ...
4      On most SE sites, product recommendations are ...
                             ...                        
229    Indicates that the post shares product or conf...
230    Seasoned Advice has excluded recipe requests f...
231    I appreciate the Seasoned Advice stance and th...
232                                                  NaN
233                                                  NaN
Name: Body, Length: 234, dtype: object


In [66]:
data["Body"][1]

"Being newly created we have zero feeds appearing in our main chat right now. What blogs, news sites, or other important coffee related things should appear in our main chat room's feed? Post your suggestions/submissions.  \n"

In [67]:
stop_words = set(stopwords.words('english'))
def removeStopWords(words):
    res = []
    for word in words:
        if word not in stop_words:
            res.append(word)
    return res

In [68]:
tokenizer = RegexpTokenizer(r"\w+")

In [69]:
removeStopWords(tokenizer.tokenize(data["Body"][1]))

['Being',
 'newly',
 'created',
 'zero',
 'feeds',
 'appearing',
 'main',
 'chat',
 'right',
 'What',
 'blogs',
 'news',
 'sites',
 'important',
 'coffee',
 'related',
 'things',
 'appear',
 'main',
 'chat',
 'room',
 'feed',
 'Post',
 'suggestions',
 'submissions']

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
tfidf = TfidfVectorizer().fit_transform(data["Body"].values.astype('U'))
pairwise_similarity = tfidf * tfidf.T

In [76]:
tfidf

<234x3227 sparse matrix of type '<class 'numpy.float64'>'
	with 15170 stored elements in Compressed Sparse Row format>

In [112]:
arr = pairwise_similarity.toarray()
arr

array([[1.        , 0.05440365, 0.23681405, ..., 0.17617061, 0.        ,
        0.        ],
       [0.05440365, 1.        , 0.01062189, ..., 0.05763601, 0.        ,
        0.        ],
       [0.23681405, 0.01062189, 1.        , ..., 0.06332468, 0.        ,
        0.        ],
       ...,
       [0.17617061, 0.05763601, 0.06332468, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        1.        ]])

In [113]:
np.fill_diagonal(arr, np.nan)
input_idx = 2
arr

array([[       nan, 0.05440365, 0.23681405, ..., 0.17617061, 0.        ,
        0.        ],
       [0.05440365,        nan, 0.01062189, ..., 0.05763601, 0.        ,
        0.        ],
       [0.23681405, 0.01062189,        nan, ..., 0.06332468, 0.        ,
        0.        ],
       ...,
       [0.17617061, 0.05763601, 0.06332468, ...,        nan, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        ,        nan,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
               nan]])

In [91]:
result_idx = np.nanargmax(arr[input_idx])

print(data["Body"][input_idx+1])
print("-----------------------------------------------")
print(data["Body"][result_idx+1])
print(arr[input_idx][result_idx])

The chatroom name is so bland. "Coffee." Look at all the creative names others have thought up:

"Root Access" for Super User
"The DMZ" for Security
"The Renderfarm" for Blender
"The Litter Box" for Pets
"The Hangar" for Aviation
"You Are Here" for Travel
"The Water Cooler" for The Workplace
"The Whiteboard" for Programmers
"The Nineteenth Byte" for Code Golf
etc...

Can we think of a better name for our chatroom?
Only one idea per answer, please. Vote up the ideas that you like!
Stolen from Lifehacks meta, which was in turn stolen from PPCG meta. But that's okay, because I wrote both of those posts too. :P

-----------------------------------------------
"Stackbucks Coffee"
Sorry, that was too tempting... I like the chatty aspect, though.

0.3077635065405421


In [109]:
np.argsort(arr[input_idx])

array([233,  59,  37, 201, 100, 108, 232, 169, 219, 216, 184,  93,  23,
       112,   1,  27, 183, 153, 194, 193,  75, 188,  12, 209,   4,  86,
         6, 217,  31, 212, 165,  47,   9, 177, 189, 160,  38, 123,  60,
        51, 170,  84, 159, 229, 132,  26,  82,  66, 178,  63, 142,  92,
        36,  88, 208, 128, 130,  40, 207, 138,  42, 213, 161,  28,  57,
       151, 190,  64,  32,  56, 202, 200, 133,  15, 158, 221,  89,  91,
        24, 171,  90,  33, 225, 182, 168,  99, 205, 109, 179, 186, 210,
        53, 192,  65, 187,  11,  25,  61,  46,  19, 198,  85, 111, 222,
       176, 228,  55, 211, 113,  79,  50,   7,  69, 122,  13, 141, 167,
        95,  10, 206, 155,  29, 174, 231, 131, 115, 223, 110, 203, 226,
        68, 215, 195,  73, 180, 230, 106, 105, 135, 144, 162, 119,  41,
        34,  16, 116,  96,  14,   8,  20, 164,  62, 173,  39,  97, 120,
        94, 154, 218, 199,  49,  98, 227, 156, 204,  72,  81, 127,  18,
        30, 125, 163,  22,  52, 166, 129,  83, 139,  45, 124, 13