In [1]:
  >>> import nltk
  >>> nltk.download('punkt')
  >>> nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd

[nltk_data] Downloading package punkt to /Users/Marcushsu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Marcushsu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("./data/dataset.csv", encoding="utf_8")
print(data["Body"])

0      While answering a few of EdChum's questions I ...
1      Being newly created we have zero feeds appeari...
2      It looks like filter coffee has another, diffe...
3      The chatroom name is so bland. "Coffee." Look ...
4      On most SE sites, product recommendations are ...
                             ...                        
225    I see no reason why a different rule should ap...
226    This tag is used by staff when sharing concept...
227    Indicates that the post shares product or conf...
228    Seasoned Advice has excluded recipe requests f...
229    I appreciate the Seasoned Advice stance and th...
Name: Body, Length: 230, dtype: object


In [3]:
data["Body"][1]

"Being newly created we have zero feeds appearing in our main chat right now. What blogs, news sites, or other important coffee related things should appear in our main chat room's feed? Post your suggestions/submissions.  \n"

In [4]:
stop_words = set(stopwords.words('english'))
def removeStopWords(words):
    res = []
    for word in words:
        if word not in stop_words:
            res.append(word)
    return res

In [5]:
tokenizer = RegexpTokenizer(r"\w+")

In [6]:
removeStopWords(tokenizer.tokenize(data["Body"][1]))

['Being',
 'newly',
 'created',
 'zero',
 'feeds',
 'appearing',
 'main',
 'chat',
 'right',
 'What',
 'blogs',
 'news',
 'sites',
 'important',
 'coffee',
 'related',
 'things',
 'appear',
 'main',
 'chat',
 'room',
 'feed',
 'Post',
 'suggestions',
 'submissions']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
tfidf = TfidfVectorizer().fit_transform(data["Body"].values.astype('U'))
pairwise_similarity = tfidf * tfidf.T

In [8]:
tfidf

<230x3226 sparse matrix of type '<class 'numpy.float64'>'
	with 15168 stored elements in Compressed Sparse Row format>

In [9]:
arr = pairwise_similarity.toarray()
arr

array([[1.        , 0.05382515, 0.23655154, ..., 0.05851318, 0.10372858,
        0.17422276],
       [0.05382515, 1.        , 0.01048055, ..., 0.02391818, 0.08203518,
        0.0571341 ],
       [0.23655154, 0.01048055, 1.        , ..., 0.02803082, 0.06671451,
        0.06267551],
       ...,
       [0.05851318, 0.02391818, 0.02803082, ..., 1.        , 0.0283705 ,
        0.07303407],
       [0.10372858, 0.08203518, 0.06671451, ..., 0.0283705 , 1.        ,
        0.23424613],
       [0.17422276, 0.0571341 , 0.06267551, ..., 0.07303407, 0.23424613,
        1.        ]])

In [10]:
np.fill_diagonal(arr, np.nan)
input_idx = 0
arr[input_idx]

array([       nan, 0.05382515, 0.23655154, 0.11879252, 0.08102271,
       0.37444259, 0.04930749, 0.10200311, 0.06282552, 0.08667974,
       0.0606944 , 0.11120987, 0.03777115, 0.15400867, 0.12443478,
       0.08563501, 0.0624513 , 0.14357261, 0.09994424, 0.11134547,
       0.15248838, 0.3693389 , 0.0342535 , 0.0580554 , 0.09357334,
       0.08575445, 0.06422833, 0.05225607, 0.06731927, 0.13885924,
       0.1739616 , 0.05078207, 0.09836413, 0.09230341, 0.11707718,
       0.13853585, 0.06696335, 0.01670716, 0.17113751, 0.04550026,
       0.07591267, 0.10484547, 0.11819326, 0.12617622, 0.19284381,
       0.14425826, 0.11871889, 0.08255452, 0.1908147 , 0.12138551,
       0.07048279, 0.04429987, 0.10425811, 0.17247281, 0.17921799,
       0.12446486, 0.17271144, 0.09433897, 0.09083063, 0.        ,
       0.098815  , 0.10651782, 0.09527042, 0.10013588, 0.09006429,
       0.10985279, 0.10983759, 0.21481199, 0.14025234, 0.08622571,
       0.2464738 , 0.25320539, 0.11666824, 0.16696444, 0.10748

In [11]:
result_idx = np.nanargmax(arr[input_idx])

print(data["Body"][input_idx])
print("-----------------------------------------------")
print(data["Body"][result_idx])
print(arr[input_idx][result_idx])

While answering a few of EdChum's questions I discovered that what I/we in the USA call pour over coffee is referred to as drip coffee in the UK. I added the pour-over tag to both questions I encountered but figured we should decide as a community which tag to use to describe this brewing process and then properly document it because drip-coffee means something different in the US (which is apparently referred to as filter-cofee in the UK). For clarification the method in question is shown in the image below. 
 

-----------------------------------------------
In the UK and Japan we refer to these:

as drip coffee this is a fairly recent thing to invade UK espresso shops but they have been around for a while in Japan.
Whilst this:

is what we call filter coffee. I'm happy to use both terms but this could turn territorial perhaps as it's unclear what the correct term should be as this is not just a UK-ism.
It may be better to just have a tag synonym in this case perhaps?
Just to defend 

In [12]:
np.argsort(arr[input_idx])

array([ 59, 100, 199,  93, 207,  86,  37, 129, 215,  75, 197,  22,  12,
       214, 165, 200,  51, 114,  39, 188, 211,   6,  31, 209,  27,   1,
       109,  23, 210, 227, 159, 132, 196, 150, 164, 101,  79, 195,  10,
       158,  16,   8,  26, 163,  36,  28,  84, 115, 162, 184, 203,  50,
       122, 157, 213, 178, 198, 191, 116,  40, 208, 177, 190,   4, 166,
        92,  47, 152, 205, 111,  15,  25, 176,  69,   9, 131,  64,  58,
       168,  89,  33, 192, 106,  87, 218,  24, 172,  57,  62,  98,  94,
       202, 187, 171,  32,  88,  60,  90,  18,  63, 181,  95,   7, 186,
       142,  82, 228, 140,  52,  41, 143,  80, 110, 130,  61, 183, 204,
        74, 206, 151,  66,  65,  11,  19, 173, 167, 225, 141, 169, 112,
        85, 144, 175, 137,  72, 153,  34,  42, 133,  46,   3, 154,  49,
       127, 201, 223, 226, 145,  14,  55, 222, 126, 216,  43, 220, 108,
       219, 136, 170, 217,  78, 160, 128, 221,  35,  29, 180,  68, 123,
        91, 161,  17,  45, 224, 105, 119, 125, 104, 156, 103,  2

In [13]:
arr[input_idx][np.argsort(arr[input_idx])[-2]]

0.37444259447674233

In [14]:
data["Body"][np.argsort(arr[input_idx])[-2]]

"In the UK and Japan we refer to these:\n\nas drip coffee this is a fairly recent thing to invade UK espresso shops but they have been around for a while in Japan.\nWhilst this:\n\nis what we call filter coffee. I'm happy to use both terms but this could turn territorial perhaps as it's unclear what the correct term should be as this is not just a UK-ism.\nIt may be better to just have a tag synonym in this case perhaps?\nJust to defend my corner a little hario is one of the main sellers of these kinds of products and they seem to be the main brand in use in UK and you can see that drip is the term used in their products.\nAnd again here: http://www.hario.co.jp/coffee/hario_coffee/dripdecanter.html although the page is in japanese the url itself uses the term drip.\n"

In [15]:
len(data["Body"])

230

In [16]:
data["Body"][0]

"While answering a few of EdChum's questions I discovered that what I/we in the USA call pour over coffee is referred to as drip coffee in the UK. I added the pour-over tag to both questions I encountered but figured we should decide as a community which tag to use to describe this brewing process and then properly document it because drip-coffee means something different in the US (which is apparently referred to as filter-cofee in the UK). For clarification the method in question is shown in the image below. \n \n"

In [17]:
first = []
second = []
third = []
fourth = []
fifth = []
for index in range(0,len(data["Body"])):
    sim = np.argsort(arr[index])
    first.append([data["Id"][sim[-2]],arr[index][sim[-2]]])
    second.append([data["Id"][sim[-3]],arr[index][sim[-3]]])
    third.append([data["Id"][sim[-4]],arr[index][sim[-4]]])
    fourth.append([data["Id"][sim[-5]],arr[index][sim[-5]]])
    fifth.append([data["Id"][sim[-6]],arr[index][sim[-6]]])
print(first[0])
print(second[0])
print(third[0])
print(fourth[0])
print(fifth[0])

[6, 0.37444259447674233]
[24, 0.3693388984030484]
[78, 0.2532053924311832]
[77, 0.24647379680636444]
[3, 0.23655154133293912]


In [18]:
data[["first_Id","first_sim"]]= pd.DataFrame(first)

In [19]:
data[["second_Id","second_sim"]]= pd.DataFrame(second)
data[["third_Id","third_sim"]]= pd.DataFrame(third)
data[["fourth_Id","fourth_sim"]]= pd.DataFrame(fourth)
data[["fifth_Id","fifth_sim"]]= pd.DataFrame(fifth)

In [20]:
data

Unnamed: 0,Id,ParentId,PostTypeId,CreationDate,Title,Body,first_Id,first_sim,second_Id,second_sim,third_Id,third_sim,fourth_Id,fourth_sim,fifth_Id,fifth_sim
0,1,,1,2015-01-27T21:23:05.507,Should we describe the process of brewing a si...,While answering a few of EdChum's questions I ...,6,0.374443,24,0.369339,78,0.253205,77,0.246474,3,0.236552
1,2,,1,2015-01-27T21:26:10.227,What should go in our main chat feeds?,Being newly created we have zero feeds appeari...,1223,0.329917,55,0.170900,1145,0.118607,1225,0.117643,1154,0.106255
2,3,1.0,2,2015-01-27T21:30:20.953,,"It looks like filter coffee has another, diffe...",24,0.307167,1,0.236552,77,0.172021,83,0.160433,1147,0.156460
3,4,,1,2015-01-27T22:31:45.837,Let's think of a creative name for our chatroom,"The chatroom name is so bland. ""Coffee."" Look ...",84,0.192602,49,0.191323,1231,0.185581,75,0.181131,74,0.176297
4,5,,1,2015-01-27T22:34:21.017,Should product recommendations be on-topic?,"On most SE sites, product recommendations are ...",73,0.263834,10,0.229936,15,0.203908,36,0.198739,1154,0.197365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,1234,1233.0,2,2020-01-24T17:38:53.693,,I see no reason why a different rule should ap...,1233,0.258547,1120,0.217809,49,0.215037,1147,0.212790,1231,0.202433
226,1235,,5,2020-04-24T13:33:15.850,,This tag is used by staff when sharing concept...,1236,0.383453,1221,0.184924,59,0.182827,1143,0.174304,1222,0.171976
227,1236,,4,2020-04-24T13:33:15.850,product-discovery,Indicates that the post shares product or conf...,1235,0.383453,15,0.117611,23,0.117584,1132,0.106515,73,0.102506
228,1237,,1,2020-06-03T08:54:22.113,Do we accept or close recipe requests?,Seasoned Advice has excluded recipe requests f...,1238,0.234246,1146,0.204441,1231,0.202823,1155,0.199431,88,0.196573


In [21]:
data.to_csv("./data/dataset_sklearn_sim.csv", encoding="utf_8")

In [22]:
df = pd.read_csv("./data/dataset_sklearn_sim.csv")
df.to_excel("./data/dataset_sklearn_sim.xlsx")