In [45]:
  >>> import nltk
  >>> nltk.download('punkt')
  >>> nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\q4116\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\q4116\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
data = pd.read_csv("./data/dataset.csv", encoding="utf_8")
print(data["Body"])

0      While answering a few of EdChum's questions I ...
1      Being newly created we have zero feeds appeari...
2      It looks like filter coffee has another, diffe...
3      The chatroom name is so bland. "Coffee." Look ...
4      On most SE sites, product recommendations are ...
                             ...                        
229    Indicates that the post shares product or conf...
230    Seasoned Advice has excluded recipe requests f...
231    I appreciate the Seasoned Advice stance and th...
232                                                  NaN
233                                                  NaN
Name: Body, Length: 234, dtype: object


In [66]:
data["Body"][1]

"Being newly created we have zero feeds appearing in our main chat right now. What blogs, news sites, or other important coffee related things should appear in our main chat room's feed? Post your suggestions/submissions.  \n"

In [67]:
stop_words = set(stopwords.words('english'))
def removeStopWords(words):
    res = []
    for word in words:
        if word not in stop_words:
            res.append(word)
    return res

In [68]:
tokenizer = RegexpTokenizer(r"\w+")

In [69]:
removeStopWords(tokenizer.tokenize(data["Body"][1]))

['Being',
 'newly',
 'created',
 'zero',
 'feeds',
 'appearing',
 'main',
 'chat',
 'right',
 'What',
 'blogs',
 'news',
 'sites',
 'important',
 'coffee',
 'related',
 'things',
 'appear',
 'main',
 'chat',
 'room',
 'feed',
 'Post',
 'suggestions',
 'submissions']

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
tfidf = TfidfVectorizer().fit_transform(data["Body"].values.astype('U'))
pairwise_similarity = tfidf * tfidf.T

In [76]:
tfidf

<234x3227 sparse matrix of type '<class 'numpy.float64'>'
	with 15170 stored elements in Compressed Sparse Row format>

In [112]:
arr = pairwise_similarity.toarray()
arr

array([[1.        , 0.05440365, 0.23681405, ..., 0.17617061, 0.        ,
        0.        ],
       [0.05440365, 1.        , 0.01062189, ..., 0.05763601, 0.        ,
        0.        ],
       [0.23681405, 0.01062189, 1.        , ..., 0.06332468, 0.        ,
        0.        ],
       ...,
       [0.17617061, 0.05763601, 0.06332468, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        1.        ]])

In [138]:
np.fill_diagonal(arr, np.nan)
input_idx = 0
arr[input_idx]

array([       nan, 0.05440365, 0.23681405, 0.12030234, 0.08192029,
       0.37610701, 0.05028408, 0.10309927, 0.06338549, 0.0878695 ,
       0.06163498, 0.11214021, 0.03825475, 0.15569057, 0.12604983,
       0.08631264, 0.06311393, 0.14474037, 0.10119141, 0.11252782,
       0.15433065, 0.37020463, 0.03483209, 0.05909022, 0.09488837,
       0.08670355, 0.06534601, 0.05324763, 0.06824562, 0.14051901,
       0.1759459 , 0.05176979, 0.09988282, 0.09321809, 0.11876043,
       0.13996104, 0.06806276, 0.01684564, 0.17288732, 0.04626416,
       0.07668257, 0.10587069, 0.11934832, 0.12764885, 0.19483323,
       0.14571153, 0.120183  , 0.08336829, 0.19248817, 0.12295676,
       0.07133096, 0.04490473, 0.10563782, 0.17413225, 0.18121644,
       0.12531721, 0.17405907, 0.09553886, 0.09169296, 0.        ,
       0.09993586, 0.10761409, 0.09601216, 0.1010435 , 0.09085352,
       0.11103344, 0.11123084, 0.2167124 , 0.14152276, 0.08690805,
       0.2473742 , 0.25444494, 0.11786805, 0.16851959, 0.10831

In [144]:
result_idx = np.nanargmax(arr[input_idx])

print(data["Body"][input_idx])
print("-----------------------------------------------")
print(data["Body"][result_idx])
print(arr[input_idx][result_idx])

While answering a few of EdChum's questions I discovered that what I/we in the USA call pour over coffee is referred to as drip coffee in the UK. I added the pour-over tag to both questions I encountered but figured we should decide as a community which tag to use to describe this brewing process and then properly document it because drip-coffee means something different in the US (which is apparently referred to as filter-cofee in the UK). For clarification the method in question is shown in the image below. 
 

-----------------------------------------------
In the UK and Japan we refer to these:

as drip coffee this is a fairly recent thing to invade UK espresso shops but they have been around for a while in Japan.
Whilst this:

is what we call filter coffee. I'm happy to use both terms but this could turn territorial perhaps as it's unclear what the correct term should be as this is not just a UK-ism.
It may be better to just have a tag synonym in this case perhaps?
Just to defend 

In [140]:
np.argsort(arr[input_idx])

array([233,  59, 201, 108, 100, 232, 169,  93, 209,  86,  37, 130, 217,
        75, 199,  22,  12, 216, 166, 202,  51, 115,  39, 190, 213,   6,
        31, 211,  27,   1, 110,  23, 212, 160, 133, 229, 101, 198, 151,
       165,  79, 197,  10, 159,  16,   8,  26, 164,  36,  84,  28, 116,
       163, 186, 205, 123,  50, 158, 215, 180, 200, 193, 117,  40, 210,
       179, 192,   4, 167,  47,  92, 153, 207, 112,  15,  25,  69, 178,
         9,  64, 132,  58, 170, 194,  33,  89, 220,  87, 106,  24, 174,
        57,  62,  98,  94, 189, 204, 173,  32,  60,  88,  90,  63,  18,
       183,  95,   7, 188, 143,  82, 230, 141,  52,  41, 144, 111,  80,
        61, 131, 185, 206,  74, 208, 152,  65,  66,  11,  19, 175, 227,
       168, 142, 171,  85, 113, 145, 177, 138,  72, 154,  34,  42, 134,
        46, 155,   3,  49, 128, 203, 225,  55, 228, 146,  14, 127, 224,
       218,  43, 222, 109, 221, 137, 219, 172, 129, 161,  78, 223,  35,
        29, 182,  68, 124,  91, 162,  17,  45, 120, 226, 105, 12

In [146]:
arr[input_idx][np.argsort(arr[input_idx])[-2]]

0.3761070117221526

In [147]:
data["Body"][np.argsort(arr[input_idx])[-2]]

"In the UK and Japan we refer to these:\n\nas drip coffee this is a fairly recent thing to invade UK espresso shops but they have been around for a while in Japan.\nWhilst this:\n\nis what we call filter coffee. I'm happy to use both terms but this could turn territorial perhaps as it's unclear what the correct term should be as this is not just a UK-ism.\nIt may be better to just have a tag synonym in this case perhaps?\nJust to defend my corner a little hario is one of the main sellers of these kinds of products and they seem to be the main brand in use in UK and you can see that drip is the term used in their products.\nAnd again here: http://www.hario.co.jp/coffee/hario_coffee/dripdecanter.html although the page is in japanese the url itself uses the term drip.\n"

In [148]:
len(data["Body"])

234

In [152]:
data["Body"][0]

"While answering a few of EdChum's questions I discovered that what I/we in the USA call pour over coffee is referred to as drip coffee in the UK. I added the pour-over tag to both questions I encountered but figured we should decide as a community which tag to use to describe this brewing process and then properly document it because drip-coffee means something different in the US (which is apparently referred to as filter-cofee in the UK). For clarification the method in question is shown in the image below. \n \n"

In [157]:
first = []
second = []
third = []
fourth = []
fifth = []
for index in range(0,len(data["Body"])):
    sim = np.argsort(arr[index])
    first.append([data["Id"][sim[-2]],arr[index][sim[-2]]])
    second.append([data["Id"][sim[-3]],arr[index][sim[-3]]])
    third.append([data["Id"][sim[-4]],arr[index][sim[-4]]])
    fourth.append([data["Id"][sim[-5]],arr[index][sim[-5]]])
    fifth.append([data["Id"][sim[-6]],arr[index][sim[-6]]])
print(first[0])
print(second[0])
print(third[0])
print(fourth[0])
print(fifth[0])

[6, 0.3761070117221526]
[24, 0.3702046312789686]
[78, 0.2544449361830786]
[77, 0.24737419902783314]
[3, 0.236814048261771]


SyntaxError: invalid syntax (<ipython-input-160-e07360384a35>, line 1)

In [161]:
data[["first_Id","first_sim"]]= pd.DataFrame(first)

In [163]:
data[["second_Id","second_sim"]]= pd.DataFrame(second)
data[["third_Id","third_sim"]]= pd.DataFrame(third)
data[["fourth_Id","fourth_sim"]]= pd.DataFrame(fourth)
data[["fifth_Id","fifth_sim"]]= pd.DataFrame(fifth)

In [164]:
data

Unnamed: 0,Id,ParentId,PostTypeId,CreationDate,Title,Body,first_Id,first_sim,second_Id,second_sim,third_Id,third_sim,fourth_Id,fourth_sim,fifth_Id,fifth_sim
0,1,,1,2015-01-27T21:23:05.507,Should we describe the process of brewing a si...,While answering a few of EdChum's questions I ...,6,0.376107,24,0.370205,78,0.254445,77,0.247374,3,0.236814
1,2,,1,2015-01-27T21:26:10.227,What should go in our main chat feeds?,Being newly created we have zero feeds appeari...,1223,0.329802,55,0.170883,1145,0.119142,1225,0.118048,1154,0.106399
2,3,1.0,2,2015-01-27T21:30:20.953,,"It looks like filter coffee has another, diffe...",24,0.307764,1,0.236814,77,0.172507,83,0.161722,1147,0.157606
3,4,,1,2015-01-27T22:31:45.837,Let's think of a creative name for our chatroom,"The chatroom name is so bland. ""Coffee."" Look ...",84,0.194024,49,0.193155,1231,0.187702,75,0.182428,74,0.178000
4,5,,1,2015-01-27T22:34:21.017,Should product recommendations be on-topic?,"On most SE sites, product recommendations are ...",73,0.264509,10,0.230637,15,0.204680,36,0.199814,1154,0.198349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,1236,,4,2020-04-24T13:33:15.850,product-discovery,Indicates that the post shares product or conf...,1235,0.383766,15,0.118638,23,0.118598,1132,0.107572,73,0.103453
230,1237,,1,2020-06-03T08:54:22.113,Do we accept or close recipe requests?,Seasoned Advice has excluded recipe requests f...,1238,0.235186,1146,0.204995,1231,0.204621,1155,0.200373,88,0.197876
231,1238,1237.0,2,2020-06-05T04:50:17.800,,I appreciate the Seasoned Advice stance and th...,1146,0.331666,1147,0.312759,1155,0.310828,1154,0.290588,1169,0.271595
232,1239,,5,2020-07-26T12:16:54.913,,,1240,1.000000,1120,0.000000,92,0.000000,90,0.000000,89,0.000000


In [165]:
data.to_csv("./data/dataset_sklearn_sim.csv", encoding="utf_8")