In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import HashingVectorizer  
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data
from sklearn.metrics.pairwise import cosine_similarity
import time
import pandas as pd


data = pd.read_csv("../example/Posts.csv")
filteredData = data.dropna()
filteredData = filteredData.head(100000)

In [2]:
print(len(filteredData))

100000


In [3]:
def preprocessing(document):
    document_tokenized = [word.lower() for word in word_tokenize(document)]
    english_stopwords = stopwords.words('english')
    document_stopwords = [word for word in document_tokenized if word not in english_stopwords]
    english_punctuations = [":",".","\n","/","@", "\\","*","=","^",";","_","|",
                               '"',"' "," '","-",
                               "(",")",",",">","<",
                               "!","?","[","]","+",
                               "&","%","$","#","~","{","}"]
    document_filtered = [word for word in document_stopwords if not word in english_punctuations]
    st = WordNetLemmatizer()
    document_stem = [st.lemmatize(word) for word in document_filtered]
    return document_stem

In [4]:
combinedText = []
for i in filteredData.index:
    combinedText.append(preprocessing(str(filteredData['Title'][i]) + " " + str(filteredData['Body'][i])))
filteredData['tokenized_title'] = combinedText

filteredData.head()

Unnamed: 0,Id,Body,Title,PostTypeId,Tags,CreationDate,tokenized_title
0,337,<p>I am about to build a piece of a project th...,XML Processing in Python,1,<python><xml>,2008-08-02T03:35:55.697,"[xml, processing, python, p, build, piece, pro..."
1,469,<p>I am using the Photoshop's javascript API t...,How can I find the full path to a font from it...,1,<python><macos><fonts><photoshop>,2008-08-02T15:11:16.430,"[find, full, path, font, display, name, mac, p..."
2,502,<p>I have a cross-platform (Python) applicatio...,Get a preview JPEG of a PDF on Windows?,1,<python><windows><image><pdf>,2008-08-02T17:01:58.500,"[get, preview, jpeg, pdf, window, p, cross-pla..."
3,535,<p>I am starting to work on a hobby project wi...,Continuous Integration System for a Python Cod...,1,<python><continuous-integration><extreme-progr...,2008-08-02T18:43:54.787,"[continuous, integration, system, python, code..."
4,594,<p>There are several ways to iterate over a re...,cx_Oracle: How do I iterate over a result set?,1,<python><sql><database><oracle><cx-oracle>,2008-08-03T01:15:08.507,"[cx_oracle, iterate, result, set, p, several, ..."


In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

list_of_token_text = list(sent_to_words(filteredData.tokenized_title))

#tag each document 
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(list_of_token_text)]
modelDoc2Vec = Doc2Vec(documents, vector_size=300, window=1, 
                min_count=3, workers=4, dm=1, alpha = 0.025,
                sample= 0.01, dm_concat = 1, dbow_words = 1)
modelDoc2Vec.save("model_doc2vec_python.model")

In [6]:
modelDoc2Vec.most_similar('csv')

  modelDoc2Vec.most_similar('csv')


[('xlsx', 0.6642962694168091),
 ('setfilename', 0.6494576930999756),
 ('arff', 0.6432133913040161),
 ('tsv', 0.6288117170333862),
 ('ods', 0.6214483380317688),
 ('writerows', 0.610712468624115),
 ('dbf', 0.6064220666885376),
 ('writerow', 0.6027462482452393),
 ('xls', 0.6024604439735413),
 ('fasta', 0.6000521779060364)]

In [9]:
modelWord2Vec = gensim.models.Word2Vec(sg=1,size=300,min_count=25,window=5,alpha=0.025) # use fixed learning rate
modelWord2Vec.build_vocab(combinedText)
modelWord2Vec.train(combinedText,total_examples = modelWord2Vec.corpus_count,epochs = 100)
modelWord2Vec.save("model_word2vec_python.model")

In [10]:
modelWord2Vec.most_similar('csv')

  modelWord2Vec.most_similar('csv')


[('excel', 0.510680079460144),
 ('csv.reader', 0.4865483045578003),
 ('.csv', 0.47325068712234497),
 ('csv.writer', 0.4472508132457733),
 ('dictreader', 0.44071558117866516),
 ('csv.dictreader', 0.4388991594314575),
 ('csv.', 0.4240843951702118),
 ('reader', 0.4123855233192444),
 ('row', 0.40391021966934204),
 ('delimited', 0.4029760956764221)]

In [11]:
title_input = input('insert title: ')

insert title: converting dataframe to csv


In [12]:
body_input = input('insert body: ')

insert body: How to convert a dataframe to csv in python? I am writing a code for my project and I need to convert the final dataframe to csv. Help me with this


In [13]:
test_input = title_input + " " + body_input
test_input = preprocessing(test_input)
print(test_input)

['converting', 'dataframe', 'csv', 'convert', 'dataframe', 'csv', 'python', 'writing', 'code', 'project', 'need', 'convert', 'final', 'dataframe', 'csv', 'help']


In [15]:
test_vector = modelDoc2Vec.infer_vector(test_input)
top_10_similar_cases = modelDoc2Vec.docvecs.most_similar(positive = [test_vector])
top_10_similar_cases

[(40526, 0.7874532341957092),
 (27116, 0.7509338855743408),
 (52061, 0.7443612813949585),
 (37745, 0.7305331826210022),
 (27102, 0.7267551422119141),
 (29642, 0.706734836101532),
 (35566, 0.7056331634521484),
 (45596, 0.7045966982841492),
 (80198, 0.7045884132385254),
 (84972, 0.7025653123855591)]

In [17]:
filteredData.loc[40526]

Id                                                           4341405
Body               <p>I have   data in tab delimited format that ...
Title                  Slice specific characters in CSV using python
PostTypeId                                                         1
Tags                                            <python><csv><numpy>
CreationDate                                 2010-12-03T00:26:52.727
tokenized_title    [slice, specific, character, csv, using, pytho...
Name: 40526, dtype: object

In [16]:
test_vector_w2v = modelWord2Vec.infer_vector(test_input)
top_10_similar_cases_w2v = modelWord2Vec.docvecs.most_similar(positive = [test_vector_w2v])
top_10_similar_cases_w2v

AttributeError: 'Word2Vec' object has no attribute 'infer_vector'

In [80]:
input_vector = pd.DataFrame(test_vector).T
input_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.010838,0.071314,-0.054018,-0.00636,-0.004932,-0.089266,-0.025502,0.019232,0.054464,-0.063888,...,0.018162,0.02409,0.046676,-0.06126,0.011804,-0.021072,-0.013837,-0.018841,-0.038895,0.010559


In [81]:
def docvec_df(model_docvecs):
    _list = []

    for i in range(len(model_docvecs)):
        test = pd.DataFrame({filteredData['Id'][i]: list(model_docvecs[i])}).T
        _list.append(test)
    vec_features = pd.concat(_list, sort = False)
    return vec_features

vec_features = docvec_df(model.docvecs)
len(vec_features)

100000

In [71]:
feature_df = filteredData.drop(columns = ['PostTypeId', 'CreationDate', 'tokenized_title'])
def combine_df(df1, df2):
    combined_df = feature_df.merge(vec_features, left_on='Id', right_index= True, copy=False)
    return combined_df

combined_df = combine_df(feature_df, vec_features)
combined_df.head(10)

Unnamed: 0,Id,Body,Title,Tags,0,1,2,3,4,5,...,290,291,292,293,294,295,296,297,298,299
0,337,<p>I am about to build a piece of a project th...,XML Processing in Python,<python><xml>,-0.048378,-0.020845,0.059406,0.057848,-0.008877,-0.033015,...,-0.005876,-0.093567,-0.034488,0.090274,0.014248,-0.065532,0.047374,0.002935,-0.086508,0.038541
1,469,<p>I am using the Photoshop's javascript API t...,How can I find the full path to a font from it...,<python><macos><fonts><photoshop>,-0.002558,0.008075,-0.049802,-0.061378,0.049353,-0.008179,...,0.008325,-0.009423,-0.001977,-0.003825,0.023986,-0.017957,0.015383,-0.035583,-0.037617,0.050355
2,502,<p>I have a cross-platform (Python) applicatio...,Get a preview JPEG of a PDF on Windows?,<python><windows><image><pdf>,0.017404,-0.008267,-0.006037,-0.013714,-0.002447,0.006771,...,0.015238,0.027567,0.01305,-0.002594,0.002021,-0.005508,0.004993,-0.011718,-0.022531,-0.00789
3,535,<p>I am starting to work on a hobby project wi...,Continuous Integration System for a Python Cod...,<python><continuous-integration><extreme-progr...,0.020759,-0.021543,0.014009,0.08293,-0.021083,0.061195,...,-0.005847,-0.010222,-0.063366,-0.00781,-0.046421,-0.007288,-0.029695,-0.02349,-0.007652,-0.017613
4,594,<p>There are several ways to iterate over a re...,cx_Oracle: How do I iterate over a result set?,<python><sql><database><oracle><cx-oracle>,0.011252,-0.014213,0.011663,0.016671,-0.006552,0.008019,...,-0.002264,-0.007547,-0.049933,-0.005684,0.008609,0.00877,-0.013789,0.001499,0.004562,-0.010704
5,683,<p>I don't remember whether I was dreaming or ...,Using 'in' to match an attribute of Python obj...,<python><arrays><iteration>,0.007278,0.018393,-0.005057,-0.008562,0.018065,-0.016673,...,-0.018648,-0.045594,0.010066,0.014921,0.02796,-0.048843,0.04751,-0.026387,-0.01307,0.024393
6,742,"<p><a href=""http://www.djangoproject.com/"" rel...",Class views in Django,<python><django><views><oop>,0.058449,0.027613,0.057108,0.011944,-0.044126,-0.024858,...,0.033327,-0.026627,0.011803,0.031923,0.015948,-0.033443,0.016341,-0.052953,0.001644,-0.044841
7,766,<p>I can get Python to work with Postgresql bu...,Python and MySQL,<python><mysql><postgresql><bpgsql>,-0.014341,0.063906,-0.017396,0.012907,0.006384,-0.043317,...,0.067454,0.041959,0.018359,-0.014951,-0.068077,-0.035006,0.014361,0.029623,-0.051724,0.010504
8,773,<p>I haven't been able to find an understandab...,How do I use itertools.groupby()?,<python><itertools>,-0.022084,-0.025721,0.059906,0.02381,-0.019359,0.030699,...,-0.040804,-0.051035,-0.104519,-0.01599,0.072669,0.075164,0.041267,-0.000903,-0.048877,0.019911
9,972,<p>I've read that it is possible to add a meth...,Adding a Method to an Existing Object Instance,<python><oop><methods><monkeypatching>,0.005406,0.001405,0.000752,0.001426,0.003016,0.034402,...,0.007185,-0.014266,-0.004597,0.037441,-0.011545,-0.005326,0.005223,-0.012364,0.01621,0.012479


In [39]:
def cosine_text_similarity(text_vector, combined_df, clean_data_df):

    text_similarity = cosine_similarity(combined_df.iloc[:, 4:], text_vector)
    rec_df = pd.DataFrame({'Id': clean_data_df['Id'], 'similarity': list(text_similarity)})
    rec_df['similarity'] = rec_df['similarity'].apply(lambda x: x[0])
    rec_df = rec_df.sort_values(by = 'similarity', ascending=False).reset_index().loc[0:15]
    rec_df.drop(['index'], axis = 1, inplace = True)

    _ids = []
    for i in rec_df['Id']:
        _ids.append(i)

    #print(_ids)
    _index_values = []
    for i in _ids:
        temp = (str(clean_data_df.loc[clean_data_df['Id'] == f'{i}']['Title']))
        temp2 = temp.split()[0]
        _index_values.append(int(temp2))

    case_list = []
    for j, i in enumerate(_index_values):
        case_list.append(f"{j+1} -- {clean_data_df['Title'][i]}")       

        case_list.append(f"{clean_data_df['Body'][i]}")
        
    
    return(case_list)

In [40]:
text = cosine_text_similarity(input_vector, combined_df, filteredData)

In [41]:
text[0]

'1 -- How to add an xml-stylesheet processing instruction node with Python 2.6 and minidom?'

In [42]:
text[1]

'<p>I\'m creating an XML document using minidom - how do I ensure my resultant XML document contains a stylesheet reference like this:</p>\r\n\r\n<pre><code>&lt;?xml-stylesheet type="text/xsl" href="mystyle.xslt"?&gt;\r\n</code></pre>\r\n\r\n<p>Thanks !</p>\r\n'

In [43]:
text[2]

'2 -- Validating a yaml document in python'

In [44]:
text[3]

"<p>One of the benefits of XML is being able to validate a document against an XSD. YAML doesn't have this feature, so how can I validate that the YAML document I open is in the format expected by my application?</p>\r\n"