In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import re

In [2]:
# Load the text-pair dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../DataSets/Precily_Text_Similarity.csv')

In [3]:
# Preview the first rows of the raw text columns.
df.head()

Unnamed: 0,text1,text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...


In [4]:
# Read the complete text1 entry of row 226 (head() only shows a truncated preview).
df.loc[226, 'text1']

'us adds more jobs than expected the us economy added 337 000 jobs in october - a seven-month high and far more than wall street expectations.  in a welcome economic boost for newly re-elected president george w bush  the labor department figures come after a slow summer of weak jobs gains. jobs were created in every sector of the us economy except manufacturing. while the separate unemployment rate went up to 5.5% from 5.4% in september  this was because more people were now actively seeking work.  the 337 000 new jobs added to us payrolls in october was twice the 169 000 figure that wall street economists had forecast. in addition  the labor department revised up the number of jobs created in the two previous months - to 139 000 in september instead of 96 000  and to 198 000 in august instead of 128 000. the better than expected jobs data had an immediate upward effect on stocks in new york  with the main dow jones index gaining 45.4 points to 10 360 by late morning trading.  it look

In [5]:
# Read the complete text2 entry of row 226 (head() only shows a truncated preview).
df.loc[226, 'text2']

'consumer spending lifts us growth us economic growth accelerated in the third quarter  helped by strong consumer spending  official figures have shown.  the economy expanded at an annual rate of 3.7% in the july to september period  the commerce department said. the figure marked an increase on the 3.3% growth recorded in the second quarter  but fell short of the 4.2% rate pencilled in by forecasters. the increase reflected the biggest jump in consumer spending in a year.  it was a little softer than the consensus  but not a real surprise   said gary thayer  an economist at ag edwards & sons. friday s growth estimate is one of the last significant pieces of economic data before the 2 november presidential election.  democrat challenger john kerry has criticised president george w bush s handling of the economy  pointing to a net loss of over 800 000 jobs since mr bush took office. analysts said the economy was still not growing fast enough to stimulate large-scale job creation.  it s 

In [6]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
text1    3000 non-null object
text2    3000 non-null object
dtypes: object(2)
memory usage: 47.0+ KB


In [7]:
# Number of missing values in each column.
df.isnull().sum()

text1    0
text2    0
dtype: int64

# Text Preprocessing

In [8]:
def non_ascii(text):
    """Return ``text`` with every character outside the 7-bit ASCII range removed."""
    kept = []
    for ch in text:
        # Code points 0-127 are ASCII; anything above is dropped.
        if ord(ch) <= 127:
            kept.append(ch)
    return "".join(kept)

In [9]:
def remove_special_chars(text):
    """Delete every character that is not a letter, digit, whitespace or one of . , ! ? / : ; ' "

    Hyphens, brackets, percent signs, backslashes and similar symbols are removed
    without inserting a replacement, so "burn-out" becomes "burnout".
    """
    disallowed = re.compile(r'[^a-zA-Z0-9.,!?/:;\'\"\s]')
    return disallowed.sub('', text)

In [10]:
def lower(text):
  """Return a lower-cased copy of ``text``."""
  lowered = text.lower()
  return lowered

In [11]:
# Filled on first use so the NLTK corpus is read once instead of on every call.
_STOP_WORD_CACHE = {}


def _load_stop_words():
  """Return the English stop-word set plus the project's custom words, built once."""
  if 'english' not in _STOP_WORD_CACHE:
    # select english stopwords
    words = set(stopwords.words("english"))
    # add custom words
    words.update(('and','I','A','http','And','So','arnt','This','When','It','many','Many','so','cant','Yes','yes','No','no','These','these','mailto','regards','ayanna','like','email'))
    _STOP_WORD_CACHE['english'] = frozenset(words)
  return _STOP_WORD_CACHE['english']


def removeStopWords(str):
  """Remove stop words from a whitespace-separated string.

  The text is split on whitespace, every token that appears in the stop-word
  set is dropped, and the remaining tokens are re-joined with single spaces.
  Matching is exact and case-sensitive, so a token with punctuation attached
  (for example "the.") is kept.

  Parameters
  ----------
  str : str
      Text to filter. The name shadows the built-in ``str`` inside this
      function; it is kept so existing callers are unaffected.

  Returns
  -------
  str
      The text without stop words.
  """
  cachedStopWords = _load_stop_words()
  # remove stop words
  return ' '.join(word for word in str.split() if word not in cachedStopWords)

In [12]:
# Clean text1: strip non-ASCII characters, drop special characters, then remove stop words.
df['new_text1'] = (
    df['text1']
    .apply(non_ascii)
    .apply(remove_special_chars)
    .apply(removeStopWords)
)

In [13]:
# Clean text2: strip non-ASCII characters, drop special characters, then remove stop words.
df['new_text2'] = (
    df['text2']
    .apply(non_ascii)
    .apply(remove_special_chars)
    .apply(removeStopWords)
)

## Tokenization and stemming

In [14]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# One shared Porter stemmer instance; token_stemming() reads this module-level name.
Port_stem = PorterStemmer()

In [15]:
def token_stemming(text):
    """Tokenize ``text`` with ``word_tokenize``, Porter-stem each token and re-join with spaces."""
    stems = []
    for token in word_tokenize(text):
        stems.append(Port_stem.stem(token))
    return ' '.join(stems)

In [16]:
# Stem the cleaned text1.
# NOTE(review): this overwrites its own input column, so re-running the cell
# stems already-stemmed text; re-run the cleaning cell for new_text1 first.
df['new_text1'] = df.new_text1.apply(func= token_stemming)

In [17]:
# Stem the cleaned text2.
# NOTE(review): this overwrites its own input column, so re-running the cell
# stems already-stemmed text; re-run the cleaning cell for new_text2 first.
df['new_text2'] = df.new_text2.apply(func= token_stemming)

In [18]:
# Compare the raw columns with their cleaned and stemmed counterparts.
df.head()

Unnamed: 0,text1,text2,new_text1,new_text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...,broadband challeng tv view number european bro...,garden win doubl glasgow britain jason garden ...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...,rap boss arrest drug find rap mogul marion sug...,amnesti chief lament war failur lack public ou...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...,player burnout worri robinson england coach an...,hank greet wintri premier hollywood star tom h...
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...,heart oak 32 cotonsport heart oak set ghanaian...,redford vision sundanc despit sport corduroy c...
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...,sir paul rock super bowl crowd sir paul mccart...,mauresmo open victori la ameli mauresmo maria ...


# Bag of words  
### Count Vectorization
### Predicting the similarity score with cosine similarity

In [19]:
# Score every (new_text1, new_text2) pair with bag-of-words cosine similarity.
ans = []
for text1, text2 in zip(df['new_text1'], df['new_text2']):
    documents = [text1, text2]
    # A fresh vectorizer per pair: the vocabulary is only the words of these two documents.
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(documents)
    # cosine_similarity accepts the sparse matrix directly; entry [0, 1] is text1 vs text2.
    ans.append(cosine_similarity(sparse_matrix)[0, 1])

# Dense views of the LAST pair only, built once for the inspection cells further down
# instead of once per row.
doc_term_matrix = sparse_matrix.todense()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)
dd = pd.DataFrame(doc_term_matrix, columns=feature_names, index=['x', 'y'])
    

In [None]:
# Dense document-term matrix left in the global namespace by the scoring loop (last pair only).
doc_term_matrix

In [20]:
# Wrap the list of scores in a one-column frame so it can be joined onto df by index.
Ans = pd.DataFrame({'Similarity_Score': ans})

In [21]:
# Attach the scores by index. Dropping any existing Similarity_Score column first
# makes the cell safe to re-run; a plain join would raise on the overlapping column.
df = df.drop(columns=['Similarity_Score'], errors='ignore').join(Ans)

In [22]:
# Ten highest similarity scores, indexed by row label.
large = df.Similarity_Score.nlargest(10)
large

2820    1.000000
2304    1.000000
114     0.507785
226     0.425008
251     0.413211
2200    0.407152
1725    0.394969
667     0.390789
2113    0.376195
837     0.372825
Name: Similarity_Score, dtype: float64

In [23]:
# Ten lowest similarity scores, indexed by row label.
df.Similarity_Score.nsmallest(10)

1034    0.000000
1052    0.000000
1509    0.000000
1773    0.000000
2102    0.000000
2118    0.000000
2266    0.000000
2357    0.002068
2085    0.002392
89      0.002418
Name: Similarity_Score, dtype: float64

In [24]:
# Keep only the processed text columns and the score; the raw text columns are dropped.
dt = df.drop(columns=['text1','text2'])

In [25]:
# Show a bounded preview rather than rendering the whole frame.
dt.head(10)

Unnamed: 0,new_text1,new_text2,Similarity_Score
0,broadband challeng tv view number european bro...,garden win doubl glasgow britain jason garden ...,0.102075
1,rap boss arrest drug find rap mogul marion sug...,amnesti chief lament war failur lack public ou...,0.033114
2,player burnout worri robinson england coach an...,hank greet wintri premier hollywood star tom h...,0.045743
3,heart oak 32 cotonsport heart oak set ghanaian...,redford vision sundanc despit sport corduroy c...,0.026802
4,sir paul rock super bowl crowd sir paul mccart...,mauresmo open victori la ameli mauresmo maria ...,0.077124
5,india deport bollywood actress india order dep...,foster buy stake winemak australian brewer fos...,0.069907
6,mutant book win guardian prize book evolut mut...,jp morgan admit us slaveri link thousand slave...,0.038455
7,aid climat top davo agenda climat chang fight ...,howl help boost japan cinema japan box offic r...,0.060780
8,kennedi predict bigger turnout voter pent pass...,ocean twelv raid box offic ocean twelv crime c...,0.029035
9,carri star patsi rowland die actress patsi row...,uk broadband get speed inject broadband rapid ...,0.077380


In [26]:
# Two short sentences sharing most of their words, used as a manual check of similarity().
sent1 = "hello... i know your favourite fruit is mango"
sent2 = "hello.. i came to know that mango is your favourite fruit"

In [27]:
def similarity(sent1,sent2):
    """Print the bag-of-words cosine similarity of two sentences.

    Each sentence goes through the same steps as the dataset columns: non-ASCII
    removal, special-character removal, lower-casing, stop-word removal, then
    tokenization and stemming. The processed texts are count-vectorized and the
    cosine similarity between the two count vectors is printed.

    Previously the stemmed results were discarded and the raw sentences were
    vectorized, so none of the preprocessing affected the score. Scores printed
    by this version can therefore differ from earlier recorded outputs.

    Parameters
    ----------
    sent1, sent2 : str
        The two texts to compare.

    Returns
    -------
    None
        The score is written to stdout. 0.0 is printed when nothing is left to
        compare after preprocessing.
    """
    documents = []
    for sentence in (sent1, sent2):
        cleaned = lower(remove_special_chars(non_ascii(sentence)))
        documents.append(token_stemming(removeStopWords(cleaned)))

    try:
        sparse_matrix = CountVectorizer(stop_words='english').fit_transform(documents)
    except ValueError:
        # CountVectorizer raises ValueError on an empty vocabulary, i.e. both
        # texts were empty or consisted only of stop words.
        print(0.0)
        return

    # The sparse matrix is scored directly; [0, 1] is sent1 vs sent2.
    print(cosine_similarity(sparse_matrix)[0, 1])

In [28]:
# Score the two sample sentences; the result is printed by similarity().
similarity(sent1,sent2)

0.9128709291752769


In [31]:
# NOTE(review): this is the global `sparse_matrix` left by the dataset scoring loop
# (its last pair). The matrix built inside similarity() is local to that function
# and is not visible from this cell.
sparse_matrix.shape

(2, 229)

In [30]:
# NOTE(review): global left by the dataset scoring loop (its last pair), not the
# matrix for the two sample sentences, which is local to similarity().
doc_term_matrix

matrix([[1, 1, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
         0, 0, 1, 2, 1, 0, 0, 6, 3, 1, 0, 0, 3, 1, 0, 1, 0, 1, 0, 2, 0,
         0, 0, 6, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 4, 1,
         1, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 2, 0, 0, 0, 1, 0, 1, 1, 0, 2, 0, 0, 1, 1, 1, 0, 1, 0, 1,
         1, 0, 2, 3, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 4, 1,
         0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 2, 0, 2, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 2, 0,
         1, 1, 0, 5, 2, 1, 1, 0, 1, 2, 2, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
         5, 2, 1, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0,
         1, 0, 1, 1, 1, 0, 3, 2, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0],
        [0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
         1, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
         2, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0, 

In [None]:
# NOTE(review): global left by the dataset scoring loop (its last pair); the `dd`
# inside similarity() is local to that function.
dd

## Building a web interface with Flask

In [None]:
from flask import Flask, request, render_template
# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)


@app.route("/")
def home():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page

@app.route('/get_text', methods=['POST'])
def get_text():
    """Handle the form POST: score the two submitted texts and re-render the page.

    Reads the ``text1`` and ``text2`` form fields, applies non-ASCII removal,
    special-character removal, lower-casing, stop-word removal and stemming,
    count-vectorizes the processed texts and passes their cosine similarity to
    ``index.html`` as ``prediction``.

    Previously the stemmed results were discarded and the raw form values were
    vectorized, so the preprocessing had no effect on the score.

    A missing field is treated as an empty string. If nothing is left to compare
    after preprocessing, the score is reported as 0.0 instead of raising.
    """
    documents = []
    for field in ("text1", "text2"):
        # .get() with a default avoids an unhandled KeyError on an incomplete form.
        raw = request.form.get(field, "")
        cleaned = lower(remove_special_chars(non_ascii(raw)))
        documents.append(token_stemming(removeStopWords(cleaned)))

    try:
        sparse_matrix = CountVectorizer(stop_words='english').fit_transform(documents)
    except ValueError:
        # CountVectorizer raises ValueError on an empty vocabulary, i.e. both
        # texts were empty or consisted only of stop words.
        sim = 0.0
    else:
        # The sparse matrix is scored directly; [0, 1] is text1 vs text2.
        sim = cosine_similarity(sparse_matrix)[0, 1]
    return render_template('index.html',prediction=sim)
    
    
# Start the development server. In a notebook this call blocks the cell until the
# server is stopped. use_reloader=False turns the auto-reloader off.
# NOTE(review): debug=True turns on the interactive debugger, which can execute
# arbitrary code; keep this bound to localhost and do not deploy it as is.
if __name__ == '__main__':
    app.run(debug = True,use_reloader=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [07/Aug/2022 14:03:37] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/Aug/2022 14:03:55] "[37mPOST /get_text HTTP/1.1[0m" 200 -
