In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     D:\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     D:\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     D:\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     D:\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
document1 = "A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all."
document2 = "MALE LATE 50''s AUST Single, tall, prof. Interests: Music, theatre, dining, art, the beach and the environment. Seeking female with similar interests to share concerts, dining etc."
document3 = "GENUINE AND HONEST Hi Im 44 with a good sense of humour, am romantic and love drives, fishing, camping and music. Love my 2 kids. Am looking for a lady with similar interests, aged between 38-45 for friendship/ possible relationship."

In [3]:
docs = [document1, document2, document3]
docs

['A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.',
 "MALE LATE 50''s AUST Single, tall, prof. Interests: Music, theatre, dining, art, the beach and the environment. Seeking female with similar interests to share concerts, dining etc.",
 'GENUINE AND HONEST Hi Im 44 with a good sense of humour, am romantic and love drives, fishing, camping and music. Love my 2 kids. Am looking for a lady with similar interests, aged between 38-45 for friendship/ possible relationship.']

### Text preprocessing :
- Tokenization
- Lower casing
- Remove stopwords and punctuation
- Part of speech tagging
- Lemmatizing  

In [15]:
def text_preprocessing(corpus) :
    print(corpus)
    # Tokenize the given document
    tokenized = word_tokenize(corpus)
    
    # Lower case all words
    tokenized = [word.lower() for word in tokenized]
    
    # Remove stopwords
    stopwords_en = set(stopwords.words("english"))
    stopwords_en = stopwords_en.union(set(punctuation))
    tokenized_without_sw = [word for word in tokenized if not word in stopwords_en]
    
    # Stemming 
    #porter = PorterStemmer()
    #tokenized_without_sw = [porter.stem(word) for word in tokenized_without_sw]
    
    # POS tagging 
    doc_tagged = pos_tag(tokenized_without_sw)
    
    # Lemmatizer
    wnl = WordNetLemmatizer()
    result = [wnl.lemmatize(word, pos=penn2morphy(tag[:2])) for word, tag in doc_tagged]
    
    return result

In [5]:
def penn2morphy(tag) : 
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[tag]
    except:
        return 'n' # if mapping isn't found, fall back to Noun.

In [6]:
docs_preprocessed = []
for doc in docs : 
    docs_preprocessed.append(text_preprocessing (doc))
    
docs_preprocessed

A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.
[('professional', 'JJ'), ('business', 'NN'), ('male', 'NN'), ('late', 'JJ'), ('40s', 'CD'), ('6', 'CD'), ('feet', 'NNS'), ('tall', 'JJ'), ('slim', 'JJ'), ('build', 'NN'), ('well', 'RB'), ('groomed', 'VBD'), ('great', 'JJ'), ('personality', 'NN'), ('home', 'NN'), ('owner', 'NN'), ('interests', 'NNS'), ('include', 'VBP'), ('arts', 'NNS'), ('travel', 'JJ'), ('things', 'NNS'), ('good', 'JJ'), ('ringwood', 'NN'), ('area', 'NN'), ('seeking', 'VBG'), ('genuine', 'JJ'), ('female', 'NN'), ('similar', 'JJ'), ('age', 'NN'), ('older', 'JJR'), ('area', 'NN'), ('surrounds', 'NNS'), ('meaningful', 'JJ'), ('long', 'JJ'), ('term', 'NN'), ('rship', 'NN'), ('looking', 'VBG'),

[['professional',
  'business',
  'male',
  'late',
  '40',
  '6',
  'foot',
  'tall',
  'slim',
  'build',
  'well',
  'groom',
  'great',
  'personality',
  'home',
  'owner',
  'interest',
  'include',
  'art',
  'travel',
  'thing',
  'good',
  'ringwood',
  'area',
  'seek',
  'genuine',
  'female',
  'similar',
  'age',
  'old',
  'area',
  'surround',
  'meaningful',
  'long',
  'term',
  'rship',
  'look',
  'forward',
  'hear'],
 ['male',
  'late',
  '50',
  "''",
  'aust',
  'single',
  'tall',
  'prof',
  'interest',
  'music',
  'theatre',
  'din',
  'art',
  'beach',
  'environment',
  'seek',
  'female',
  'similar',
  'interest',
  'share',
  'concert',
  'din',
  'etc'],
 ['genuine',
  'honest',
  'hi',
  'im',
  '44',
  'good',
  'sense',
  'humour',
  'romantic',
  'love',
  'drive',
  'fish',
  'camp',
  'music',
  'love',
  '2',
  'kid',
  'look',
  'lady',
  'similar',
  'interest',
  'age',
  '38-45',
  'friendship/',
  'possible',
  'relationship']]

### Term-document matrix

In [7]:
sum([len(doc) for doc in docs_preprocessed])

88

In [8]:
uniq_terms = set()
for doc in docs_preprocessed :
    uniq_terms.update([term for term in doc])
len(uniq_terms)

69

Construct term-document matrix

In [9]:
doc_term_matrix = {}

for term in uniq_terms :
    doc_term_matrix[term] = []
    for doc in docs_preprocessed : 
        if term in doc :
            doc_term_matrix[term].append(1)
        else :
            doc_term_matrix[term].append(0)
            
doc_term_matrix

{'beach': [0, 1, 0],
 'hi': [0, 0, 1],
 'friendship/': [0, 0, 1],
 'possible': [0, 0, 1],
 'personality': [1, 0, 0],
 'meaningful': [1, 0, 0],
 'well': [1, 0, 0],
 'theatre': [0, 1, 0],
 'camp': [0, 0, 1],
 'tall': [1, 1, 0],
 'art': [1, 1, 0],
 '50': [0, 1, 0],
 'build': [1, 0, 0],
 '44': [0, 0, 1],
 'slim': [1, 0, 0],
 'romantic': [0, 0, 1],
 'seek': [1, 1, 0],
 '2': [0, 0, 1],
 'long': [1, 0, 0],
 'male': [1, 1, 0],
 '40': [1, 0, 0],
 'hear': [1, 0, 0],
 'share': [0, 1, 0],
 'aust': [0, 1, 0],
 'etc': [0, 1, 0],
 'genuine': [1, 0, 1],
 'sense': [0, 0, 1],
 'professional': [1, 0, 0],
 'im': [0, 0, 1],
 'love': [0, 0, 1],
 'drive': [0, 0, 1],
 'fish': [0, 0, 1],
 'honest': [0, 0, 1],
 'rship': [1, 0, 0],
 'similar': [1, 1, 1],
 'single': [0, 1, 0],
 'business': [1, 0, 0],
 'surround': [1, 0, 0],
 'forward': [1, 0, 0],
 'din': [0, 1, 0],
 "''": [0, 1, 0],
 'thing': [1, 0, 0],
 'age': [1, 0, 1],
 'humour': [0, 0, 1],
 'great': [1, 0, 0],
 'groom': [1, 0, 0],
 'home': [1, 0, 0],
 'term':

In [19]:
import numpy as np
query = "Seek AND dining"
query = query.split(" AND ")

query 

['Seek', 'dining']

In [20]:
_query = []
_query.append(text_preprocessing(query[0])[0])
_query.append(text_preprocessing(query[1])[0])
_query

Seek
dining


['seek', 'din']

In [22]:
v1 = np.array(doc_term_matrix[_query[0]])
v2 = np.array(doc_term_matrix[_query[1]])

v3 = v1 & v2
print(v1, v2)
v3

[1 1 0] [0 1 0]


array([0, 1, 0], dtype=int32)

In [23]:
def get_doc_indexes(vector):
    indexes = []
    for i, v in enumerate(vector) :
        if v :
            indexes.append(i)
    return indexes

In [24]:
indexes = get_doc_indexes(v3)

In [25]:
for i in indexes :
    print(docs[i])

MALE LATE 50''s AUST Single, tall, prof. Interests: Music, theatre, dining, art, the beach and the environment. Seeking female with similar interests to share concerts, dining etc.
