# Boolean Information Retrieval

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint


In [2]:
# Widen pandas' display limits so the term-document tables render in full.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)


In [3]:
# Create binary Bag of Words from sentences or use predetermined binary Bag of Words.


In [4]:
# Selects the corpus: 'sentence' builds the matrix from the raw sentences
# below; any other value uses the hand-written example matrix.
DATA = 'sentence'

if DATA == 'sentence':
    # Create binary Bag of Words from sentences
    docs = [
        'Who hates potato',
        'French Fries is made with potato',
        'French Fries usually goes with Cola'
    ]
    # binary=True records term presence (1) / absence (0) rather than counts.
    vectorizer = CountVectorizer(lowercase=True, stop_words=None, binary=True)

    freqs = vectorizer.fit_transform(docs).toarray()
    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back only on releases that predate it.
    # .tolist() keeps `words` a plain list of str, as before.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()
    total_doc = len(docs)

    pprint(freqs)
    pprint(words)
else:
    # Use predetermined binary Bag of Words
    words = ['car', 'auto', 'insurance', 'best']
    freqs = [
        [1, 1, 0, 1],  # doc1
        [1, 1, 1, 0],  # doc2
        [1, 0, 1, 1],  # doc3
    ]
    total_doc = len(freqs)


array([[0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1],
       [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)
['cola',
 'french',
 'fries',
 'goes',
 'hates',
 'is',
 'made',
 'potato',
 'usually',
 'who',
 'with']


In [5]:
# Build the term-document incidence table in one constructor call instead of
# appending one row at a time. Rows are labelled 1..N (one per entry of
# `freqs`) and columns are the vocabulary terms in `words`.
df = pd.DataFrame(
    freqs,
    index=range(1, len(freqs) + 1),
    columns=words,
    dtype=np.int32,
)
df


Unnamed: 0,cola,french,fries,goes,hates,is,made,potato,usually,who,with
1,0,0,0,0,1,0,0,1,0,1,0
2,0,1,1,0,0,1,1,1,0,0,1
3,1,1,1,1,0,0,0,0,1,0,1


In [6]:
def logic_creator(new_col, df: pd.DataFrame, col1, col2, mode='and',
                  col1_not=False, col2_not=False) -> pd.DataFrame:
    """Append a column holding a boolean combination of two existing columns.

    Each operand is considered true for a row when its column equals 1, or,
    if the matching ``*_not`` flag is set, when its column equals 0. The two
    operands are combined with AND or OR and stored as integers (1 / 0).

    Args:
        new_col: Name of the column to append.
        df: Source table; it is not modified.
        col1: Name of the left operand column.
        col2: Name of the right operand column.
        mode: ``'and'`` or ``'or'``.
        col1_not: Negate the left operand when truthy.
        col2_not: Negate the right operand when truthy.

    Returns:
        A new DataFrame: ``df`` with ``new_col`` appended as the last column.

    Raises:
        ValueError: If ``mode`` is neither ``'and'`` nor ``'or'``.
    """
    # Reject unknown modes up front; otherwise the log line would claim "|"
    # and the new column would carry no values.
    if mode not in ('and', 'or'):
        raise ValueError(f"mode must be 'and' or 'or', got {mode!r}")

    left_label = f'{"NOT " if col1_not else ""}{col1}'
    right_label = f'{"NOT " if col2_not else ""}{col2}'
    operator = '&' if mode == 'and' else '|'
    print(f'Creating logic -> {left_label} {operator} {right_label}')

    # A plain operand matches cells equal to 1, a negated one cells equal to 0.
    left = df[col1] == (0 if col1_not else 1)
    right = df[col2] == (0 if col2_not else 1)

    combined = (left & right) if mode == 'and' else (left | right)
    result = combined.astype(int).rename(new_col)

    return pd.concat([df, result], axis=1, sort=False)


# Query 1 : ('potato' AND NOT 'hates') OR 'cola'

In [7]:
# Subquery 1: potato AND NOT hates
df_q1 = logic_creator(
    new_col='subquery1',
    df=df,
    col1='potato',
    col2='hates',
    mode='and',
    col1_not=False,
    col2_not=True,
)
df_q1


Creating logic -> potato & NOT hates
  sr = pd.Series()


Unnamed: 0,cola,french,fries,goes,hates,is,made,potato,usually,who,with,subquery1
1,0,0,0,0,1,0,0,1,0,1,0,0
2,0,1,1,0,0,1,1,1,0,0,1,1
3,1,1,1,1,0,0,0,0,1,0,1,0


In [8]:
# Final answer for query 1: subquery1 OR cola
df_q1 = logic_creator(
    new_col='result',
    df=df_q1,
    col1='subquery1',
    col2='cola',
    mode='or',
)
df_q1


Creating logic -> subquery1 | cola
  sr = pd.Series()


Unnamed: 0,cola,french,fries,goes,hates,is,made,potato,usually,who,with,subquery1,result
1,0,0,0,0,1,0,0,1,0,1,0,0,0
2,0,1,1,0,0,1,1,1,0,0,1,1,1
3,1,1,1,1,0,0,0,0,1,0,1,0,1


# Query 2 : ('french' AND 'fries') OR ('cola' AND NOT 'with')

In [9]:
# Subquery 1: french AND fries
df_q2 = logic_creator(
    new_col='subquery1',
    df=df,
    col1='french',
    col2='fries',
)
df_q2


Creating logic -> french & fries
  sr = pd.Series()


Unnamed: 0,cola,french,fries,goes,hates,is,made,potato,usually,who,with,subquery1
1,0,0,0,0,1,0,0,1,0,1,0,0
2,0,1,1,0,0,1,1,1,0,0,1,1
3,1,1,1,1,0,0,0,0,1,0,1,1


In [10]:
# Subquery 2: cola AND NOT with
df_q2 = logic_creator(
    new_col='subquery2',
    df=df_q2,
    col1='cola',
    col2='with',
    col2_not=True,
)
df_q2


Creating logic -> cola & NOT with
  sr = pd.Series()


Unnamed: 0,cola,french,fries,goes,hates,is,made,potato,usually,who,with,subquery1,subquery2
1,0,0,0,0,1,0,0,1,0,1,0,0,0
2,0,1,1,0,0,1,1,1,0,0,1,1,0
3,1,1,1,1,0,0,0,0,1,0,1,1,0


In [11]:
# Final answer for query 2: subquery1 OR subquery2
df_q2 = logic_creator(
    new_col='result',
    df=df_q2,
    col1='subquery1',
    col2='subquery2',
    mode='or',
)
df_q2


Creating logic -> subquery1 | subquery2
  sr = pd.Series()


Unnamed: 0,cola,french,fries,goes,hates,is,made,potato,usually,who,with,subquery1,subquery2,result
1,0,0,0,0,1,0,0,1,0,1,0,0,0,0
2,0,1,1,0,0,1,1,1,0,0,1,1,0,1
3,1,1,1,1,0,0,0,0,1,0,1,1,0,1
