In [63]:
import pandas as pd
import numpy as np 
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yousef\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yousef\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [64]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os
import string
import copy
import pickle 


In [65]:
title = "20_newsgroups"
os.chdir("D:/20_newsgroups/20_newsgroups")
paths = [] 
for (dirpath, dirnames, filenames) in os.walk(str(os.getcwd())): 
    for i in filenames:  
        paths.append(str(dirpath)+str("\\")+i)
    
    
print(dirpath)

D:\20_newsgroups\20_newsgroups


In [66]:
print(paths[0])

D:\20_newsgroups\20_newsgroups\49960


In [67]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = " "
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
            return np.char.strip(new_text)

In [68]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data  

In [69]:
def convert_lower_case(data):
    return np.char.lower(data)

In [70]:
def stemming(data):
    stemmer= PorterStemmer()

    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
        return np.char.strip(new_text) 

In [71]:
def convert_numbers(data):
    data = np.char.replace(data, "0", " zero ")
    data = np.char.replace(data, "1", " one ")
    data = np.char.replace(data, "2", " two ")
    data = np.char.replace(data, "3", " three ")
    data = np.char.replace(data, "4", " four ")
    data = np.char.replace(data, "5", " five ")
    data = np.char.replace(data, "6", " six ")
    data = np.char.replace(data, "7", " seven ")
    data = np.char.replace(data, "8", " eight ")
    data = np.char.replace(data, "9", " nine ")
    return data

In [72]:
def remove_header(data):
    try:
        ind = data.index('\n\n')
        data = data[ind:]
    except:
         print("No Header")
    return data 


In [73]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [74]:
def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
         if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)

In [75]:
def preprocess(data,query): 
    data = remove_header(data)
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_stop_words(data)
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = stemming(data)
    return data 

In [76]:
doc = 0
postings = pd.DataFrame()
for path in paths:
    file = open(path, 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    preprocessed_text = preprocess(text, False)

    if doc%100 == 0:
        print(doc)

    tokens = word_tokenize(str(preprocessed_text))
    for token in tokens:
        if token in postings:
            p = postings[token][0]
            p.add(doc)
            postings[token][0] = p 
        else:
            postings.insert(value=[{doc}], loc=0,column=token)
    doc += 1

0
100
200
300
400
500
600
700
800
No Header
900


In [77]:
postings


Unnamed: 0,rh,mc,br,media,start,free,dean,stuff,default,93106,...,bobb,wpr,sandvik,livesey,kmr4,refer,arromde,dmn,articl,archiv
0,{988},{987},"{985, 986}",{982},{979},{976},{971},{967},{961},{941},...,"{295, 43, 14, 15, 16, 155, 989}","{12, 340}","{866, 294, 839, 11, 907}","{386, 324, 325, 838, 10, 842, 332, 367, 368, 3...","{293, 614, 264, 9, 841, 302, 244, 958, 310, 53...","{8, 179, 45}","{6, 745, 330, 13, 920}",{3},"{2, 4, 5, 7, 17, 18, 19, 20, 22, 23, 24, 27, 2...","{0, 1, 900, 901, 203, 204, 205}"


In [78]:
postings["default"][0]


{961}

In [79]:
def generate_command_tokens(query):
    query = query.lower()
    tokens = word_tokenize(query)
    
    commands = []
    query_words = []
    
    for t in tokens:
        if t not in ['and', 'or', 'not']:
            preprocessed_word = preprocess([t], True)
            print(str(preprocessed_word))
            query_words.append(str(preprocessed_word))
        else:
            commands.append(t)
            
    return commands, query_words

In [80]:
def gen_not_tuple(query_words, commands):

    tup = []

    while 'not' in commands:

        i = commands.index('not')

        word = query_words[i]

        word_postings = get_not(word)

        tup.append(word_postings)

        commands.pop(i)

        query_words[i] = i

        print('\nAfter Not Processing:', commands, query_words)

    return tup


def binary_operations(query_words, commands, tup):

    a = postings[query_words[0]][0]

    query_words.pop(0)

    for i in range(len(commands)):

        if type(query_words[i]) == int:

            b = tup.pop(0)
        else:

            b = postings[query_words[i]][0]

        if commands[i] == 'and':

            a = a.intersection(b)
        elif commands[i] == 'or':

            a = a.union(b)
        else:

            print('Invalid Command')

    return a


def execute_query(query):

    (commands, query_words) = generate_command_tokens(query)

    tup = gen_not_tuple(query_words, commands)

    print('\nCommands:', commands)

    print('\nQuery Words:', query_words)

    print('\nTup:', len(tup))

    final_set = binary_operations(query_words, commands, tup)

    print('\nFinal Set:', final_set)

    return final_set


def print_file(file):

    out_file = open(paths[file], 'r', encoding='cp1250')

    out_text = out_file.read()

    print(out_text)

In [81]:
query="exam and not resource"


In [82]:
lists = execute_query(query)


No Header
None
No Header
None

After Not Processing: ['and'] ['None', 1]

Commands: ['and']

Query Words: ['None', 1]

Tup: 1

Final Set: set()
