# Loading Libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import pickle
import string
import json
from os.path import exists
import itertools
import collections

### Preprocessing

In [None]:
# Set up the preprocessing tools and an empty dictionary for the processed data.
processed_collection = {}
lemmatizer = WordNetLemmatizer()
# NLTK's English stop words, extended with the "." and "," punctuation tokens.
stop_words = {*stopwords.words('english'), ".", ","}

## Load in Inverted Index

In [None]:
# Load the pickled inverted index from the working directory. The context
# manager closes the file handle as soon as the index has been read.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load an "inverted_index" file that this project produced itself.
with open("inverted_index", "rb") as file_to_read:
    inverted_index = pickle.load(file_to_read)

## Load in Wiki sample dataset

In [None]:
# Read the Wiki sample CSV into a DataFrame; the search code below reads its
# 'content' and 'title' columns.
wiki_sample_path = "C:\\Users\\Josh\\Downloads\\project_1_Wiki_sample.csv"
df = pd.read_csv(wiki_sample_path)

## Load in AOL Query Log

In [None]:
# Load the five tab-separated AOL query-log files into one DataFrame.
# DataFrame.append was removed in pandas 2.0, so the parts are combined with
# pd.concat, which (like the old append calls) keeps each file's own row index.
aol_log_parts = [
    pd.read_csv(
        f"C:\\Users\\Josh\\Desktop\\cs437p1\\project_1_AOL_query_log\\Clean-Data-0{part}.txt",
        sep='\t',
    )
    for part in range(1, 6)
]
df2 = pd.concat(aol_log_parts)
# Normalise the column types used later: queries as strings, times as datetimes.
df2["Query"] = df2["Query"].astype(str)
df2["QueryTime"] = pd.to_datetime(df2["QueryTime"])

In [None]:
def getResults(query, tfidf, min_results=6):
    """Find candidate documents for a free-text query.

    The query is tokenized and every token is lower-cased and lemmatized.
    Candidates start as the documents shared by all query tokens that are
    present in ``tfidf``. While fewer than ``min_results`` candidates exist,
    the documents of the indexed tokens are merged in one token at a time,
    starting with the token that has the fewest documents.

    Args:
        query: Raw query string.
        tfidf: Mapping from token to a collection of document ids (iterating
            ``tfidf[token]`` must yield the ids).
        min_results: Candidate count below which the match is widened from
            "all tokens" towards "any token".

    Returns:
        A ``(documents, tokens)`` tuple: the candidate document ids as a list
        (in no particular order) and the complete list of processed query
        tokens. Both are returned even when no token is indexed, in which
        case the document list is empty.
    """
    query = [lemmatizer.lemmatize(x.lower()) for x in word_tokenize(query)]

    # Only tokens known to the index can contribute documents. De-duplicate
    # while keeping query order so ties between equally rare tokens resolve
    # in the order the user typed them.
    indexed_tokens = list(dict.fromkeys(t for t in query if t in tfidf))
    if not indexed_tokens:
        # set.intersection() needs at least one set; nothing matched anyway.
        return [], query

    candidates = set.intersection(*(set(tfidf[t]) for t in indexed_tokens))

    # Work on a separate list so the token list handed back to the caller is
    # never shortened by the widening loop.
    unchecked_tokens = list(indexed_tokens)
    while len(candidates) < min_results and unchecked_tokens:
        rarest = min(unchecked_tokens, key=lambda t: len(tfidf[t]))
        unchecked_tokens.remove(rarest)
        candidates |= set(tfidf[rarest])
    return list(candidates), query

In [None]:
def ranking(clean_query, list_of_cannidates, tfidf):
    """Score every candidate document against the query terms.

    A document's score starts at 1 and is multiplied by one factor per query
    term. A term that occurs in the document contributes its relative
    frequency in the document times the fraction of all documents that
    ``tfidf`` lists for the term; a term that does not occur contributes a
    small constant penalty instead.

    Args:
        clean_query: Processed (lower-cased, lemmatized) query tokens.
        list_of_cannidates: Document ids to score; id ``d`` is read from
            ``df['content'][d - 1]``.
        tfidf: Mapping from token to a sized collection of document ids.

    Returns:
        Dict mapping each candidate document id to its score.
    """
    total_num_of_docs = len(df)
    term_probs = {}
    for doc in list_of_cannidates:
        # Tokenizing and lemmatizing the document is the expensive step, so do
        # it once per document rather than once per query term.
        doc_tokens = [lemmatizer.lemmatize(x.lower()) for x in word_tokenize(df['content'][doc-1])]
        term_counts = Counter(doc_tokens)
        doc_length = len(doc_tokens)

        score = 1
        for term in clean_query:
            occurrences = term_counts[term]  # Counter yields 0 for unseen terms
            if occurrences == 0:
                score = score * 0.00001 * ((0.00001) / total_num_of_docs)
            else:
                score = score * (occurrences / doc_length) * (len(tfidf[term]) / total_num_of_docs)
        term_probs[doc] = score
    return term_probs

## Load Flask library and needed modules

In [None]:
from flask import Flask, render_template,request, redirect, request

## The search-engine app

In [None]:
# Create the Flask application object that the route decorators below register on.
app = Flask(__name__)
from itertools import islice

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    leading_items = islice(iterable, n)
    return [item for item in leading_items]

@app.route('/')
def home():
    """Render the ``index.html`` template for requests to the site root."""
    return render_template("index.html")

@app.route('/ajax', methods = ['POST'])
def returnHome():
    """Redirect a POST to ``/ajax`` back to the home page.

    The redirect targets ``/``, which is the route that renders
    ``index.html``. A relative ``index.html`` target would resolve to
    ``/index.html``, for which this app registers no route.
    """
    print("homeeee")
    return redirect("/")

@app.route('/suggest', methods = ['POST'])
def suggest():
    """Suggest up to five logged queries that extend the submitted prefix.

    Reads the ``myQuery`` form field, keeps the logged queries that start with
    it but are not identical to it, counts how often each one occurs, and
    returns the five most frequent as a dict mapping row label to query text.
    """
    prefix = request.form['myQuery']
    logged = df2['Query']
    # Keep strict completions only: same prefix, but not the prefix itself.
    is_completion = logged.str.startswith(prefix) & (logged != prefix)
    by_frequency = (
        df2.loc[is_completion]
        .groupby(['Query'])
        .size()
        .reset_index(name='freq')
        .sort_values(by=['freq'], ascending=False)
    )
    top_five = by_frequency.head(5)
    return top_five.to_dict()['Query']

def build_results(data):
    """Turn a ``{doc_id: score}`` mapping into display records, best score first.

    Each document id ``d`` maps to a dict holding the ``content`` and
    ``title`` values found at position ``d - 1`` of the corresponding ``df``
    columns. The returned dict is ordered by descending score.
    """
    ranked = sorted(data.items(), key=lambda pair: pair[1], reverse=True)
    return {
        doc_id: {"content": df['content'][doc_id - 1], "title": df['title'][doc_id - 1]}
        for doc_id, _score in ranked
    }

@app.route('/results')
def results(query=None):
    """Run a search and render the results page.

    Args:
        query: Raw query string. It is passed in directly when this view is
            called from the form handler. The ``/results`` route has no URL
            variable, so for a direct request the query is read from the
            ``myQuery`` query-string parameter instead.

    Returns:
        The rendered ``results.html`` page, or the ``index.html`` page when
        the query is missing or blank (there is nothing to search for).
    """
    if query is None:
        query = request.args.get('myQuery', '')
    if not query.strip():
        return render_template("index.html")

    print("resultsss", query)
    list_of_cannidates, clean_query = getResults(query,inverted_index)
    unsorted_dictionary_of_doc_probs = ranking(clean_query, list_of_cannidates, inverted_index)
    # Named differently from the view so the function name is not shadowed.
    ranked_results = build_results(unsorted_dictionary_of_doc_probs)
    return render_template("results.html", query=clean_query, results = ranked_results, number_of_results = len(ranked_results))

@app.route('/', methods=['POST'])
def getQuery():
    """Handle the search form POST: read the ``myQuery`` field and return the results view for it."""
    query = request.form['myQuery']
    return results(query)
    
# Start the Flask server only when this file is executed as a script;
# host "0.0.0.0" and port 5505 are passed straight to app.run.
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5505)