This is the inverted index part of this project.

Some current thoughts
1. read in the json file
2. pre-process the ingredients
   1. this might be sample data for now; we'll need Jess's cleaned data.
3. build the inverted index for ingredient
4. save the inverted index


some of the data requirements:
1. no numbers
2. no stopwords
3. clean "Advertisement"
4. no filler words such as "with", "next", "and", etc. -- are these already covered by the stop-word list?
5. no units (maybe), such as cups, pounds, etc.
6. no punctuation
7. tokenization
8. stemming
9. no capital letters (if not necessary)
10. save it as a clean dataset

In [1]:
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string

class InvertedIndex:
    """Inverted index from stemmed ingredient tokens to the files containing them.

    Attributes:
        DFpostings: dict mapping each stemmed token to the list of file ids
            whose ingredients contain that token (one entry per indexed file).
    """

    # Characters that are treated as word separators during clean-up.
    _PUNCTUATION = frozenset(string.punctuation)

    # NLTK resources are created lazily on first real use and then reused,
    # instead of being rebuilt on every preProcess() call.
    _stemmer = None
    _stop_words = None

    def __init__(self):
        self.DFpostings = {}

    def preProcess(self, text):
        """Turn raw text into a list of stemmed, filtered tokens.

        The text is lower-cased, punctuation and digits are replaced by
        spaces, the result is tokenized, English stop words and
        one-character tokens are dropped, and the rest is stemmed.

        Args:
            text: the raw string to clean.

        Returns:
            A list of stemmed tokens (may contain duplicates); empty when
            nothing but punctuation, digits and whitespace was given.
        """
        # Replace (rather than delete) punctuation and digits so that
        # neighbouring words are not fused: "salt,pepper" must yield
        # "salt" and "pepper", not "saltpepper".
        cleaned = ''.join(
            ' ' if ch in self._PUNCTUATION or ch.isdigit() else ch
            for ch in text.lower()
        )
        if not cleaned.strip():
            # Nothing left to tokenize; skip loading the NLTK resources.
            return []

        if self._stemmer is None:
            self._stemmer = SnowballStemmer("english")
            self._stop_words = frozenset(stopwords.words('english'))

        stem = self._stemmer.stem
        stop_words = self._stop_words
        return [
            stem(word)
            for word in word_tokenize(cleaned)
            if word not in stop_words and len(word) > 1
        ]

    def indexFile(self, ingredients, fileId):
        """Add one file's ingredients to the postings.

        Args:
            ingredients: iterable of ingredient strings; they are joined
                with spaces and pre-processed as one text.
            fileId: identifier appended to the posting list of every
                distinct token found in the ingredients.
        """
        # set(): each file is recorded at most once per token.
        for token in set(self.preProcess(' '.join(ingredients))):
            self.DFpostings.setdefault(token, []).append(fileId)

    def save(self):
        """Write the postings to ``DFPostings.json`` in the working directory."""
        with open("DFPostings.json", "w", encoding="utf-8") as file:
            json.dump(self.DFpostings, file)

#load json data
#JSON text is UTF-8, so decode it explicitly instead of relying on the
#platform's default encoding
with open('./sample_data.json', 'r', encoding='utf-8') as file:
    recipes = json.load(file)

#initialize inverted index
index = InvertedIndex()

#index each recipe's ingredients
#(recipes is used as a mapping: recipe id -> dict with an 'ingredients' entry)
for recipe_id, recipe_info in recipes.items():
    index.indexFile(recipe_info['ingredients'], recipe_id)

#save the index to a file
index.save()


In [4]:
### This cell is only an experiment.

import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import string

class InvertedIndex:
    """Inverted index over recipe ingredients with a simple OR search.

    Attributes:
        DFpostings: dict mapping each stemmed token to the list of recipe
            ids whose ingredients contain that token.
        recipes: dict mapping recipe id to a dict holding that recipe's
            'ingredients' and 'instructions', kept for result display.
    """

    # Characters that are treated as word separators during clean-up.
    _PUNCTUATION = frozenset(string.punctuation)

    # NLTK resources are created lazily on first real use and then reused,
    # instead of being rebuilt on every preProcess() call.
    _stemmer = None
    _stop_words = None

    def __init__(self):
        self.DFpostings = {}
        self.recipes = {}  #store full recipe details for retrieval

    def preProcess(self, text):
        """Turn raw text into a list of stemmed, filtered tokens.

        The text is lower-cased, punctuation and digits are replaced by
        spaces, the result is tokenized, English stop words and
        one-character tokens are dropped, and the rest is stemmed.

        Args:
            text: the raw string to clean.

        Returns:
            A list of stemmed tokens (may contain duplicates); empty when
            nothing but punctuation, digits and whitespace was given.
        """
        # Replace (rather than delete) punctuation and digits so that
        # neighbouring words are not fused: a query typed as "egg,milk"
        # must yield "egg" and "milk", not "eggmilk".
        cleaned = ''.join(
            ' ' if ch in self._PUNCTUATION or ch.isdigit() else ch
            for ch in text.lower()
        )
        if not cleaned.strip():
            # Nothing left to tokenize; skip loading the NLTK resources.
            return []

        if self._stemmer is None:
            self._stemmer = SnowballStemmer("english")
            self._stop_words = frozenset(stopwords.words('english'))

        stem = self._stemmer.stem
        stop_words = self._stop_words
        return [
            stem(word)
            for word in word_tokenize(cleaned)
            if word not in stop_words and len(word) > 1
        ]

    def indexRecipes(self, recipes):
        """Index every recipe and remember its details for later retrieval.

        Args:
            recipes: mapping of recipe id to a dict with an 'ingredients'
                entry (iterable of strings) and an 'instructions' entry.

        Raises:
            KeyError: if a recipe lacks 'ingredients' or 'instructions'.
        """
        for recipe_id, recipe_info in recipes.items():
            processed_ingredients = self.preProcess(' '.join(recipe_info['ingredients']))
            self.recipes[recipe_id] = {
                'ingredients': recipe_info['ingredients'],
                'instructions': recipe_info['instructions'],
            }
            # set(): each recipe is recorded at most once per token.
            for token in set(processed_ingredients):
                self.DFpostings.setdefault(token, []).append(recipe_id)

    def saveIndex(self):
        """Write the postings and the stored recipes to JSON files.

        Produces ``DFPostings.json`` and ``Recipes.json`` in the working
        directory.
        """
        with open("DFPostings.json", "w", encoding="utf-8") as file:
            json.dump(self.DFpostings, file)
        with open("Recipes.json", "w", encoding="utf-8") as file:
            json.dump(self.recipes, file)

    def search(self, query):
        """Return the recipes that match at least one token of the query.

        Args:
            query: free-text query; it goes through the same pre-processing
                as the indexed ingredients.

        Returns:
            A list of ``(ingredients, instructions)`` tuples, one per
            matching recipe, in the order the recipes were first matched.
        """
        # A dict is used as an ordered set so that each recipe appears once
        # and the result order is reproducible between runs.
        matched_recipes = {}
        for token in self.preProcess(query):
            for recipe_id in self.DFpostings.get(token, ()):
                matched_recipes.setdefault(recipe_id, None)
        return [
            (self.recipes[recipe_id]['ingredients'], self.recipes[recipe_id]['instructions'])
            for recipe_id in matched_recipes
        ]

#load and index recipes
#JSON text is UTF-8, so decode it explicitly instead of relying on the
#platform's default encoding
with open('./sample_data.json', 'r', encoding='utf-8') as file:
    recipes = json.load(file)
index = InvertedIndex()
index.indexRecipes(recipes)
index.saveIndex()

# user query input
user_input = input("Enter ingredients separated by commas: ")
results = index.search(user_input)

#display results
if not results:
    #tell the user explicitly instead of printing nothing at all
    print("No matching recipes found.")
for ingredients, instructions in results:
    print("Ingredients:", ingredients)
    print("Instructions:", instructions)
    print("\n---\n")


AttributeError: 'InvertedIndex' object has no attribute 'head'