In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os

# Fetch the NLTK data packages this script relies on (tokenizer data and the
# stop-word lists).
for resource in ('punkt_tab', 'stopwords', 'punkt'):
    nltk.download(resource)

# Shared preprocessing tools: a Porter stemmer and the English stop words,
# held in a set so membership checks during filtering are cheap.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Corpus files to index, listed in the order they are processed.
file_paths = [
    'corpus/doc1.txt',
    'corpus/doc2.txt',
    'corpus/doc3.txt',
    'corpus/doc4.txt',
    'corpus/doc5.txt',
]

# Document identifiers used in the posting lists: each file's base name
# without its extension. Deriving them from file_paths keeps the two lists
# the same length and in the same order, so adding or removing a corpus file
# cannot leave a document unnamed or mislabeled.
doc_names = [
    os.path.splitext(os.path.basename(path))[0]
    for path in file_paths
]

def _preprocess(path):
    """Return the stemmed, filtered tokens of the text file at *path*.

    The file is read as UTF-8 and lower-cased, then split with
    ``word_tokenize``. Tokens that are not purely alphabetic or that appear
    in ``stop_words`` are dropped, and the remaining tokens are stemmed with
    the module-level ``stemmer``.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the file as soon as its text has been read,
    # rather than leaving the handle open until garbage collection.
    with open(path, encoding='utf-8') as handle:
        text = handle.read().lower()

    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token.isalpha() and token not in stop_words
    ]


# Process documents -> tokenize + keep alphabetic + drop stop words + stem.
# One token list per entry of file_paths, in the same order.
processed_docs = [_preprocess(path) for path in file_paths]

# Inverted index: maps each stemmed term to the names of the documents
# that contain it.
inverted_index = {}

for doc_name, words in zip(doc_names, processed_docs):
    # Iterating a set of the tokens drops repeats, so a document is recorded
    # at most once per term.
    for word in set(words):
        inverted_index.setdefault(word, []).append(doc_name)

# Put every posting list into ascending order, in place.
for postings in inverted_index.values():
    postings[:] = sorted(postings)

# Print the index, one line per term, as: "term": [documents]
print("Inverted Index:\n")
for term in inverted_index:
    docs = inverted_index[term]
    print(f'"{term}": {docs}')


Inverted Index:

"carbon": ['doc1']
"emiss": ['doc1']
"acceler": ['doc1']
"due": ['doc1']
"global": ['doc1']
"increas": ['doc1']
"warm": ['doc1']
"photosynthesi": ['doc2']
"sunlight": ['doc2']
"process": ['doc2']
"green": ['doc2']
"plant": ['doc2']
"convert": ['doc2']
"among": ['doc3']
"python": ['doc3']
"scientist": ['doc3']
"data": ['doc3']
"popular": ['doc3']
"versatil": ['doc3']
"languag": ['doc3']
"program": ['doc3']
"begun": ['doc4']
"time": ['doc4']
"best": ['doc4']
"revolut": ['doc4']
"reign": ['doc4']
"pari": ['doc4']
"french": ['doc4']
"chao": ['doc4']
"street": ['doc4']
"worst": ['doc4']
"howev": ['doc5']
"amaz": ['doc5']
"phone": ['doc5']
"batteri": ['doc5']
"camera": ['doc5']
"disappoint": ['doc5']
"condit": ['doc5']
"qualiti": ['doc5']
"life": ['doc5']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
