In [1]:
from IPython.core.display import display, HTML

In [2]:
import os
import re
from pathlib import Path

import pandas as pd
import numpy as np

from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from bs4 import BeautifulSoup

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [4]:
class TF_IDF:

    corpus_file_paths = None
    corpus_base_file_path = None

    maximum_features = 10000
    minimum_document_frequency = 1
    maximum_document_frequency = None
    ngram_range = (1,2)
    stop_words = 'english'
    documents_encoding = 'utf-8'
    documents_in_delimeter = ', '
    sentences_in_delimeter = '\n\n'

    documents_df = None
    sentences_df = None
    term_occurrences_df = None
    term_weights_df = None
    term_frequency_inverse_document_frequency_df = None
    dataframe = None

    def __init__(self,
                 corpus_base_file_path,
                 corpus_file_extenions
                 ):

        self.corpus_base_file_path = corpus_base_file_path

        self.corpus_file_extenions = corpus_file_extenions

        self.set_corpus_file_paths_list()

        self.set_maximum_document_frequency()


    def set_corpus_file_paths_list(self):

        files = os.listdir(self.corpus_base_file_path)

        paths = []

        for file in files:

            if Path(file).suffix in self.corpus_file_extenions:

                paths.append(file)

        self.corpus_file_paths = files

        return files

    def set_maximum_document_frequency(self):

        self.maximum_document_frequency = round(len(self.corpus_file_paths) * .99)

        return self.maximum_document_frequency

    def documents(self):

        documents = []

        for filename in self.corpus_file_paths:

            with open(os.path.join(self.corpus_base_file_path, filename), 'r', encoding=self.documents_encoding) as file:

                soup = BeautifulSoup(file.read())

                documents.append({'source': filename, 'text': soup.get_text()})

        dataframe = pd.DataFrame(documents)

        self.documents_df = dataframe

        return self


    def sentences(self):

        sentences = []

        for filename in self.corpus_file_paths:

            with open(os.path.join(self.corpus_base_file_path, filename), 'r', encoding=self.documents_encoding) as file:

                soup = BeautifulSoup(file.read())

                for i in sent_tokenize(soup.get_text()):

                    sentences.append({'text': i})

        dataframe = pd.DataFrame(sentences)

        self.sentences_df = dataframe

        return self

    def term_occurrences(self):

        count_vectorizer = CountVectorizer(
            min_df=self.minimum_document_frequency,
            max_df=self.maximum_document_frequency,
            stop_words=self.stop_words,
            ngram_range=self.ngram_range
        )

        count_vectorizer.fit(self.sentences_df.text)

        count_vectorizer_occurrences = count_vectorizer.transform(self.sentences_df.text)

        dataframe = pd.DataFrame({'term': count_vectorizer.get_feature_names(), 'occurrences': np.asarray(count_vectorizer_occurrences.sum(axis=0)).ravel().tolist()})
        
        self.term_occurrences_df = dataframe

        return self

    def term_weights(self):

        tfidf_vectorizer = TfidfVectorizer(
            min_df=self.minimum_document_frequency,
            max_df=self.maximum_document_frequency,
            stop_words=self.stop_words,
            ngram_range=self.ngram_range,
            max_features=self.maximum_features
        )

        tfidf_vectorizer_weights = tfidf_vectorizer.fit_transform(self.sentences_df.text.dropna())

        dataframe = pd.DataFrame({'term': tfidf_vectorizer.get_feature_names(), 'weight': np.asarray(tfidf_vectorizer_weights.mean(axis=0)).ravel().tolist()})

        self.term_weights_df = dataframe

        return self

    def term_frequency_inverse_document_frequency(self):

        dataframe = pd.merge(self.term_weights_df, self.term_occurrences_df, how='left', left_on='term', right_on='term')

        self.term_frequency_inverse_document_frequency_df = dataframe

        return self

    def documents_in(self):

        dataframe = self.term_frequency_inverse_document_frequency_df

        dataframe['documents_in'] = dataframe.term.map(lambda x: self.documents_in_delimeter.join(list(self.documents_df.loc[self.documents_df['text'].str.contains(fr'\b{x}\b', flags=re.IGNORECASE, regex=True)]['source'])))

        self.dataframe = dataframe

        return self

    def sentences_in(self):

        dataframe = self.dataframe

        dataframe['sentences_in'] = dataframe.term.map(lambda x: self.sentences_in_delimeter.join(list(self.sentences_df.loc[self.sentences_df['text'].str.contains(fr'\b{x}\b', flags=re.IGNORECASE, regex=True)]['text'])[:4]))

        self.dataframe = dataframe

        return self

    def run(self):

        self.documents()\
            .sentences()\
            .term_occurrences()\
            .term_weights()\
            .term_frequency_inverse_document_frequency()\
            .documents_in()\
            .sentences_in()

        return self

In [5]:
configs = {
    "corpus_base_file_path" : "/data/text",
    "corpus_file_extenions" : ('.txt')
}

tf_idf = TF_IDF(**configs)

In [6]:
dataframe_csv_file_path = os.path.join(tf_idf.corpus_base_file_path, 'dataframe.csv')

if not os.path.exists(dataframe_csv_file_path):

    tf_idf.run()

    dataframe = tf_idf.dataframe

    dataframe.to_csv(dataframe_csv_file_path)

else:

    dataframe = pd.read_csv(dataframe_csv_file_path, index_col=0)

dataframe['sentences_in'] = dataframe.sentences_in.apply(lambda x: str(x).replace("\n","<br>"))

dataframe['term'] = dataframe["term"].map(str) + "(" + dataframe["occurrences"].map(str) + ")"

html = dataframe.dropna().sort_values(by='occurrences', ascending=False).head(500).drop(['weight','occurrences'], axis=1).to_html(escape=False, index=False)


In [7]:
display(HTML(html))

term,documents_in,sentences_in
going(9),"doc5.txt, doc3.txt, doc2.txt","For only through this phased redeployment can we send a clear message to the Iraqi factions that the U.S. is not going to hold together this country indefinitely ? And so going forward, I believe there are strategic lessons to be learned from this as we continue to confront the new threats of this new century. Who said we're going to keep on dreaming, and we're going to keep on building, and we're going to keep on marching, and we're going to keep on working because that's who we are. I've had enough of our kids going to schools where the rats outnumber the computers."
common(8),"doc6.txt, doc4.txt, doc1.txt, doc2.txt","""Let us not be blind to our differences--but let us also direct attention to our common interests and to the means by which those differences can be resolved...For in the final analysis, our most basic common link is that we all inhabit this small planet. In the end, if the people cannot trust their government to do the job for which it exists - to protect them and to promote their common welfare - all else is lost. And that is why, in the shadow of the Old State Capitol, where Lincoln once called on a divided house to stand together, where common hopes and common dreams still, I stand before you today to announce my candidacy for President of the United States. This campaign has to be about reclaiming the meaning of citizenship, restoring our sense of common purpose, and realizing that few obstacles can withstand the power of millions of voices calling for change."
tells(7),"doc1.txt, doc3.txt","But the life of a tall, gangly, self-made Springfield lawyer tells us that a different future is possible. He tells us that there is power in words. He tells us that there is power in conviction. He tells us that there is power in hope."
lost(7),"doc4.txt, doc1.txt, doc2.txt","In the end, if the people cannot trust their government to do the job for which it exists - to protect them and to promote their common welfare - all else is lost. And you will determine the direction of this country in the 21st century - whether the hard work of the many is lost to the selfish desires of a few, or whether you build an open, honest, stronger Kenya where everyone rises together. Today we grieve for the families who have lost loved ones, the hearts that have been broken, and the young lives that could have been. More of you have lost your homes and even more are watching your home values plummet."
dream(7),"doc3.txt, doc2.txt","These are Americans who still dream big dreams -they just sense their leaders have forgotten how. Because we've always fought to bring all of our people under the blanket of the American Dream. In a century just six years old, our faith has been shaken by war and terror, disaster and despair, threats to the middle-class dream, and scandal and corruption in our government. I've often thought about Mireya and her simple dream and all those before her who've shared that dream too."
woman(7),"doc3.txt, doc2.txt","And in the midst of this rally, someone comes up to me and says that there's a woman who'd like to come meet you, and she's traveled a long way and she wants to take a picture and shake your hand. And all of this would have been unremarkable except for the fact that this woman, Marguerite Lewis, was born in 1899 and was 105 years old. And ever since I met this frail, one-hundred-and-five-year-old African-American woman who had found the strength to leave her house and come to a rally because she believed that her voice mattered, I've thought about all she's seen in her life. Four years ago, I stood before you and told you my story - of the brief union between a young man from Kenya and a young woman from Kansas who weren't well-off or well-known, but shared a belief that in America, their son could achieve whatever he put his mind to."
hear(7),"doc3.txt, doc2.txt","It's a time where you can go to any town hall or street corner or coffee shop and hear people express the same anxiety about the future; hear them convey the same uncertainty about the direction we're headed as a country. Whether it's the war or Katrina or their health care or their jobs, you hear people say that we've finally arrived at a moment where something must change. You know, you probably never thought you'd hear this at a Take Back America conference, but Newt Gingrich made a great point a few weeks ago. And next week, we'll also hear about those occasions when he's broken with his party as evidence that he can deliver the change that we need."
judgment(7),"doc5.txt, doc2.txt","They are the soul-trying times our forbearers spoke of, when the ease of complacency and self-interest must give way to the more difficult task of rendering judgment on what is best for the nation and for posterity, and then acting on that judgment ? In each case, what has been required to meet the challenges we face has been good judgment and clear vision from our leaders, and a fundamental seriousness and engagement on the part of the American people ? Senator McCain likes to talk about judgment, but really, what does it say about your judgment when you think George Bush has been right more than ninety percent of the time? If John McCain wants to have a debate about who has the temperament, and judgment, to serve as the next Commander-in-Chief, that's a debate I'm ready to have."
critical(7),"doc6.txt, doc5.txt, doc4.txt","I want to turn to a second critical area: biological weapons threat reduction programs. Rather, the question is what strategies, imperfect though they may be, are most likely to achieve the best outcome in Iraq, one that will ultimately put us on a more effective course to deal with international terrorism, nuclear proliferation, and other critical threats to our security. By redeploying from Iraq to Afghanistan, we will answer NATO’s call for more troops and provide a much-needed boost to this critical fight against terrorism. As a phased redeployment is executed, the majority of the U.S. troops remaining in Iraq should be dedicated to the critical, but less visible roles, of protecting logistics supply points, critical infrastructure, and American enclaves like the Green Zone, as well as acting as a rapid reaction force to respond to emergencies and go after terrorists."
law(7),"doc6.txt, doc4.txt, doc1.txt, doc3.txt","Additionally, in the last few years, we've seen some disturbing trends from Russia itself - the deterioration of democracy and the rule of law, the abuses that have taken place in Chechnya, Russian meddling in the former Soviet Union - that raise serious questions about our relationship. I had just finished three years of work as a community organizer in low-income neighborhoods of Chicago, and was about to enroll in law school. After three years of this work, I went to law school, because I wanted to understand how the law should work for those in need. I became a civil rights lawyer, and taught constitutional law, and after a time, I came to understand that our cherished rights of liberty and equality depend on the active participation of an awakened electorate."
