In [1]:
import os
import pandas as pd
import numpy as np
import re

from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
class TF_IDF:

    corpus_file_paths = None
    corpus_base_file_path = None

    maximum_features = 10000
    minimum_document_frequency = 1
    maximum_document_frequency = .5
    ngram_range = (1,2)

    documents_df = None
    sentences_df = None
    term_occurrences_df = None
    term_weights_df = None
    term_frequency_inverse_document_frequency_df = None

    def __init__(self,
                 corpus_base_file_path,
                 corpus_file_extenions
                 ):

        self.corpus_base_file_path = corpus_base_file_path
        self.corpus_file_extenions = corpus_file_extenions
        self.set_corpus_file_paths_list()

    def set_corpus_file_paths_list(self):

        files = os.listdir(self.corpus_base_file_path)
        paths = []
        for file in files:
            if file.endswith(self.corpus_file_extenions):
                paths.append(file)

        self.corpus_file_paths = files

        return files

    def documents(self, encoding="utf-8"):

        documents = []
        for filename in self.corpus_file_paths:
            with open(os.path.join(self.corpus_base_file_path, filename), 'r', encoding=encoding) as file:
                soup = BeautifulSoup(file.read())
                documents.append({'source': filename, 'text': soup.get_text()})

        dataframe = pd.DataFrame(documents)
        self.documents_df = dataframe

        return self


    def sentences(self, encoding="utf-8"):

        sentences = []
        for filename in self.corpus_file_paths:
            with open(os.path.join(self.corpus_base_file_path, filename), 'r', encoding=encoding) as file:
                soup = BeautifulSoup(file.read())
                for i in sent_tokenize(soup.get_text()):
                    sentences.append({'text': i})

        dataframe = pd.DataFrame(sentences)

        self.sentences_df = dataframe

        return self

    def term_occurrences(self):

        count_vectorizer = CountVectorizer(min_df=self.minimum_document_frequency, max_df=self.maximum_document_frequency, stop_words='english', ngram_range=self.ngram_range)
        count_vectorizer.fit(self.sentences_df.text)
        count_vectorizer_occurrences = count_vectorizer.transform(self.sentences_df.text)
        dataframe = pd.DataFrame({'term': count_vectorizer.get_feature_names(), 'occurrences': np.asarray(count_vectorizer_occurrences.sum(axis=0)).ravel().tolist()})
        self.term_occurrences_df = dataframe

        return self

    def term_weights(self):

        tfidf_vectorizer = TfidfVectorizer(min_df=self.minimum_document_frequency, max_df=self.maximum_document_frequency, stop_words='english', ngram_range=self.ngram_range, max_features=self.maximum_features)
        tfidf_vectorizer_weights = tfidf_vectorizer.fit_transform(self.sentences_df.text.dropna())

        dataframe = pd.DataFrame({'term': tfidf_vectorizer.get_feature_names(), 'weight': np.asarray(tfidf_vectorizer_weights.mean(axis=0)).ravel().tolist()})

        self.term_weights_df = dataframe

        return self

    def term_frequency_inverse_document_frequency(self):
        nl =""" 
        
        """
        dataframe = pd.merge(self.term_weights_df, self.term_occurrences_df, how='left', left_on='term', right_on='term')
        dataframe['documents_in'] = dataframe.term.map(lambda x: ', '.join(list(self.documents_df.loc[self.documents_df['text'].str.contains(x, flags=re.IGNORECASE, regex=True)]['source'])))
        dataframe['sentences_in'] = dataframe['term'].map(lambda x: nl.join(list(self.sentences_df.loc[self.sentences_df['text'].str.contains(x, flags=re.IGNORECASE, regex=True)]['text'])[:4]))

        self.term_frequency_inverse_document_frequency_df = dataframe

        return self

    def run(self):

        self.documents().sentences().term_occurrences().term_weights().term_frequency_inverse_document_frequency()

        return self


In [3]:

    nl =""" """
    configs = {
        "corpus_base_file_path" : "/data/text/",
        "corpus_file_extenions" : ('.txt')
    }

    tf_idf = TF_IDF(**configs)

In [4]:
tf_idf.run()

<__main__.TF_IDF at 0x7fd44d62dd10>

In [5]:
dataframe = tf_idf.term_frequency_inverse_document_frequency_df
dataframe.sort_values(by='weight', ascending=False).head(30)

Unnamed: 0,term,weight,occurrences,documents_in,sentences_in
4672,know,0.011119,52,"doc6.txt, doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","As some of you know, Senator Lugar and I recently traveled to Russia, Ukraine, and Azerbaijan to witness firsthand both the progress we're making in securing the world's most dangerous weapons, as well as the serious challenges that lie ahead. \n \n At its height in the late 1980's, this program stockpiled of some of the most dangerous agents known to man - plague, smallpox, and anthrax - to name just a few. \n \n We know these countries want us to fail, and we should remain steadfast in our opposition to their support of terrorism and Iran’s nuclear ambitions. \n \n But we should know that our success in doing so is enhanced by engaging our allies so that we receive the crucial diplomatic, military, intelligence, and financial support that can lighten our load and add legitimacy to our actions."
9412,ve,0.011032,58,"doc6.txt, doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","As some of you know, Senator Lugar and I recently traveled to Russia, Ukraine, and Azerbaijan to witness firsthand both the progress we're making in securing the world's most dangerous weapons, as well as the serious challenges that lie ahead. \n \n Now, few people understand these challenges better than the co-founder of the Cooperative Threat Reduction Program, Dick Lugar, and this is something that became particularly clear to me during one incident on the trip. \n \n We entered through no fences or discernible security, and once we did, we found ourselves in a building with open first-floor windows and padlocks that many of us would not use to secure our own luggage. \n \n Of course, Dick has been there and he has done that, and thanks to the Cooperative Threat Reduction Programs he co-founded with Senator Sam Nunn, we've made amazing progress in finding, securing, and guarding some of the deadliest weapons that were left scattered throughout the former Soviet Union after the Cold War."
374,america,0.011023,54,"doc6.txt, doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","But, when I think about what is at stake I am reminded by a quote from the late President Kennedy given in a speech at American University in 1963 about threats posed by the Soviet Union. \n \n Throughout American history, there have been moments that call on us to meet the challenges of an uncertain world, and pay whatever price is required to secure our freedom. \n \n In each case, what has been required to meet the challenges we face has been good judgment and clear vision from our leaders, and a fundamental seriousness and engagement on the part of the American people ? \n \n A few Tuesdays ago, the American people embraced this seriousness with regards to America’s policy in Iraq."
8883,time,0.011013,60,"doc6.txt, doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","But this is one story that shows our job is far from finished at a time when demand for these weapons has never been greater. \n \n Today, experts tell us that we're in a race against time to prevent this scenario from unfolding. \n \n But we've all seen how it could take far less time for these weapons to leak out and travel around the world, fueling insurgencies and violent conflicts from Africa to Afghanistan. \n \n Time and time again on the trip, I saw their skill and experience when negotiating with the Russians."
4898,let,0.010482,49,"doc6.txt, doc5.txt, doc1.txt, doc3.txt, doc2.txt","Here in Washington, we saw what happened when just two letters filled with just a few grams of Anthrax were sent to the U.S. Senate. \n \n This was two letters. \n \n My third recommendation - which I'll just touch briefly on and let Senator Lugar talk about in more detail - is that we need to start thinking creatively about some of the next-generation efforts on nuclear, biological, and chemical weapons. \n \n ""Let us not be blind to our differences--but let us also direct attention to our common interests and to the means by which those differences can be resolved...For in the final analysis, our most basic common link is that we all inhabit this small planet."
9830,work,0.010406,45,"doc6.txt, doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","Additional steps should also be taken to consolidate and secure dangerous pathogen collections, strengthen bio-reconnaissance networks to provide early warning of bio-attack and natural disease outbreaks, and have our experts work together to develop improved medical countermeasures. \n \n For any of these efforts that I've mentioned to work as we move forward, we must also think critically and strategically about Washington's relationship with Moscow. \n \n It's important for senior officials to go and visit these sites, to check their progress and shortcomings; to see what's working and what's not. \n \n And we need to work together to obtain a bilateral agreement on biological threat reduction."
1851,country,0.010026,60,"doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","That policy-by-slogan will no longer pass as an acceptable form of debate in this country. \n \n These are serious times for our country, and with their votes two weeks ago, Americans demanded a feasible strategy with defined goals in Iraq ? \n \n A year that is ending with an attempt by the bipartisan Iraq Study Group to determine what can be done about a country that is quickly spiraling out of control. \n \n To reach such a solution, we must communicate clearly and effectively to the factions in Iraq that the days of asking, urging, and waiting for them to take control of their own country are coming to an end."
6276,people,0.009347,68,"doc6.txt, doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","Now, few people understand these challenges better than the co-founder of the Cooperative Threat Reduction Program, Dick Lugar, and this is something that became particularly clear to me during one incident on the trip. \n \n Outstanding career officials who run the Nunn-Lugar program -- people like Col. Jim Reid and Andy Weber who are here this morning -- will be there every step of the way to ensure that U.S. interests are protected. \n \n In each case, what has been required to meet the challenges we face has been good judgment and clear vision from our leaders, and a fundamental seriousness and engagement on the part of the American people ? \n \n A few Tuesdays ago, the American people embraced this seriousness with regards to America’s policy in Iraq."
4335,iraq,0.008319,64,"doc5.txt, doc1.txt, doc3.txt, doc2.txt","A few Tuesdays ago, the American people embraced this seriousness with regards to America’s policy in Iraq. \n \n Iraq is descending into chaos based on ethnic divisions that were around long before American troops arrived. \n \n And a report by our own intelligence agencies has concluded that al Qaeda is successfully using the war in Iraq to recruit a new generation of terrorists for its war on America. \n \n These are serious times for our country, and with their votes two weeks ago, Americans demanded a feasible strategy with defined goals in Iraq ?"
3595,government,0.00811,46,"doc6.txt, doc5.txt, doc4.txt, doc1.txt, doc3.txt, doc2.txt","The answers to these questions will require sustained involvement by the Executive Branch, Congress, non-governmental organizations, and the international community. \n \n And the Kiev story is heading in the right direction - while we were in Ukraine, Dick, through his tireless and personal intervention, was able to achieve a breakthrough with that government, bringing that facility and others under the Cooperative Threat Reduction program. \n \n While the government of Ukraine is making progress here, the limited funding they have means that at the current pace, it will take sixty years to dismantle these weapons. \n \n Dreams of democracy and hopes for a perfect government are now just that ?"
