# Import libraries

In [1]:
import os
import re
import numpy as np
from nltk.stem.porter import PorterStemmer
from collections import defaultdict

## Step 1: Collect data and write data to file

In [2]:
# Root folder of the extracted 20 Newsgroups "bydate" dataset (trailing slash included).
path = "C:/Users/HH/OneDrive - Hanoi University of Science and Technology/ML basics/Session1/20news-bydate/"

# Names of the train / test split sub-directories.
train_path = "20news-bydate-train"
test_path = "20news-bydate-test"

# Full paths of the two split directories, built from the root and the split names.
train_dir = path + train_path
test_dir = path + test_path

def gather_20newsgroup_data():
    """Preprocess the 20 Newsgroups corpus and write it to text files.

    Every document found under the module-level ``train_dir`` and ``test_dir``
    is lower-cased, split on non-word characters, filtered against the stop
    word list in ``20news-bydate/stop_words.txt`` and stemmed with the Porter
    stemmer. Each document becomes one line of the form
    ``<group_id><fff><filename><fff><space separated stems>``.

    Three files are written under ``20news-bydate/``:
    ``20news-train-processed.txt``, ``20news-test-processed.txt`` and
    ``20news-full-processed.txt`` (train lines followed by test lines).
    """
    # Sorted so that a newsgroup's position in the list is a stable label id.
    newsgroup_list = sorted(os.listdir(train_dir))

    with open("20news-bydate/stop_words.txt") as file:
        # A set makes the per-token membership test O(1) instead of O(n).
        stop_words = set(file.read().splitlines())
    ps = PorterStemmer()
    # Raw string: "\W" in a normal literal is an invalid escape sequence.
    word_splitter = re.compile(r"\W+")

    def collect_data_from(parent_dir, newsgroup_list):
        """Return one processed line per document found under parent_dir."""
        data = []
        for group_id, newsgroup in enumerate(newsgroup_list):
            group_dir = os.path.join(parent_dir, newsgroup)
            for filename in os.listdir(group_dir):
                # Open the file from the same directory that was listed, so the
                # result does not depend on the current working directory.
                with open(os.path.join(group_dir, filename)) as f:
                    text = f.read().lower()
                # re.split yields empty strings at the text boundaries; skip
                # them so they are not stemmed and written out as blanks.
                stems = [
                    ps.stem(word)
                    for word in word_splitter.split(text)
                    if word and word not in stop_words
                ]
                data.append(
                    str(group_id) + "<fff>" + filename + "<fff>" + " ".join(stems)
                )
        return data

    train_data = collect_data_from(train_dir, newsgroup_list)
    test_data = collect_data_from(test_dir, newsgroup_list)
    full_data = train_data + test_data

    with open("20news-bydate/20news-train-processed.txt", "w") as f:
        f.write('\n'.join(train_data))
    with open("20news-bydate/20news-test-processed.txt", "w") as f:
        f.write('\n'.join(test_data))
    with open("20news-bydate/20news-full-processed.txt", "w") as f:
        f.write('\n'.join(full_data))

In [None]:
# Run the preprocessing; this writes the train, test and full processed files under 20news-bydate/.
gather_20newsgroup_data()

### Basic knowledge about TF-IDF

In TF-IDF, the higher the weight of a term, the rarer (more distinctive) the term is; the smaller the weight, the more common the term is.

$\star$ The TF (Term Frequency) of a word measures how often the word appears in a document, normalized by the count of the most frequent word in that document. The TF of a word is given by the formula:  
   $tf(t, d) = \dfrac{f(t, d)}{\max\left\{f(t', d) \mid t' \in d\right\}}$, where $f(t, d)$ is the number of times the term $t$ appears in the document $d$.
   
$\star$ The IDF (Inverse Document Frequency) of a word measures how significant that term is in the whole corpus (a body of documents). The IDF of a word is given by the formula:
   $idf(t, D) = \log_{10}\left(\dfrac{N}{N(t)}\right)$, where $N$ is the number of documents in the corpus $D$ and $N(t)$ is the number of documents in the corpus in which the term $t$ appears.

So, we have $\text{tf-idf}(t, d, D) = tf(t, d) \cdot idf(t, D)$.

## Step 2: Generate dictionary and compute the $idf$ value

In [9]:
def generate_dictionary(data_path, output_path="20news-bydate/words_idf.txt"):
    """Build the vocabulary of a processed corpus and write each word's idf.

    Args:
        data_path: Path of a processed corpus file with one document per line
            in the form ``<label><fff><filename><fff><space separated words>``.
        output_path: File that receives one ``<word><fff><idf>`` line per
            distinct word. Defaults to the location read by ``data_tf_idf``.

    The idf of a word is ``log10(N / df)``, where ``N`` is the number of lines
    in the corpus file and ``df`` is the number of lines containing the word.
    Words are written in sorted order so that the file, and any word index
    derived from its line numbers, is identical from run to run.
    """
    def compute_idf(df, corpus_size):
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if df <= 0:
            raise ValueError("document frequency must be positive")
        return np.log10(corpus_size * 1. / df)

    with open(data_path) as f:
        lines = f.read().splitlines()

    # doc_count maps each distinct word to the number of documents containing it.
    doc_count = defaultdict(int)
    corpus_size = len(lines)

    for line in lines:
        text = line.split("<fff>")[-1]
        # A set so that a word is counted once per document.
        for word in set(text.split()):
            doc_count[word] += 1

    words_idf = [
        word + "<fff>" + str(compute_idf(doc_count[word], corpus_size))
        for word in sorted(doc_count)
    ]
    with open(output_path, "w") as file:
        file.write('\n'.join(words_idf))

In [None]:
# Compute idf values over the full (train + test) processed corpus; writes 20news-bydate/words_idf.txt.
generate_dictionary("20news-bydate/20news-full-processed.txt")

In [25]:
def data_tf_idf(datapath, idf_path="20news-bydate/words_idf.txt",
                output_path="20news-bydate/data_tf_idf.txt"):
    """Convert a processed corpus into sparse tf-idf lines and write them out.

    Args:
        datapath: Processed corpus file, one document per line in the form
            ``<label><fff><filename><fff><space separated words>``.
        idf_path: File of ``<word><fff><idf>`` lines; a word's index is its
            zero-based line number in this file.
        output_path: File that receives one line per document in the form
            ``<label><fff><filename><fff><index>: <tf-idf> <index>: <tf-idf> ...``.

    The term frequency of a word is its count in the document divided by the
    count of the document's most frequent word; tf-idf is that value times the
    word's idf. Entries on a line are ordered by word index.

    Raises:
        KeyError: If a document contains a word that is missing from idf_path.
        IndexError: If a line does not contain the expected ``<fff>`` fields.
    """
    from collections import Counter

    # word -> (idf of that word, index of the word in the idf file)
    words_idf = {}
    with open(idf_path) as file:
        for index, line in enumerate(file.read().splitlines()):
            fields = line.split("<fff>")
            words_idf[fields[0]] = (float(fields[1]), index)

    with open(datapath) as file:
        lines = file.read().splitlines()

    rows = []
    for line in lines:
        fields = line.split("<fff>")
        label, filename, text = fields[0], fields[1], fields[2]

        # Count every word in one pass instead of calling list.count per word.
        frequencies = Counter(text.split())
        # default=0 covers documents with no words; no division happens then.
        max_frequency = max(frequencies.values(), default=0)

        # Sorted by word index so the output does not depend on hash ordering.
        entries = sorted(
            (words_idf[word][1], count * 1. / max_frequency * words_idf[word][0])
            for word, count in frequencies.items()
        )
        row = label + "<fff>" + filename + "<fff>" + "".join(
            str(index) + ": " + str(value) + " " for index, value in entries
        )
        rows.append(row)

    with open(output_path, "w") as file:
        file.write('\n'.join(rows))
    

In [27]:
# Compute tf-idf for the full processed corpus; reads 20news-bydate/words_idf.txt and writes 20news-bydate/data_tf_idf.txt.
data_tf_idf("20news-bydate/20news-full-processed.txt")