## Text Classification Using Naive Bayes
### Your task is to:
    1. Perform Text Classification using Multinomial Naive Bayes (already implemented in sklearn).
    2. Implement Naive Bayes on your own from scratch for text classification. 
    3. Compare the results of your implementation of Naive Bayes with the one in sklearn.
#### Dataset - 
    http://archive.ics.uci.edu/ml/datasets/Twenty+Newsgroups
#### Comments : 
    Your code must have proper comments for better understanding.
#### Score : 
    Score will be given by the TA based on your submission.
#### Submission : 
    You have to upload a zipped file containing the Python notebook with your implementation and the dataset used.
#### Your project will be evaluated on the following parameters -
    1. Correctness of Code - Own Implementation Naive Bayes (Max Score 50)
    2. Comparison (Max Score 10)
    3. Commenting (Max Score 10)
    4. Correctness of Code - Sklearn Naive Bayes (Max Score 30)

In [1]:
# Data Cleaning

In [2]:
import re
# Sample header section of one newsgroup post. It is used only to harvest the
# header field names; the backslash at the end of each line continues the
# string literal, so `s` is a single line of space-separated text.
s = "Xref: \
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!bb3.andrew.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!magnus.acs.ohio-state.edu!usenet.ins.cwru.edu!agate!spool.mu.edu!uunet!pipex!ibmpcug!mantis!mathew \
From: mathew <mathew@mantis.co.uk> \
Newsgroups: alt.atheism,alt.atheism.moderated,news.answers,alt.answers \
Subject: Alt.Atheism FAQ: Atheist Resources \
Message-ID: <19930329115719@mantis.co.uk> \
Date: Mon, 29 Mar 1993  GMT \
Expires: Thu, 29 Apr 1993 11 GMT \
Followup-To: alt.atheism \
Distribution: world \
Organization: Mantis Consultants, Cambridge. UK. \
Approved: news-answers-request@mit.edu \
Supersedes: <19930301143317@mantis.co.uk \
Archive-name: atheism/resources \
Alt-atheism-archive-name: resources \
Last-modified: 11 December 1992 \
Writeto:  FFRF, P.O. Box 750, Madison, WI 53701. \
Summary: Books, addresses, music -- anything related to atheism \
Keywords: FAQ, atheism, books, music, fiction, addresses, contacts \
Version: 1.0 \
Lines: 290 \
Telephone: (608) 256-8900 \
or:  7215 Cameron Road, Austin, TX 78752-2973. \
Telephone: (512) 458-1244 \
Fax:       (512) 467-9525 "

words = s.split(' ')

# A token counts as a header name when a word character is immediately
# followed by a colon. This also picks up tokens such as 'FAQ:' and 'or:' that
# are not real header fields, and keeps duplicates ('Telephone:' twice).
# The pattern is a raw string: "\w" in a plain literal is an invalid escape.
headers_list = [word for word in words if re.search(r"\w:", word) is not None]

# The sample spells this header without a space so that it survives the
# split on ' '; restore the two-word spelling.
headers_list[headers_list.index('Writeto:')] = 'Write to:'
print(headers_list)


['Xref:', 'Path:', 'From:', 'Newsgroups:', 'Subject:', 'FAQ:', 'Message-ID:', 'Date:', 'Expires:', 'Followup-To:', 'Distribution:', 'Organization:', 'Approved:', 'Supersedes:', 'Archive-name:', 'Alt-atheism-archive-name:', 'Last-modified:', 'Write to:', 'Summary:', 'Keywords:', 'Version:', 'Lines:', 'Telephone:', 'or:', 'Telephone:', 'Fax:']


In [7]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_stop_words():
    """Return the union of the NLTK, spaCy and scikit-learn English stop words.

    The result is cached so the three word lists are loaded once rather than
    on every call to ``preprocess_corpus`` (which runs once per document).
    """
    from nltk.corpus import stopwords
    from spacy.lang.en.stop_words import STOP_WORDS
    # `sklearn.feature_extraction.stop_words` was deprecated in scikit-learn
    # 0.22 and later removed; `sklearn.feature_extraction.text` is the public
    # location of the same constant.
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    # A frozenset gives O(1) membership tests in the per-word filter.
    return frozenset().union(
        stopwords.words('english'), ENGLISH_STOP_WORDS, STOP_WORDS
    )


def preprocess_corpus(corpus, headers_list):
    """Clean the lines of one document and return them as a single string.

    For every line:
      1. each ASCII punctuation character is replaced by a space and each
         ASCII digit is deleted;
      2. the line is split on whitespace;
      3. tokens that are not purely alphanumeric (``str.isalnum``) are
         dropped;
      4. the remaining tokens are lower-cased and stop words are removed.

    Lines that end up empty are discarded.

    Args:
        corpus: Iterable of text lines (e.g. the result of ``readlines()``).
        headers_list: Unused. Header filtering is not performed; the
            parameter is kept so existing callers keep working.

    Returns:
        The cleaned lines, each a space-separated run of words, joined
        with ``'.'``. An empty string if nothing survives cleaning.
    """
    import string

    stop_words = _load_stop_words()

    # One translation pass per line: punctuation -> space, digits -> removed.
    # str.maketrans builds the ordinal-keyed table str.translate expects.
    cleanup_table = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation), string.digits
    )

    cleaned_lines = []
    for line in corpus:
        tokens = line.translate(cleanup_table).split()
        # isalnum() is checked before lower-casing, on the token as it
        # appears in the text.
        lowered = (token.lower() for token in tokens if token.isalnum())
        kept = [word for word in lowered if word not in stop_words]
        if kept:
            cleaned_lines.append(' '.join(kept))

    return '.'.join(cleaned_lines)

In [5]:
from pprint import pprint
import os

# Build one {'root': <directory>, 'file': <file name>} record for every file
# found anywhere under the dataset folder, visiting sub-directories before
# their parents (topdown=False).
document_paths = [
    {'root': folder, 'file': name}
    for folder, _subdirs, names in os.walk('.\\20_newsgroups', topdown=False)
    for name in names
]

# Show the first 100 records as a sanity check.
pprint(document_paths[0:100])

[{'file': '49960', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51060', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51119', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51120', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51121', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51122', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51123', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51124', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51125', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51126', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51127', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51128', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51129', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51130', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51131', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51132', 'root': '.\\20_newsgroups\\alt.atheism'},
 {'file': '51133', 'root

In [8]:
corpus = ""
import time

# Not filled in by this cell; it stays an empty list.
new_paths = []

# Clean every collected document and write the result to a parallel tree
# under .\clean_data\, keeping the original file name plus a '.txt' suffix.
i = 0
st = time.time()
for i, doc_path in enumerate(document_paths, start=1):
    path = doc_path['root'] + "\\" + doc_path['file']

    # latin-1 maps every byte to a character, so decoding can never fail and
    # no longer depends on the platform's default encoding.
    with open(path, encoding='latin-1') as doc:
        data = doc.readlines()

    clean_corpus = preprocess_corpus(data, headers_list)

    clean_data_file_root = doc_path['root'].replace('.\\', '.\\clean_data\\')
    os.makedirs(clean_data_file_root, exist_ok=True)
    clean_data_file_path = clean_data_file_root + "\\" + doc_path['file'] + '.txt'

    # The `with` block closes the file; no explicit close() is needed.
    with open(clean_data_file_path, 'wb') as file_clean_data:
        file_clean_data.write(clean_corpus.encode('utf8'))

    # Progress report: time taken by the most recent batch of 500 files.
    if i % 500 == 0:
        print( i, "Files Processed in", time.time() - st, "sec")
        st = time.time()


500 Files Processed in 9.946960926055908 sec
1000 Files Processed in 9.583010911941528 sec
1500 Files Processed in 8.551152467727661 sec
2000 Files Processed in 9.55315637588501 sec
2500 Files Processed in 8.824671268463135 sec
3000 Files Processed in 9.561097621917725 sec
3500 Files Processed in 8.5254647731781 sec
4000 Files Processed in 9.537110567092896 sec
4500 Files Processed in 9.063798427581787 sec
5000 Files Processed in 10.288093090057373 sec
5500 Files Processed in 15.47367548942566 sec
6000 Files Processed in 12.05720829963684 sec
6500 Files Processed in 11.774779081344604 sec
7000 Files Processed in 8.843207836151123 sec
7500 Files Processed in 8.858887195587158 sec
8000 Files Processed in 8.154688119888306 sec
8500 Files Processed in 8.03833818435669 sec
9000 Files Processed in 7.866506099700928 sec
9500 Files Processed in 8.678372144699097 sec
10000 Files Processed in 9.002585649490356 sec
10500 Files Processed in 9.519327878952026 sec
11000 Files Processed in 8.98786640

In [9]:
# Debug print: `new_paths` is created as an empty list in the processing cell
# and nothing appends to it, so this outputs [].
pprint(new_paths)

[]
