### Text Preprocessing  

#### 1.1 Reading the files

We begin by reading all the articles, both training and testing, into an appropriate data structure.

In [None]:
import re
#Load the whole combined corpus into memory, lower-cased before any matching is done
with open("training_testing_combined.txt",'r',encoding = 'utf8') as fn:
    content = fn.read().lower()

#Each article body sits between "id <name> text" and "eod"; re.S lets '.' span newlines
#NOTE(review): "eod" is matched without a word boundary, so a body containing that
#letter sequence inside a word would presumably be cut short - confirm against the file format
article_body_pattern = re.compile(r'id t[re]_doc_[0-9]+[\s]text(.*?)eod', flags = re.S)
#The article name is the "tr_doc_<n>" / "te_doc_<n>" identifier that follows "id "
article_name_pattern = re.compile(r'id (t[re]_doc_[0-9]+)', flags = re.S)

articles = article_body_pattern.findall(content)
article_names = article_name_pattern.findall(content)

#### 1.2 Finding Bigrams and generating tokens

We now find the 100 most meaningful bigrams using the nltk library. These bigrams are then used to tokenise our documents.

In [None]:
from nltk.tokenize import RegexpTokenizer
from itertools import chain

#Unigram tokenizer: a run of word characters, optionally followed by a single
#'-' or '_' and more word characters (keeps hyphenated words as one token)
tokenizer = RegexpTokenizer(r"\w+(?:[-_]\w*)?")

#Tokenise every article into its own list of unigram tokens
unigram_articles = [tokenizer.tokenize(article) for article in articles]

#Flatten the per-article token lists into one list used to find meaningful bigrams.
#This must read unigram_articles: tokenized_articles is only built in a later cell
#(and itself depends on the bigrams found from this list), so using it here fails
#with a NameError on a fresh kernel.
words = list(chain.from_iterable(unigram_articles))

In [None]:
import nltk.collocations
#Removing purely numeric tokens and single-character tokens from the words list prior to finding bigrams
words = [word for word in words if not word.isdigit() and len(word) > 1]
#Finding possible bigrams
finder = nltk.collocations.BigramCollocationFinder.from_words(words)

#Reading the stop words (one per line) and stripping trailing whitespace/newlines.
#The list is built from the lines read here; stopword_content is only defined in a
#later cell, so referring to it at this point fails on a fresh kernel.
with open('stopwords_en.txt','r') as stopwords:
    stopword = [line.rstrip() for line in stopwords]

#Set copy of the stop words so the per-word filter below is a constant-time lookup
stopword_lookup = set(stopword)

#Applying filters to bigrams to get rid of any stop words in collocation and bigrams
#occurring fewer than 3 times, as they carry little information
finder.apply_word_filter(lambda word: word in stopword_lookup)
finder.apply_freq_filter(3)

#Ranking the remaining bigrams by likelihood ratio and keeping the best 100 collocations (meaningful bigrams)
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_vocab = list(finder.nbest(bigram_measures.likelihood_ratio, 100))
print("100 Most commonly appearing bigrams\n",bigram_vocab)

In [None]:
#bigram_vocab = [('south', 'wales'), ('prime', 'minister'), ('united', 'states'), ('chief', 'executive'), ('federal', 'government'), ('world', 'number'), ('years', 'ago'), ('abc', 'news'), ('opposition', 'leader'), ('state', 'government'), ('australian', 'open'), ('north', 'queensland'), ('western', 'australia'), ('south', 'australia'), ('fire', 'service'), ('minister', 'john'), ('south', 'australian'), ('local', 'time'), ('weeks', 'ago'), ('friday', 'night'), ('saturday', 'night'), ('central', 'coast'), ('western', 'australian'), ('health', 'minister'), ('news', 'abc'), ('home', 'side'), ('months', 'ago'), ('city', 'council'), ('central', 'queensland'), ('federal', 'opposition'), ('long', 'time'), ('country', 'fire'), ('north', 'coast'), ('local', 'government'), ('western', "australia's"), ('south', 'coast'), ('recent', 'years'), ('australian', 'government'), ('told', 'abc'), ('world', 'record'), ('south', "australia's"), ('wales', 'government'), ('high', 'court'), ('killed', 'people'), ('health', 'department'), ('good', 'news'), ('federal', 'police'), ('lost', 'control'), ('federal', 'court'), ('match', 'point'), ('health', 'services'), ('public', 'health'), ('melbourne', 'victory'), ('chief', 'minister'), ('days', 'ago'), ('minister', 'peter'), ('million', 'people'), ('yesterday', 'morning'), ('queensland', 'premier'), ('recent', 'weeks'), ('coach', 'john'), ('queensland', 'government'), ('queensland', 'health'), ('west', 'coast'), ('western', 'sydney'), ('south', 'sydney'), ('services', 'minister'), ('hard', 'work'), ('west', 'australian'), ('centre', 'court'), ('sunday', 'morning'), ('health', 'service'), ('premier', 'peter'), ('man', 'died'), ('saturday', 'morning'), ('wales', 'premier'), ('hit', 'back'), ('year', 'ago'), ('coming', 'back'), ('open', 'final'), ('central', 'west'), ('earlier', 'today'), ('put', 'forward'), ('taking', 'part'), ('court', 'today'), ('australian', 'federal'), ('sunday', 'night'), ('working', 'hard'), ('people', 'died'), ('recent', 
'months'), ('premier', 'john'), ('police', 'chief'), ('early', 'hours'), ('late', 'yesterday'), ('coming', 'days'), ('people', 'including'), ('week', 'ago'), ('make', 'decision'), ('past', 'year'), ('past', 'years')]

We now use a multi-word expression tokeniser to tokenise the documents into unigrams and bigrams.

#### 1.3 Finding unwanted tokens

We now count the document frequency of each token to determine whether it will be considered for our vocab.

In [None]:
import nltk
#token -> number of articles in which the token appears at least once
doc_freq = {}

#Multi-word tokenizer that merges each bigram in bigram_vocab into a single "first_second" token
tokenizer = nltk.tokenize.mwe.MWETokenizer(bigram_vocab, separator = '_')

tokenized_articles = []
for unigram_tokens in unigram_articles:
    #re-tokenise the unigram list so that known bigrams become single tokens
    merged_tokens = tokenizer.tokenize(unigram_tokens)
    tokenized_articles.append(merged_tokens)

    #each distinct token counts once per article, however often it occurs in it
    for token in set(merged_tokens):
        doc_freq[token] = doc_freq.get(token, 0) + 1

In [None]:
#Collecting three groups of tokens: those appearing in more than 95% of the documents,
#those appearing in fewer than 3000 documents, and those shorter than 3 characters
no_of_docs = len(articles)

#tokens whose document frequency exceeds 95% of the number of documents
tokens_95_threshold = [
    token for token, count in doc_freq.items()
    if count > (.95 * no_of_docs)
]

#tokens with document frequency below 3000 that were not already caught by the 95% rule
tokens_rare_token = [
    token for token, count in doc_freq.items()
    if count <= (.95 * no_of_docs) and count < 3000
]

#tokens of length less than 3
token_len_3 = [token for token in doc_freq if len(token) < 3]

In [None]:
#Reading the English stop words, one per line, with trailing whitespace/newlines stripped
with open('stopwords_en.txt','r') as stopwords:
    stopword_content = [line.rstrip() for line in stopwords]

All tokens that occur in more than 95% of the documents, are shorter than 3 characters, or have a document frequency of less than 3000 are considered unwanted, and we remove them from our vocab/feature set.

In [None]:
#Appending every group of unwanted tokens to the stop-word list, in the order
#rare tokens, short tokens, then tokens above the 95% threshold
for unwanted_group in (tokens_rare_token, token_len_3, tokens_95_threshold):
    stopword_content.extend(unwanted_group)

#a set of all the unrequired tokens, for fast membership checks
stopword_content_set = set(stopword_content)

#### 1.4 tagging, lemmatizing and stemming

In order to get a better vocab/feature set we first tag all the tokens with their part of speech. We then remove the stop words and lemmatise the remaining tokens, which reduces each word to its lexical base form. Finally we stem the lemmatised words; stemming helps in reducing similar words to a common root form.

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a part-of-speech tag string to a WordNet part-of-speech constant.

    Only the start of the tag is inspected: 'J' maps to adjective, 'V' to verb,
    'N' to noun and 'R' to adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    #unrecognised tags are treated as nouns
    return wordnet.NOUN
    
lemmatizer = WordNetLemmatizer()
lemmatized_articles = []

for article in tokenized_articles:
    lemmat_article = []
    #tagging is done on the full article, before any tokens are dropped
    for token, tag in nltk.tag.pos_tag(article):
        #skip stop words and the other unwanted tokens
        if token in stopword_content_set:
            continue
        #lemmatise using the WordNet part of speech derived from the tag
        lemmat_article.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    lemmatized_articles.append(lemmat_article)

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

#Stem every lemmatised token, keeping one token list per article
stemmed_articles = [
    [stemmer.stem(token) for token in article]
    for article in lemmatized_articles
]

#### 1.5 Vectorising and generation of the TF-IDF matrix

After obtaining stemmed tokens and removing unwanted tokens we now proceed to generate a TF-IDF matrix for all the tokens and documents.

In [None]:
#Re-joining each article's stemmed tokens into one space-separated string for vectorization
text_articles = list(map(' '.join, stemmed_articles))

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

#Initialising vectorizer; the token pattern is the same one used for the unigram
#tokenizer so that "first_second" bigram tokens stay intact
vectorizer = TfidfVectorizer(input = 'content',analyzer = 'word', token_pattern = r"\w+(?:[-_]\w*)?" )

#Obtaining tf-idf weights for each term in the articles as a dense matrix
article_vector = vectorizer.fit_transform(text_articles).todense()

#Column names for the dataframe. get_feature_names() was removed in scikit-learn 1.2
#in favour of get_feature_names_out(); prefer the new method and fall back to the
#old one so the cell runs on both old and new releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

#obtaining the tf-idf weights of each article in a dataframe (one row per article, one column per term)
articles_transformed = pd.DataFrame(article_vector, columns=feature_names)

In [None]:
#Preview the first rows of the vectorised articles as a quick sanity check
articles_transformed.head()

#### 1.6 Generating csv with article labels

We now convert the generated TF-IDF matrix into csv files. We split the data into training and test sets and add the labels to the training data.

In [None]:
#Adding article names to the vectorised article dataframe
#article_names (the list of names extracted from the corpus) is rebound to a pandas Series here
article_names = pd.Series(article_names)
#.values passes the raw array, so names are assigned by row position:
#row i of the dataframe receives the i-th extracted article name
articles_transformed["article_names"] = article_names.values

In [None]:
#Splitting data into training and test data
#The split is positional: the first N_TRAINING_DOCS rows are taken as training
#documents and every remaining row as test documents.
N_TRAINING_DOCS = 106445

#.copy() gives training_data its own storage. A label column is assigned to it
#afterwards, and assigning into a bare slice of articles_transformed triggers
#pandas' SettingWithCopyWarning.
training_data = articles_transformed.iloc[:N_TRAINING_DOCS].copy()
test_data = articles_transformed.iloc[N_TRAINING_DOCS:]

In [None]:
#Obtaining article lables: each line holds an article name followed by a label starting with 'C'
article_lable = {}
#'with' guarantees the file is closed even if parsing a line raises
with open("training_labels_final.txt",'r') as lable_file:
    for line in lable_file:
        match = re.match(r'([\w_]+)\s([C]\w+)',line)
        if match is None:
            #blank lines (e.g. a trailing newline at the end of the file) are skipped;
            #any other unparseable line is reported with its content instead of
            #failing with an opaque AttributeError on None
            if line.strip():
                raise ValueError("Unrecognised line in label file: %r" % line)
            continue
        article_lable[match.group(1)] = match.group(2)

#function to map lables
def lable_map(article_name):
    """Return the label recorded for article_name.

    Raises KeyError when the article name has no entry in article_lable.
    """
    #article_lable is only read here, so no 'global' declaration is needed
    return article_lable[article_name]

#Updating tranformed articles data frame with article lables
training_data["article_lable"] = training_data["article_names"].apply(lable_map)


In [None]:
#Writing the training and test dataframes to csv (without the index column) for further analysis
for frame, csv_path in ((training_data, 'training_data1.csv'), (test_data, 'test_data1.csv')):
    frame.to_csv(csv_path, sep=',', encoding='utf-8', index = False)