# Data 620 Assignment: Document Classification

Jithendra Seneviratne, Sheryl Piechocki 

July 3, 2020

### Import Modules and Libraries for Analysis

In [1]:
import string
import re
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd

### Load Text

We'll be using text from a series of BBC articles found in [Kaggle](https://www.kaggle.com/yufengdev/bbc-fulltext-and-category?select=bbc-text.csv), categorized as one of the following:

* Tech
* Business
* Sport
* Entertainment
* Politics

The corpus consists of 2225 articles.

In [2]:
text_df = pd.read_csv('bbc-text.csv')

In [3]:
text_df.shape

(2225, 2)

In [4]:
text_df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


### Create list of stopwords

In [5]:
stop = stopwords.words('english') + ['mr',
                                     'mrs',
                                     'miss', 
                                     'say',
                                     'have', 
                                     'might',
                                     'thought',
                                     'would', 
                                     'could', 
                                     'make', 
                                     'much',
                                     'dear',
                                     'must',
                                     'know',
                                     'one',
                                     'good',
                                     'every',
                                     'towards',
                                     'give',
                                     'dr',
                                     'none',
                                     'go',
                                     'come',
                                     'upon',
                                     'get',
                                     'see',
                                     'like',
                                     'appear',
                                     'sometimes',
                                     'the',
                                     'and',
                                     'a',
                                     'be',
                                     'i',
                                     'of',
                                     'to',
                                     'have',
                                     'in',
                                     'he',
                                     'that',
                                     'you',
                                     'it',
                                     'his',
                                     'my',
                                     'with',
                                     'for',
                                     'on',
                                     'say',
                                     'but',
                                     'me',
                                     'at',
                                     'we',
                                     'all',
                                     'not',
                                     'this',
                                     'by',
                                     'him',
                                     'one',
                                     'there',
                                     'now',
                                     'man',
                                     'so',
                                     'do',
                                     'out',
                                     'they',
                                     'go',
                                     'well',
                                     'from',
                                     'come',
                                     'if',
                                     'like',
                                     'up',
                                     'see',
                                     'no',
                                     'when',
                                     'put',
                                     'take',
                                     'begin',
                                     'two',
                                     'three',
                                     'u',
                                     'still',
                                     'last',
                                     'never',
                                     'always',
                                     'thing',
                                     'tell']

### Create Clean and Lemmatize Functions 

In [6]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n'
    
def lemmatize_word(word):
    lemmatizer = WordNetLemmatizer()
    try:
        tag = get_wordnet_pos(pos_tag([word])[0][1])
        return lemmatizer.lemmatize(word, pos=tag)
    except:
        pass
    
def clean_doc(doc):

    line= re.sub('[%s]' % re.escape(string.punctuation), '', doc)
    line = re.sub('[^a-zA-Z\ ]', '', line)
    line = line.lower()
    line = line.split()
    line = ' '.join([lemmatize_word(x) for x in line if lemmatize_word(x) not in stop])
    return line

### Apply cleaning and lemmatizing functions to corpus  
We clean the corpus by removing punctuation, lemmatizing the words, and removing stop words.

In [7]:
text_df['cleaned_text'] = text_df['text'].apply(lambda x: clean_doc(x))

In [8]:
text_df.head()

Unnamed: 0,category,text,cleaned_text
0,tech,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...
1,business,worldcom boss left books alone former worldc...,worldcom bos left book alone former worldcom b...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester rush bid a...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership side...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...


### Function to vectorize corpus using TFIDF  
The term frequency-inverse document frequency measure places importance on terms that are more frequent in a document, but are not frequent in all documents. 

In [9]:
def vectorize_text(df,
                   maxdf=.5,
                   mindf=5,
                   ngram_range=(1, 1),
                   stop_words=stop):
    
    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range, 
                                       max_df=maxdf, 
                                       min_df=mindf, 
                                       stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(text_df['cleaned_text'])
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    df_transform = pd.DataFrame(tfidf.toarray())
    df_transform.columns = tfidf_feature_names
    df_transform['category'] = text_df['category']
    
    return df_transform

### Apply transformation to corpus

In [10]:
df_transform = vectorize_text(text_df)

In [11]:
df_transform.head()

Unnamed: 0,aaa,aaron,abandon,abandonment,abbott,abc,abide,ability,able,abn,...,zach,zealand,zeppelin,zero,zhang,zimbabwe,zombie,zone,zoom,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.056514,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041475,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create logistic classifier function

In [16]:
def logistic_classifier(df):
    
    X = df.drop(labels=['category'],
                axis=1)
        
    y = df['category']
    lr = LogisticRegression(penalty='l2',
                            dual=False,
                            tol=.0001,
                            C=1,
                            )
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.3)

    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    print('Test Accuracy','{:.1%}'.format(lr.score(X_test, y_test)))
    print('Train Accuracy','{:.1%}'.format(lr.score(X_train, y_train)))
    print('Confusion Matrix')
    print(confusion_matrix(y_test, y_pred))

### Classify documents

In [17]:
logistic_classifier(df_transform)

Test Accuracy 96.4%
Train Accuracy 99.8%
Confusion Matrix
[[151   0   6   0   1]
 [  1 114   2   0   0]
 [  1   2 116   2   0]
 [  0   0   0 159   0]
 [  1   3   1   4 104]]


Our model seems to have done very well, classifying test data with high accuracy. Let's see if we can do better by changing the ngram range.

### Change N-Gram Range

In [14]:
df_transform = vectorize_text(text_df,
                              ngram_range=(1, 2))
logistic_classifier(df_transform)

Test Accuracy 96.9%
Train Accuracy 99.5%
Confusion Matrix
[[142   0   2   0   1]
 [  1 100   2   0   1]
 [  8   0 129   0   0]
 [  0   0   0 155   0]
 [  2   1   1   2 121]]


As we can see, changing the number of ngrams from a max of one word to two words has helped let's try changing it to three

In [15]:
df_transform = vectorize_text(text_df,
                              ngram_range=(1, 3))
logistic_classifier(df_transform)

Test Accuracy 97.5%
Train Accuracy 99.6%
Confusion Matrix
[[164   0   1   1   3]
 [  2 116   1   0   0]
 [  4   0 109   0   2]
 [  0   0   0 142   0]
 [  1   1   1   0 120]]


As we can see, now the model is overfit

### Conclusions  
