## Model training/1

### We use the stored files in the 'dataset' directory as our corpus

In [1]:
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = 'dataset/' # Directory of corpus.
song_lyrics = PlaintextCorpusReader(corpusdir, '.*')

### First of all, we train the model without preprocessing the documents

In [2]:
categories = ['happy', 'sad']
documents = []

for fileid in song_lyrics.fileids():
    for category in categories:
        if category in fileid:
            documents.append((list(song_lyrics.words(fileid)), category))

import random
random.seed(123)
random.shuffle(documents)

In [3]:
print(song_lyrics.words())

all_words = nltk.FreqDist(w.lower() for w in song_lyrics.words())
most_common_words = all_words.most_common(1000)
word_features = [x for (x, _) in most_common_words]

['Well', 'you', 'done', 'done', 'me', 'and', 'you', ...]


In [4]:
# Our binary feature function
def document_features(document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

import math
featuresets = [(document_features(d), c) for (d,c) in documents]
n = math.floor(len(featuresets) * 0.8)
train_set, test_set = featuresets[:n], featuresets[n:]

In [5]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

unlabeled_test_set = list()
actual_label = list()

for (d,c) in test_set:
    unlabeled_test_set.append(d)
    actual_label.append(c)
    

predicted_label = classifier.classify_many(unlabeled_test_set)

from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(actual_label, predicted_label)
print(cm)

      |   h     |
      |   a     |
      |   p   s |
      |   p   a |
      |   y   d |
------+---------+
happy |<171>129 |
  sad |  76<291>|
------+---------+
(row = reference; col = test)



In [6]:
def print_precision_recall(cm):
    precision = cm['happy', 'happy'] / (cm['happy', 'happy'] + cm['sad', 'happy'])
    print("Precision is %s" % precision)

    recall = cm['happy', 'happy'] / (cm['happy', 'happy'] + cm['happy', 'sad'])
    print("Recall is %s" % recall)

    f_measure = (2 * recall * precision) / (precision + recall)
    print("F-measure is %s" % f_measure)
    
print("Accuracy is %s" % nltk.classify.accuracy(classifier, test_set))
print_precision_recall(cm)

Accuracy is 0.6926536731634183
Precision is 0.6923076923076923
Recall is 0.57
F-measure is 0.6252285191956124


### Now, we train the model preprocessing the documents. For preprocessing we can use various stemming methods. 
### The best found solution is: SnowballStemmer (ignoring the stemming of stopwords) and without stopwords removal.

In [7]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
import re

def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))

# This function does all the textual preprocessing steps
def preprocessing(string_to_process):
    normalized_string = string_to_process.lower().replace("_", "")
    # 1. Tokenize it! This also removes non alphanumeric characters since we're tokenizing words only.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(normalized_string)
    
    # 2. Remove stopwords
    #filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    
    # 3. Remove numbers
    filtered_tokens = [word for word in tokens if not hasNumbers(word)]
    
    # 4. Stemm it!
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    #stemmer = SnowballStemmer("english")
    #stemmer = LancasterStemmer()
    #stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    return stemmed_tokens
    

preprocessed_documents = list()

for document in documents:
    i = documents.index(document)
    if (i % 200 == 0):
        print("Document %s " % i)
    document_text = ' '.join(document[0])
    preprocessed_text = preprocessing(document_text)
    preprocessed_documents.append((preprocessed_text, document[1]))
print("done!")

Document 0 
Document 200 
Document 400 
Document 600 
Document 800 
Document 1000 
Document 1200 
Document 1400 
Document 1600 
Document 1800 
Document 2000 
Document 2200 
Document 2400 
Document 2600 
Document 2800 
Document 3000 
Document 3200 
done!


In [8]:
# Get most frequent words from list of documents
def get_most_frequent_words(documents, k=200):
    all_words = list()
    for document in documents:
        for word in document[0]:
            all_words.append(word)
    
    all_words_frequency = nltk.FreqDist(all_words)
    most_common_words = all_words_frequency.most_common(k)
    return [x for (x, _) in most_common_words]

word_features = get_most_frequent_words(preprocessed_documents, k=1000)

In [9]:
featuresets = [(document_features(d), c) for (d,c) in preprocessed_documents]
n = math.floor(len(featuresets) * 0.8)
train_set, test_set = featuresets[:n], featuresets[n:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [10]:
unlabeled_test_set = list()
actual_label = list()

for (d,c) in test_set:
    unlabeled_test_set.append(d)
    actual_label.append(c)
    

predicted_label = classifier.classify_many(unlabeled_test_set)

from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(actual_label, predicted_label)
print(cm)
print("Accuracy is %s" % nltk.classify.accuracy(classifier, test_set))
print(print_precision_recall(cm))
print(classifier.show_most_informative_features(15))

      |   h     |
      |   a     |
      |   p   s |
      |   p   a |
      |   y   d |
------+---------+
happy |<168>132 |
  sad |  77<290>|
------+---------+
(row = reference; col = test)

Accuracy is 0.6866566716641679
Precision is 0.6857142857142857
Recall is 0.56
F-measure is 0.6165137614678899
None
Most Informative Features
           contains(woo) = True            happy : sad    =     13.0 : 1.0
        contains(dancin) = True            happy : sad    =      9.4 : 1.0
           contains(wow) = True            happy : sad    =      8.5 : 1.0
            contains(eh) = True            happy : sad    =      8.5 : 1.0
          contains(clap) = True            happy : sad    =      6.7 : 1.0
          contains(boom) = True            happy : sad    =      6.2 : 1.0
          contains(whoa) = True            happy : sad    =      6.1 : 1.0
             contains(e) = True            happy : sad    =      5.9 : 1.0
      contains(american) = True            happy : sad    =      5

### Now, we consider n-grams

In [11]:
def find_k_best_bigrams_in_all_documents(documents, k=200):
    all_words = list()
    for document in documents:
        for word in document[0]:
            all_words.append(word)
    all_bigrams = nltk.bigrams(all_words)
    bigram_distributions = nltk.FreqDist(all_bigrams)
    most_frequent_bigrams = bigram_distributions.most_common(k)
    return [x for (x, _) in most_frequent_bigrams]

bigram_features = find_k_best_bigrams_in_all_documents(preprocessed_documents, k=1000)

def document_bigram_words_feature(document):
    document_bigrams = list(nltk.bigrams(document))
    result = dict((bigram,0) for bigram in bigram_features)
    for bigram in result.keys():
        result[bigram] = bigram in document_bigrams
    return result

def document_unigram_and_bigrams_words_feature(document):
    unigram_feature_vector = document_features(document)
    bigram_feature_vector = document_bigram_words_feature(document)
    return {**unigram_feature_vector, **bigram_feature_vector}


In [None]:
featuresets = [(document_unigram_and_bigrams_words_feature(d), c) for (d,c) in preprocessed_documents]
n = math.floor(len(featuresets) * 0.8)
train_set, test_set = featuresets[:n], featuresets[n:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

unlabeled_test_set = list()
actual_label = list()

for (d,c) in test_set:
    unlabeled_test_set.append(d)
    actual_label.append(c)
    

predicted_label = classifier.classify_many(unlabeled_test_set)

from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(actual_label, predicted_label)
print(cm)
print("Accuracy is %s" % nltk.classify.accuracy(classifier, test_set))
print(print_precision_recall(cm))
print(classifier.show_most_informative_features(15))

      |   h     |
      |   a     |
      |   p   s |
      |   p   a |
      |   y   d |
------+---------+
happy |<173>127 |
  sad |  75<292>|
------+---------+
(row = reference; col = test)

