In [1]:
#############  The classification reports are shown at the bottom of the notebook ##########

import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WordPunctTokenizer
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from numpy import array
import string
from sklearn.metrics import classification_report
from nltk.tag import CRFTagger
from collections import Counter
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree
from nltk.chunk import tree2conlltags
import spacy
nlp = spacy.load("en_core_web_sm")
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
def spacystring(Text):
    """Clean a raw line and run it through the spaCy pipeline `nlp`.

    The text is trimmed of leading/trailing ASCII punctuation, punctuation that
    touches a word is split off with a space, and every remaining non-word,
    non-whitespace character and underscore is deleted. Casing is left as-is.

    Returns whatever `nlp(...)` returns for the cleaned text; toFeatureVector
    receives it as its `spacytok` argument.
    """
    cleaned = Text.strip(string.punctuation)

    # (pattern, replacement) pairs, applied in this exact order.
    rules = (
        (r"(\w)([.,;:!?'\"”\)])", r"\1 \2"),   # punctuation directly after a word
        (r"([.,;:!?'\"“\(])(\w)", r"\1 \2"),   # punctuation directly before a word
        (r'[^\w\s]', ''),                      # drop anything that is not word/space
        (r'\_', ''),                           # underscores count as \w, drop them too
    )
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    return nlp(cleaned)
    

In [3]:
def loadData(path, Text=None):
    """Read the training CSV at `path` and append every parsed row to rawData.

    path -- path of a comma-delimited, UTF-8 encoded CSV file.
    Text -- unused; kept only so existing calls keep working.

    Each row is converted by parseReview into a (Lines, Char, Gender) tuple and
    appended to the module-level list rawData. Nothing is returned.
    """
    # newline='' hands newline handling to the csv module, so quoted fields that
    # contain line breaks are parsed correctly (as the csv documentation requires).
    with open(path, encoding='utf-8', newline='') as f:
        for line in csv.reader(f, delimiter=','):
            if not line:
                # csv.reader yields [] for a blank line; parseReview would raise
                # IndexError on it, so skip it instead.
                continue
            rawData.append(parseReview(line))
            
def loadTest(path, Text=None):
    """Read the test CSV at `path` and append every parsed row to rawTest.

    path -- path of a comma-delimited, UTF-8 encoded CSV file.
    Text -- unused; kept only so existing calls keep working.

    Each row is converted by parseReview into a (Lines, Char, Gender) tuple and
    appended to the module-level list rawTest. Nothing is returned.
    """
    # newline='' hands newline handling to the csv module, so quoted fields that
    # contain line breaks are parsed correctly (as the csv documentation requires).
    with open(path, encoding='utf-8', newline='') as f:
        for line in csv.reader(f, delimiter=','):
            if not line:
                # csv.reader yields [] for a blank line; parseReview would raise
                # IndexError on it, so skip it instead.
                continue
            rawTest.append(parseReview(line))

            
def splitData(percentage):
    """Fill trainData from rawData and testData from rawTest.

    percentage -- fraction (0..1) of rawData to use for training. The same number
                  of rows, int(percentage * len(rawData) / 2), is taken from the
                  start of each half of rawData.

    Every selected row is turned into a (feature dict, Gender) pair and appended
    to the module-level lists trainData / testData. All rows of rawTest are used
    for testData.
    """
    total = len(rawData)
    midpoint = int(total / 2)
    perHalf = int((percentage * total) / 2)

    def featurise(text):
        # Same call order as before: tokenise, spaCy-parse, then build features.
        return toFeatureVector(preProcess(text), spacystring(text))

    trainRows = rawData[:perHalf] + rawData[midpoint:midpoint + perHalf]
    trainData.extend((featurise(Lines), Gender) for (Lines, _, Gender) in trainRows)

    # To evaluate on a validation split of rawData instead of the test file, use
    # rawData[perHalf:midpoint] + rawData[midpoint + perHalf:] as the test rows.
    testData.extend((featurise(Lines), Gender) for (Lines, _, Gender) in rawTest)

In [5]:
# Convert one CSV row into a (Lines, Char, Gender) tuple of strings
def parseReview(line):
    """Return the first three columns of `line` as strings.

    line -- an indexable row with at least three columns; extra columns are
            ignored and a shorter row raises IndexError.

    The three values are called Lines, Char and Gender by the callers
    (loadData / loadTest), which append the returned tuple to their raw lists.
    """
    return (str(line[0]), str(line[1]), str(line[2]))

In [6]:
def preProcess(Text):
    """Tokenise a raw line into a list of lower-cased tokens.

    Leading/trailing ASCII punctuation is trimmed, punctuation touching a word is
    split off with a space, then all non-word/non-space characters and
    underscores are removed. The result is split on runs of whitespace.

    Empty tokens are kept on purpose: an empty line yields [''] and leading or
    trailing whitespace yields '' at that end of the list.
    Stop words are not removed.
    """
    cleaned = Text.strip(string.punctuation)

    # (pattern, replacement) pairs, applied in this exact order.
    rules = (
        (r"(\w)([.,;:!?'\"”\)])", r"\1 \2"),   # punctuation directly after a word
        (r"([.,;:!?'\"“\(])(\w)", r"\1 \2"),   # punctuation directly before a word
        (r'[^\w\s]', ''),                      # drop anything that is not word/space
        (r'\_', ''),                           # underscores count as \w, drop them too
    )
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # tokenise on whitespace, then normalise the case
    return [piece.lower() for piece in re.split(r"\s+", cleaned)]


In [24]:
# Module-level state used by toFeatureVector (which returns a dictionary with
# features as keys and weights as values).

featureDict = {} # Global registry: token -> running index, filled by toFeatureVector
emptyline = ""   # What a blank first token looks like after .strip()
empty = "Empty"  # Feature name emitted for an empty line

# CRF part-of-speech tagger; the model is read from "crf_pos.tagger"
# (relative path, so presumably expected in the working directory).
posttagger = CRFTagger()
posttagger.set_model_file("crf_pos.tagger")



def toFeatureVector(tokens,spacytok):
    """Build the feature dictionary for one line.

    tokens   -- list of token strings (the output of preProcess).
    spacytok -- spaCy parse of the same line (the output of spacystring); only
                the commented-out experiments 4-7 below read it.

    Returns a dict mapping feature name -> weight:
      * an empty line (no tokens at all, or a blank first token) gives
        {"Empty": 1};
      * otherwise every distinct token maps to its relative frequency in the
        line (occurrences / len(tokens)).

    Side effect: every token not seen before is registered in the global
    featureDict with the next running index, so len(featureDict) is the number
    of distinct features encountered so far.
    """
    # (token, POS tag) pairs. Only experiments 2 and 3 below consume this; it is
    # kept so those sections keep working when they are uncommented.
    tagged = posttagger.tag(tokens)
    localFeat = {}                         #Local dictionary to be returned

    # preProcess yields [''] for an empty line. The `not tokens` guard also covers
    # an empty list, which previously raised IndexError on tokens[0].
    if not tokens or tokens[0].strip() == emptyline:
        localFeat[empty] = 1           #Adds the word "Empty" with weight 1 for an empty line

    else:
######### 1. Weighted word count for all words in sentence ##########
#####################################################################
        weight = 1.0 / len(tokens)     # every occurrence contributes the same share
        for words in tokens:
            if words not in featureDict:
                featureDict[words] = len(featureDict) + 1  #register the new token with the next index
            localFeat[words] = localFeat.get(words, 0.0) + weight

######### 2. Adding postags as keys and the counts as weights #########
#######################################################################

#         for word in tagged:
#             try:
#                 localFeat[word[1]] += (1.0/len(tagged)) #Tries adding the weights to existing postags. 
#             except KeyError:
#                 localFeat[word[1]] = (1.0/len(tagged)) #If the tag is not present in dictionary, this line will add to it along with its weight.

######### 3. Adding word as keys and postags data #####################
#######################################################################

#         for word in tagged:           
#             localFeat[word[0]] = (word[1]) #Adding word as keys and postags data

######### 4. Adding NER (BIO tags) as keys and the counts as weights ##
#######################################################################

#         for w in spacytok:           
#             ent = [w.ent_iob_]  #extracts the labeled entity of the token
#             try:
#                 localFeat[ent[0]] += (1.0/len(spacytok)) #Tries adding the weights to existing BIO tags
#             except KeyError:
#                 localFeat[ent[0]] = (1.0/len(spacytok)) #adds the bio tag and its weight if its not there already
#             try:
#                 i = featureDict[ent[0]] #Tries adding the weights to existing tokens that's been calculated
#             except KeyError:
#                 i = len(featureDict) + 1  #adds 1 to length of feature dict
#                 featureDict[ent[0]] = i #adds the word and its number of frequency


########## 5. Adding word as keys and NER (BIO tags) as data ##########
#######################################################################

#         for w in spacytok:           
#             ent = [w.ent_iob_]  #extracts the labeled entity of the token         
#             localFeat[w] = (ent[0]) #adds the bio tag and its weight if its not there already


########## 6. Adding dependency (dep) as keys and freq as weight ######
#######################################################################

#         for d in spacytok:

#             ent = [d.dep_]  #extracts the labeled entity of the token
#             try:
#                 localFeat[ent[0]] += (1.0/len(spacytok)) #Tries adding the weights to existing BIO tags
#             except KeyError:
#                 localFeat[ent[0]] = (1.0/len(spacytok)) #adds the bio tag and its weight if its not there already
#             try:
#                 i = featureDict[ent[0]] #Tries adding the weights to existing tokens that's been calculated
#             except KeyError:
#                 i = len(featureDict) + 1  #adds 1 to length of feature dict
#                 featureDict[ent[0]] = i #adds the word and its number of frequency


########## 7. Adding word as keys and dependency (dep) as data ########
#######################################################################

#         for d in spacytok:
#             ent = [d.dep_]     #extracts the dependency tag         
#             localFeat[d] = (ent[0]) #adds word as keys and dependency (dep) as data


    return (localFeat)

In [25]:
def trainClassifier(trainData):
    """Fit a LinearSVC on `trainData` and return the trained classifier.

    trainData -- the labelled feature sets to learn from; in this notebook a
                 list of (feature dict, label) pairs.

    The scikit-learn pipeline is wrapped in NLTK's SklearnClassifier, and the
    value returned is the result of its train() call.
    """
    print("Training Classifier...")
    steps = [('svc', LinearSVC())]
    # Alternative model: steps = [('clf', MultinomialNB())]
    wrapper = SklearnClassifier(Pipeline(steps))
    return wrapper.train(trainData)

In [26]:
def predictLabels(reviewSamples, classifier):
    """Classify many samples at once.

    reviewSamples -- iterable of (feature dict, label) pairs; only the first
                     element of each pair is passed to the classifier.
    classifier    -- object exposing classify_many(featuresets).

    Returns whatever classifier.classify_many returns for the feature dicts.
    """
    featuresOnly = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(featuresOnly)

def predictLabel(reviewSample, classifier):
    """Classify a single raw line of text.

    reviewSample -- the raw text of one line.
    classifier   -- object exposing classify(featureset).

    The text is converted to features exactly as in splitData: tokens from
    preProcess plus the spaCy parse from spacystring. toFeatureVector requires
    both arguments; passing only the tokens raised TypeError.
    """
    features = toFeatureVector(preProcess(reviewSample), spacystring(reviewSample))
    return classifier.classify(features)

In [27]:
# Per-fold scores of the most recent crossValidate() run.
precisionAvg = []
recallAvg = []
fscoreAvg = []


def crossValidate(dataset, folds):
    """Run k-fold cross-validation and return the averaged weighted scores.

    dataset -- list of (feature dict, label) pairs.
    folds   -- number of folds (k); must satisfy 2 <= folds <= len(dataset).

    The data is shuffled (on a copy, so the caller's list keeps its order) and
    cut into `folds` consecutive slices of len(dataset) // folds samples; the
    last slice also takes the remainder. Each slice is used once as the test
    set while a classifier is trained on all the others.

    Returns (average precision, average recall, average F-score) over the
    folds. The per-fold scores of this run are left in the module-level lists
    precisionAvg, recallAvg and fscoreAvg.

    Raises ValueError if `folds` is out of range.
    """
    if folds < 2 or folds > len(dataset):
        raise ValueError("folds must be between 2 and len(dataset), got %r" % (folds,))

    # Reset the accumulators so repeated calls do not mix in scores from
    # earlier runs (which would skew the averages).
    for scores in (precisionAvg, recallAvg, fscoreAvg):
        del scores[:]

    shuffled = list(dataset)
    shuffle(shuffled)
    onefold = len(shuffled) // folds   # size of every fold except the last

    for i in range(folds):
        start = i * onefold
        # the final fold absorbs whatever is left after integer division
        end = start + onefold if i < folds - 1 else len(shuffled)
        testFold = shuffled[start:end]
        trainFold = shuffled[:start] + shuffled[end:]

        classifier = trainClassifier(trainFold)      # train the classifier
        testTrue = [t[1] for t in testFold]          # ground-truth labels of the held-out fold
        testPred = predictLabels(testFold, classifier)
        finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')
        print("Crossvalidation iteration number: ", i+1)
        print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

        precisionAvg.append(finalScores[0])
        recallAvg.append(finalScores[1])
        fscoreAvg.append(finalScores[2])

    finalPrecisionAvg = sum(precisionAvg)/folds
    finalRecallAvg = sum(recallAvg)/folds
    finalFscoreAvg = sum(fscoreAvg)/folds

    print("%d fold crossvalidation results: " % folds)
    print ("The average precision score is ", finalPrecisionAvg ) #Result for precision average 
    print  ("The average recall score is ", finalRecallAvg ) #Result for recall average
    print ("The average fscore score is ", finalFscoreAvg ) #Result for fscore average

    return finalPrecisionAvg, finalRecallAvg, finalFscoreAvg
    

In [28]:
##### Main ####
# Loads both CSV files, converts every row to features and reports the sizes.

rawData = []          # rows of the training CSV as (Lines, Char, Gender) tuples, filled by loadData
rawTest = []          # rows of the test CSV as (Lines, Char, Gender) tuples, filled by loadTest
trainData = []        # (feature dict, Gender) pairs built from rawData by splitData
testData = []         # (feature dict, Gender) pairs built from rawTest by splitData


# references to the data files
reviewPath = 'training.csv'
rawtestset = 'test.csv'
# Do the actual stuff (i.e. call the functions we've made)
# Parse both files into the raw lists; the first print shows the (empty) starting state
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
loadTest(rawtestset)

# splitData(1) asks for 100% of rawData as training data (int truncation leaves
# out the last row when the row count is odd); the test data always comes from rawTest.
# Print the sizes before the feature extraction
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(1)
# Print the sizes and the number of distinct features after the feature extraction
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

print('length train', len(trainData))
print('length test', len(testData))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10113 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 10113 rawData, 10112 trainData, 1124 testData
Training Samples: 
10112
Features: 
5906
length train 10112
length test 1124


In [29]:
# Manual 10-way split of trainData: fold1 is held out, folds 2-10 form the
# training part.
onefold = int(len(trainData)/10)
(fold1, fold2, fold3, fold4, fold5,
 fold6, fold7, fold8, fold9) = (trainData[k*onefold:(k+1)*onefold] for k in range(9))
fold10 = trainData[onefold*9:]      # the last fold also takes the remainder

#print(fold1)
cvfoldtest = fold1
cvfoldtrain = trainData[onefold:]   # same samples, same order, as fold2 + ... + fold10

In [34]:
########## 10-Fold Cross-validation ###############

# cvResults = crossValidate(trainData, 10) #calling CV function
# print(cvResults) 

In [33]:
# Final evaluation: train on all of trainData, score on testData.
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    #print(testData[0])   # have a look at the first test data instance
    classifier = trainClassifier(trainData)  # train the classifier
    testTrue = [t[1] for t in testData]   # get the ground-truth labels from the data
    testPred = predictLabels(testData, classifier)  # classify the test data to get predicted labels
    finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted') # weighted-average precision/recall/F-score
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])

Training Classifier...
Done training!
Precision: 0.588599
Recall: 0.587189
F Score:0.587580


In [32]:
# Per-class precision/recall/F1 report for the predictions made in the previous
# cell (testTrue and testPred must already exist).
classReport= classification_report(testTrue, testPred)
print(classReport)

              precision    recall  f1-score   support

      female       0.56      0.58      0.57       526
        male       0.62      0.59      0.60       598

    accuracy                           0.59      1124
   macro avg       0.59      0.59      0.59      1124
weighted avg       0.59      0.59      0.59      1124



# Results (Fine tune on validation set)

In [None]:
##### 1. Classification report for using weighted word count (Benchmark) #######
################################################################################

#               precision    recall  f1-score   support

#       female       0.55      0.53      0.54      1017
#         male       0.54      0.56      0.55      1006

#     accuracy                           0.55      2023
#    macro avg       0.55      0.55      0.55      2023
# weighted avg       0.55      0.55      0.55      2023

##Comments
##As a benchmark, I'm using a simple weighted word count in toFeatureVector, as I did in assignment 1.
##The result is decent, with a macro average of 0.55.

####################### Cross validation result ################################
################################################################################

# 10 fold crossvalidation results: 
# The average precision score is  0.5634858116939093
# The average recall score is  0.5625463535228677
# The average fscore score is  0.5620891366442106

##Comments
##Using crossvalidation on the same feature vector produces an average fscore of 0.562.
##The avg fscore increases slightly when measured with crossvalidation on the dataset.

In [None]:
############# 2. Classification report for using Postags #######################
################################################################################

#               precision    recall  f1-score   support

#       female       0.52      0.50      0.51      1017
#         male       0.51      0.54      0.53      1006

#     accuracy                           0.52      2023
#    macro avg       0.52      0.52      0.52      2023
# weighted avg       0.52      0.52      0.52      2023

##Comments
##Using the postags alone in the toFeatureVector function reduces the macro avg to 0.52

####################### Cross validation result ################################
################################################################################

# 10 fold crossvalidation results: 
# The average precision score is  0.5184037732730571
# The average recall score is  0.5165636588380718
# The average fscore score is  0.5152655555112817

##Comments
##Using crossvalidation on the same feature vector produces an average fscore of 0.515.
##The avg fscore remains essentially unchanged when measured with crossvalidation.

In [None]:
####### 3. Classification report for weighted word count and Postags ###########
################################################################################

#               precision    recall  f1-score   support

#       female       0.55      0.54      0.54      1017
#         male       0.54      0.56      0.55      1006

#     accuracy                           0.55      2023
#    macro avg       0.55      0.55      0.55      2023
# weighted avg       0.55      0.55      0.55      2023

##Comments
##Combining the postags and the weighted word counts doesn't improve the macro avg.
##The result remains at 0.55 using this method.

####################### Cross validation result ################################
################################################################################

# 10 fold crossvalidation results: 
# The average precision score is  0.5629531756078207
# The average recall score is  0.5610630407911001
# The average fscore score is  0.560739974495919

##Comments
##Using crossvalidation on the same feature vector produces a result of 0.56
##The avg fscore increased after using crossvalidation

In [None]:
####### 4. Classification report for NER (BIO-tags) as features ################
################################################################################

#               precision    recall  f1-score   support

#       female       0.50      0.16      0.24      1017
#         male       0.50      0.84      0.62      1006

#     accuracy                           0.50      2023
#    macro avg       0.50      0.50      0.43      2023
# weighted avg       0.50      0.50      0.43      2023

##Comments
##By using the BIO tags as weighted features, the macro avg f1-score is 0.43.
##The tags on their own are not capable of improving the model.

####################### Cross validation result ################################
################################################################################

# 10 fold crossvalidation results: 
# The average precision score is  0.5141275912199073
# The average recall score is  0.5076637824474659
# The average fscore score is  0.4440490339322534

##Comments
##However, by using crossvalidation, it did increase the performance by 0.01

In [None]:
####### 5. Classification report for dependency features as keys and weight ####
################################################################################

#               precision    recall  f1-score   support

#       female       0.51      0.45      0.48      1017
#         male       0.50      0.55      0.53      1006

#     accuracy                           0.50      2023
#    macro avg       0.50      0.50      0.50      2023
# weighted avg       0.50      0.50      0.50      2023

##Comments
##By using syntactic dependency as features, the macro avg is 0.50

####################### Cross validation result ################################
################################################################################

# 10 fold crossvalidation results: 
# The average precision score is  0.502701695421722
# The average recall score is  0.5006180469715698
# The average fscore score is  0.498880598781958

##Comments
##The performance of the model using crossvalidation didn't increase either

In [None]:
####### 6. Classification report for dependency (word as keys and dep as data) #
################################################################################

#               precision    recall  f1-score   support

#       female       0.54      0.58      0.56      1017
#         male       0.54      0.50      0.52      1006

#     accuracy                           0.54      2023
#    macro avg       0.54      0.54      0.54      2023
# weighted avg       0.54      0.54      0.54      2023

##Comments
##Using syntactic dependency alone, the model managed to get a macro avg of 0.54

####################### Cross validation result ################################
################################################################################
# 10 fold crossvalidation results: 
# The average precision score is  0.5482369429454279
# The average recall score is  0.5474660074165636
# The average fscore score is  0.5474281453967389

##Comments
##Cross validation shows that, averaged over all folds, the model obtains
##a result of almost 0.55

In [None]:
############### 7. Combining Pos tags and Dependency   #########################
################################################################################

#               precision    recall  f1-score   support

#       female       0.55      0.57      0.56      1017
#         male       0.55      0.53      0.54      1006

#     accuracy                           0.55      2023
#    macro avg       0.55      0.55      0.55      2023
# weighted avg       0.55      0.55      0.55      2023

##Comments
##By combining postags and dependency tags as features, the macro avg is 0.55

####################### Cross validation result ################################
################################################################################

# 10 fold crossvalidation results: 
# The average precision score is  0.5432027884531633
# The average recall score is  0.5428924598269468
# The average fscore score is  0.5426211409428975

##Comments
##However, the crossvalidation achieved an avg of 0.54 score

# Results (on test set)

In [None]:
####### 3. Classification report for weighted word count and Postags ###########
################################################################################

#               precision    recall  f1-score   support

#       female       0.56      0.58      0.57       526
#         male       0.62      0.59      0.60       598

#     accuracy                           0.59      1124
#    macro avg       0.59      0.59      0.59      1124
# weighted avg       0.59      0.59      0.59      1124

################ precision_recall_fscore_support ###############################

# Precision: 0.588599
# Recall: 0.587189
# F Score:0.587580

## After testing different combinations of features (postags, biotags, dependency, word count), the best combination for me
## was the word count and postags together.

## Using the best combination of features (weighted word count and Postags) which obtained 0.56 on the validation set,
## it managed to obtain 0.59 on the test set.

## for a more precise number, the precision_recall_fscore_support function outputs the result to 6 decimal places.
## The f-score is actually 0.587
