In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, numpy as np, textblob, string
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


Create comment objects to hold desired information.<br>
Create TextBlobs of all of the textual comments. This is useful for extracting a lot of information on the text, such as sentiment, nouns, verbs,word and noun frequencies, inflection, spelling corrections, etc.<br>
Part of speech lists, giving each word in the sentence and what it is (i.e. noun, verb, adverb, etc. <br>
Sentiment analysis is performed using VADER which gives positive, negative, neutral and compound scores. The compound score represents the aggregate of the other scores.

In [51]:
# Create a comment object
class Comment:
    def __init__(self, label, text):
        self.label = label
        self.text = text
        self.split_text = text.split()
        self.blob = TextBlob(self.text)
        self.pos = self.blob.tags
        self.analyzer = SentimentIntensityAnalyzer()
        self.sentiment = self.analyzer.polarity_scores(self.text)
        self.words = self.blob.words
        self.sentences = self.blob.sentences
        self.noun_phrases = self.blob.noun_phrases
        
        length = int(len(self.words)/2)
        first =""
        second = ""
        for x in range(0,length):
            first += self.words[x]+" "
        for x in range(length, len(self.words)-1):
            second += self.words[x]+" "
        self.firstSent = self.analyzer.polarity_scores(first)
        self.secondSent = self.analyzer.polarity_scores(second)
        
  
    def __iter__(self):
        for i in range [0:len(self.split_text)-1] :
            return(self.split_text[i])
    
    def __next__(self):
        if i < len(self.split_text)-1:
            return self.split_text[i]
        else:
            raise StopIteration  # Done iterating.
        
    def getInfo(self):
        print("Label: " ,self.label," Text: ", self.text, "Sentiment: ",self.sentiment," Noun-phrases"
              , self.noun_phrases,"Parts of speech: ", self.pos)
        
    def getSarc(self):
        length = int(len(self.words)/2)
        first =""
        second = ""
        for x in range(0,length):
            first += self.words[x]+" "
        for x in range(length, len(self.words)-1):
            second += self.words[x]+" "
        firstSent = self.analyzer.polarity_scores(first)
        secondSent = self.analyzer.polarity_scores(second)
        return firstSent, secondSent
        
        

In [52]:
#Check it worked
comment = Comment(1, "Man sometimes it is so great when you get to the shop and realise you have no money")
print(comment.getInfo())
print(comment.getSarc())

Label:  1  Text:  Man sometimes it is so great when you get to the shop and realise you have no money Sentiment:  {'neg': 0.094, 'neu': 0.683, 'pos': 0.224, 'compound': 0.6176}  Noun-phrases [] Parts of speech:  [('Man', 'NN'), ('sometimes', 'VBZ'), ('it', 'PRP'), ('is', 'VBZ'), ('so', 'RB'), ('great', 'JJ'), ('when', 'WRB'), ('you', 'PRP'), ('get', 'VBP'), ('to', 'TO'), ('the', 'DT'), ('shop', 'NN'), ('and', 'CC'), ('realise', 'NN'), ('you', 'PRP'), ('have', 'VBP'), ('no', 'DT'), ('money', 'NN')]
None
({'neg': 0.0, 'neu': 0.604, 'pos': 0.396, 'compound': 0.7384}, {'neg': 0.239, 'neu': 0.761, 'pos': 0.0, 'compound': -0.296})


In [53]:
# Open the file and create comment objects with the label and comment

import csv

raw_comments =[]

with open("DevProject/Project work/Blind test.csv", encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if len(row['comment'].split())>3: 
            raw_comments.append(Comment(row['label'],row['comment']))

In [54]:
#Check it worked
print(raw_comments[27].getInfo())
print(raw_comments[27].getSarc())
print(len(raw_comments))

Label:  0  Text:  Wow... That looks nice. Sentiment:  {'neg': 0.0, 'neu': 0.517, 'pos': 0.483, 'compound': 0.4215}  Noun-phrases ['wow'] Parts of speech:  [('Wow', 'NN'), ('That', 'DT'), ('looks', 'VBZ'), ('nice', 'JJ')]
None
({'neg': 0.0, 'neu': 0.208, 'pos': 0.792, 'compound': 0.5859}, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0})
4264


Removing stopwords has little to no effect on the sentiment scores, however it may influence the algorithmg based on the content for detecting sarcasm through common words etc. <br>
All comments of less than 3 words are also removed.

In [59]:
sarc_comments =[]

for comment in raw_comments:
    if comment.label == "1":
        sarc_comments.append(comment)
        
        

In [60]:
print(sarc_comments[107].getInfo())

Label:  1  Text:  but it says "fingerprint", if I take a toeprint it would be a bureaucratic anomaly! Sentiment:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}  Noun-phrases [] Parts of speech:  [('but', 'CC'), ('it', 'PRP'), ('says', 'VBZ'), ('fingerprint', 'NN'), ('if', 'IN'), ('I', 'PRP'), ('take', 'VBP'), ('a', 'DT'), ('toeprint', 'NN'), ('it', 'PRP'), ('would', 'MD'), ('be', 'VB'), ('a', 'DT'), ('bureaucratic', 'JJ'), ('anomaly', 'NN')]
None


In [61]:
count =0
negCount=0
posCount=0
neuCount=0
total = len(sarc_comments)

for comment in sarc_comments:
    if comment.firstSent['neg'] < comment.firstSent['pos']:
        if comment.secondSent['pos'] <comment.secondSent['neg']:
            count+=1
     
    if comment.sentiment['neu'] >0.0:
        neuCount+=1
     
    if comment.sentiment['pos'] >0.0:
        posCount+=1
print("Number of sarc comments ", total,". Mixed pos and neg ",count, "neg ",negCount, " pos ", posCount, " neu ", neuCount)
print("Percentages: Mixed pos and neg ",(count/total)*100, "neg ",(negCount/total)*100, 
      " pos ", (posCount/total)*100, " neu ", (neuCount/total)*100)
count =0
negCount=0
posCount=0
neuCount=0
total = len(raw_comments)

for comment in raw_comments:
    if comment.firstSent['neg'] < comment.firstSent['pos']:
        if comment.secondSent['pos'] <comment.secondSent['neg']:
            count+=1
     
    if comment.sentiment['neu'] >0.0:
        neuCount+=1
     
    if comment.sentiment['pos'] >0.0:
        posCount+=1
print()
print("Number of comments ", len(raw_comments),". Mixed pos and neg ",count, "neg ",negCount, " pos ", posCount, " neu ", neuCount)
print("Percentages: Mixed pos and neg ",(count/total)*100, "neg ",(negCount/total)*100, 
      " pos ", (posCount/total)*100, " neu ", (neuCount/total)*100)

Number of sarc comments  2428 . Mixed pos and neg  120 neg  0  pos  1180  neu  2428
Percentages: Mixed pos and neg  4.942339373970346 neg  0.0  pos  48.59967051070841  neu  100.0



Number of comments  4264 . Mixed pos and neg  187 neg  0  pos  2037  neu  4262
Percentages: Mixed pos and neg  4.385553470919325 neg  0.0  pos  47.77204502814259  neu  99.953095684803


There appears no overall correlation in the sample between sentiment and sarcasm from the sample except that all contain neutral sentiment, however this is true for the entire sample so does not distinctly apply to sarcastic comments.

In [9]:
alt_nouns = []
from textblob.np_extractors import ConllExtractor

extractor = ConllExtractor
thisblob = TextBlob(raw_comments[100].text, np_extractor=extractor)
print(thisblob.noun_phrases)

ValueError: np_extractor must be an instance of BaseNPExtractor

In [71]:
ratio =[0,0]
count =0
for comment in raw_comments:
    count+=1
    thisrat = [0,0]
    for i in range (0, len(comment.text)-1):
        char = comment.text[i]
        if char.isalpha():
            ratio[0]+=1
        else:
            ratio[1]+=1
    
print ("Letters:",ratio[0]/count, " Symbols: ",ratio[1]/count)


for comment in sarc_comments:
    count+=1
    thisrat = [0,0]
    for i in range (0, len(comment.text)-1):
        char = comment.text[i]
        if char.isalpha():
            ratio[0]+=1
        else:
            ratio[1]+=1
    
print ("Letters:",ratio[0]/count, " Symbols: ",ratio[1]/count)
           

Letters: 52.01383677298311  Symbols:  12.974437148217635
Letters: 50.55215182307232  Symbols:  12.57531380753138


There appears to be no difference in the use of symbols in the sample

In [None]:
for word in raw_comments:
    