In [1]:
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import sentiwordnet as swn
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# change to train data 
comments_train = pd.read_csv('cleaned_train_data.csv')
comments_test = pd.read_csv('cleaned_test_data.csv')

In [3]:
en_comments_train = comments_train[comments_train['Language'] =='en']
en_comments_test = comments_test[comments_test['Language'] =='en']

In [4]:
en_comments_train

Unnamed: 0,Comment,Language,Sentiment
0,curve grade hehehehe,en,1
1,master lesson perfectly,en,1
2,terror but teach well,en,1
3,unenthusiastic barely understand teach,en,-1
4,not applicable,en,0
...,...,...,...
199,rapport student strong dress go extra mile tea...,en,1
200,bias,en,-1
201,fair grade student,en,1
353,engage,en,1


## POS Tagging

In [5]:
en_comments_train['Comment'] = en_comments_train['Comment'].apply(str)
en_comments_test['Comment'] = en_comments_test['Comment'].apply(str)

pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

pd.set_option("display.max_rows", None, "display.max_columns", None)
en_comments_train['POS tagged'] = en_comments_train['Comment'].apply(token_stop_pos)
en_comments_test['POS tagged'] = en_comments_test['Comment'].apply(token_stop_pos)
en_comments_train

Unnamed: 0,Comment,Language,Sentiment,POS tagged
0,curve grade hehehehe,en,1,"[(curve, n), (grade, n), (hehehehe, n)]"
1,master lesson perfectly,en,1,"[(master, n), (lesson, n), (perfectly, r)]"
2,terror but teach well,en,1,"[(terror, n), (but, None), (teach, n), (well, r)]"
3,unenthusiastic barely understand teach,en,-1,"[(unenthusiastic, a), (barely, r), (understand..."
4,not applicable,en,0,"[(not, r), (applicable, a)]"
5,favoritism student,en,-1,"[(favoritism, n), (student, n)]"
6,bore monotonous voice,en,-1,"[(bore, r), (monotonous, a), (voice, n)]"
7,best teacher enjoy class,en,1,"[(best, a), (teacher, n), (enjoy, n), (class, n)]"
8,tell explain clearly precisely day defense pro...,en,-1,"[(tell, n), (explain, v), (clearly, r), (preci..."
9,interact student lecture,en,1,"[(interact, a), (student, n), (lecture, n)]"


In [6]:
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos: 
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:  
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew
    
en_comments_train['Lemma'] = en_comments_train['POS tagged'].apply(lemmatize)
en_comments_test['Lemma'] = en_comments_test['POS tagged'].apply(lemmatize)
en_comments_train.head()

Unnamed: 0,Comment,Language,Sentiment,POS tagged,Lemma
0,curve grade hehehehe,en,1,"[(curve, n), (grade, n), (hehehehe, n)]",curve grade hehehehe
1,master lesson perfectly,en,1,"[(master, n), (lesson, n), (perfectly, r)]",master lesson perfectly
2,terror but teach well,en,1,"[(terror, n), (but, None), (teach, n), (well, r)]",terror but teach well
3,unenthusiastic barely understand teach,en,-1,"[(unenthusiastic, a), (barely, r), (understand...",unenthusiastic barely understand teach
4,not applicable,en,0,"[(not, r), (applicable, a)]",not applicable


In [7]:
def sentiwordnetanalysis(pos_data):
    pos_sentiment = 0
    neg_sentiment = 0
    word_pos_sentiment = 0
    word_neg_sentiment = 0
    tokens_count = 0
    
    for word, pos in pos_data:
        if not pos:
            continue
        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        if not lemma:
            continue
        
        synsets = wordnet.synsets(lemma, pos=pos)
        if not synsets:
            continue


        word_pos_sentiment = 0
        word_neg_sentiment = 0
        synset_count = len(wordnet.synsets(lemma, pos=pos))
        if synset_count == 1:
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            word_pos_sentiment += swn_synset.pos_score()
            word_neg_sentiment += swn_synset.neg_score()
        else:
            for x in range(synset_count):
                synset = synsets[x]
                swn_synset = swn.senti_synset(synset.name())
                word_pos_sentiment += swn_synset.pos_score()
                word_neg_sentiment += swn_synset.neg_score()
            
        pos_sentiment += word_pos_sentiment/synset_count
        neg_sentiment += word_neg_sentiment/synset_count
            
        tokens_count += 1
        
    if not tokens_count:
        return 0
    
    if pos_sentiment>neg_sentiment:
        return 1
    if pos_sentiment<neg_sentiment:
        return -1
    else:
        return 0

en_comments_train['Lexicon Sentiment'] = en_comments_train['POS tagged'].apply(sentiwordnetanalysis)
en_comments_test['Lexicon Sentiment'] = en_comments_test['POS tagged'].apply(sentiwordnetanalysis)
en_comments_train.head()

Unnamed: 0,Comment,Language,Sentiment,POS tagged,Lemma,Lexicon Sentiment
0,curve grade hehehehe,en,1,"[(curve, n), (grade, n), (hehehehe, n)]",curve grade hehehehe,1
1,master lesson perfectly,en,1,"[(master, n), (lesson, n), (perfectly, r)]",master lesson perfectly,1
2,terror but teach well,en,1,"[(terror, n), (but, None), (teach, n), (well, r)]",terror but teach well,1
3,unenthusiastic barely understand teach,en,-1,"[(unenthusiastic, a), (barely, r), (understand...",unenthusiastic barely understand teach,-1
4,not applicable,en,0,"[(not, r), (applicable, a)]",not applicable,-1


In [17]:
en_comments_train.drop(["POS tagged", "Lemma"], axis='columns', inplace=True) #drops columns
en_comments_test.drop(["POS tagged", "Lemma"], axis='columns', inplace=True) #drops columns
en_comments_train

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Comment,Language,Sentiment,Lexicon Sentiment
0,curve grade hehehehe,en,1,1
1,master lesson perfectly,en,1,1
2,terror but teach well,en,1,1
3,unenthusiastic barely understand teach,en,-1,-1
4,not applicable,en,0,-1
5,favoritism student,en,-1,0
6,bore monotonous voice,en,-1,-1
7,best teacher enjoy class,en,1,1
8,tell explain clearly precisely day defense pro...,en,-1,1
9,interact student lecture,en,1,-1


In [19]:
en_comments_train.to_csv('en_comments_lexicon_tags_train.csv', index=False)

In [20]:
en_comments_test.to_csv('en_comments_lexicon_tags_test.csv', index=False)

## Testing Accuracy

In [21]:
en_train_test = pd.concat([en_comments_train, en_comments_test])
en_train_test

Unnamed: 0,Comment,Language,Sentiment,Lexicon Sentiment
0,curve grade hehehehe,en,1,1
1,master lesson perfectly,en,1,1
2,terror but teach well,en,1,1
3,unenthusiastic barely understand teach,en,-1,-1
4,not applicable,en,0,-1
5,favoritism student,en,-1,0
6,bore monotonous voice,en,-1,-1
7,best teacher enjoy class,en,1,1
8,tell explain clearly precisely day defense pro...,en,-1,1
9,interact student lecture,en,1,-1


In [22]:
accuracy_lexicon = accuracy_score(en_train_test['Sentiment'], en_train_test['Lexicon Sentiment'])
correct_lexicon = accuracy_score(en_train_test['Sentiment'], en_train_test['Lexicon Sentiment'], normalize=False)

print('Lexicon Sentiments')
print('Accuracy Score: ', accuracy_lexicon)
print('Count of Correctly Classified Comments: ', correct_lexicon, '/ 231')

Lexicon Sentiments
Accuracy Score:  0.5974025974025974
Count of Correctly Classified Comments:  138 / 231


In [23]:
print(classification_report(en_train_test['Sentiment'], en_train_test['Lexicon Sentiment']))

              precision    recall  f1-score   support

          -1       0.61      0.45      0.52        96
           0       0.20      0.24      0.22        21
           1       0.66      0.79      0.72       114

    accuracy                           0.60       231
   macro avg       0.49      0.49      0.49       231
weighted avg       0.60      0.60      0.59       231

