In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
comments_train = pd.read_csv('cleaned_train_data.csv')
comments_test = pd.read_csv('cleaned_test_data.csv')

In [3]:
tl_comments_train = comments_train[comments_train['Language'] =='tl']
tl_comments_test = comments_test[comments_test['Language'] =='tl']

In [4]:
tl_comments_train

Unnamed: 0,Comment,Language,Sentiment
202,manyak,tl,-1
203,marunong makisama klase,tl,1
205,galing niyo po mangbara sana ganun po pagtuturo,tl,-1
206,bobo mag turo alam mong manyak makatingin pala...,tl,-1
207,nakikinig opinyon,tl,1
...,...,...,...
379,galit babae,tl,-1
380,mahina boses walang gamit tinuturo trabaho,tl,-1
381,maganda sulat,tl,1
382,laging maaga klase kahangahanga galing pagtuturo,tl,1


In [5]:
filcon = pd.read_csv('filcon.csv')
filcon

Unnamed: 0,Part of Speech,English,Filipino,"POS, Fil",Positive Score,Negative Score
0,a,zeroth,0,"a,0",0.000,0.0
1,n,abbe,Abbé,"n,Abbé",0.125,0.0
2,n,activism,Aktibismo,"n,Aktibismo",0.000,0.0
3,a,german,Aleman,"a,Aleman",0.000,0.0
4,n,german,Aleman,"n,Aleman",0.000,0.0
...,...,...,...,...,...,...
22375,n,genet,zhena,"n,zhena",0.000,0.0
22376,n,zinnia,zinya,"n,zinya",0.000,0.0
22377,n,zoophyte,zoopayt,"n,zoopayt",0.000,0.0
22378,a,withered,zowto,"a,zowto",0.000,0.0


In [6]:
filcondf = pd.DataFrame(filcon, columns= ['Filipino', 'Positive Score', 'Negative Score'])
filcondf

Unnamed: 0,Filipino,Positive Score,Negative Score
0,0,0.000,0.0
1,Abbé,0.125,0.0
2,Aktibismo,0.000,0.0
3,Aleman,0.000,0.0
4,Aleman,0.000,0.0
...,...,...,...
22375,zhena,0.000,0.0
22376,zinya,0.000,0.0
22377,zoopayt,0.000,0.0
22378,zowto,0.000,0.0


In [7]:
import nltk
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
def tokenize_text(text):   
    return w_tokenizer.tokenize(text)

tl_comments_train['Tokenized Comments'] = comments_train['Comment'].apply(lambda x: tokenize_text(x))
tl_comments_test['Tokenized Comments'] = comments_test['Comment'].apply(lambda x: tokenize_text(x))
tl_comments_train

Unnamed: 0,Comment,Language,Sentiment,Tokenized Comments
202,manyak,tl,-1,[manyak]
203,marunong makisama klase,tl,1,"[marunong, makisama, klase]"
205,galing niyo po mangbara sana ganun po pagtuturo,tl,-1,"[galing, niyo, po, mangbara, sana, ganun, po, ..."
206,bobo mag turo alam mong manyak makatingin pala...,tl,-1,"[bobo, mag, turo, alam, mong, manyak, makating..."
207,nakikinig opinyon,tl,1,"[nakikinig, opinyon]"
...,...,...,...,...
379,galit babae,tl,-1,"[galit, babae]"
380,mahina boses walang gamit tinuturo trabaho,tl,-1,"[mahina, boses, walang, gamit, tinuturo, trabaho]"
381,maganda sulat,tl,1,"[maganda, sulat]"
382,laging maaga klase kahangahanga galing pagtuturo,tl,1,"[laging, maaga, klase, kahangahanga, galing, p..."


In [8]:
def filconanalysis(comment):
    pos_sentiment = 0.0
    neg_sentiment = 0.0

    word_pos_sentiment = 0.0
    word_neg_sentiment = 0.0

    total = 0
    for word in comment:
        total = 0.0
        word_pos_sentiment = 0
        word_neg_sentiment = 0
        for row in filcondf.itertuples():
            if word == row[1]:
                total += 1
                word_pos_sentiment += float(row[2])
                word_neg_sentiment += float(row[3])

        if total != 0:
            pos_sentiment += word_pos_sentiment/total
            neg_sentiment += word_neg_sentiment/total

    if pos_sentiment > neg_sentiment:
        return 1
    if pos_sentiment < neg_sentiment:
        return -1
    else:
        return 0

tl_comments_train['Lexicon Sentiment'] = tl_comments_train['Tokenized Comments'].apply(filconanalysis)
tl_comments_test['Lexicon Sentiment'] = tl_comments_test['Tokenized Comments'].apply(filconanalysis)
tl_comments_train

Unnamed: 0,Comment,Language,Sentiment,Tokenized Comments,Lexicon Sentiment
202,manyak,tl,-1,[manyak],0
203,marunong makisama klase,tl,1,"[marunong, makisama, klase]",1
205,galing niyo po mangbara sana ganun po pagtuturo,tl,-1,"[galing, niyo, po, mangbara, sana, ganun, po, ...",1
206,bobo mag turo alam mong manyak makatingin pala...,tl,-1,"[bobo, mag, turo, alam, mong, manyak, makating...",1
207,nakikinig opinyon,tl,1,"[nakikinig, opinyon]",0
...,...,...,...,...,...
379,galit babae,tl,-1,"[galit, babae]",-1
380,mahina boses walang gamit tinuturo trabaho,tl,-1,"[mahina, boses, walang, gamit, tinuturo, trabaho]",-1
381,maganda sulat,tl,1,"[maganda, sulat]",1
382,laging maaga klase kahangahanga galing pagtuturo,tl,1,"[laging, maaga, klase, kahangahanga, galing, p...",1


In [9]:
tl_comments_train.drop(["Tokenized Comments"], axis='columns', inplace=True) #drops columns
tl_comments_test.drop(["Tokenized Comments"], axis='columns', inplace=True) #drops columns
tl_comments_train

Unnamed: 0,Comment,Language,Sentiment,Lexicon Sentiment
202,manyak,tl,-1,0
203,marunong makisama klase,tl,1,1
205,galing niyo po mangbara sana ganun po pagtuturo,tl,-1,1
206,bobo mag turo alam mong manyak makatingin pala...,tl,-1,1
207,nakikinig opinyon,tl,1,0
...,...,...,...,...
379,galit babae,tl,-1,-1
380,mahina boses walang gamit tinuturo trabaho,tl,-1,-1
381,maganda sulat,tl,1,1
382,laging maaga klase kahangahanga galing pagtuturo,tl,1,1


In [19]:
tl_comments_train.to_csv('tl_comments_lexicon_tags_train.csv', index=False)

In [20]:
tl_comments_test.to_csv('tl_comments_lexicon_tags_test.csv', index=False)

## Testing Accuracy

In [23]:
tl_train_test = pd.concat([tl_comments_train, tl_comments_test])
tl_train_test

Unnamed: 0,Comment,Language,Sentiment,Lexicon Sentiment
202,manyak,tl,-1,0
203,marunong makisama klase,tl,1,1
205,galing niyo po mangbara sana ganun po pagtuturo,tl,-1,1
206,bobo mag turo alam mong manyak makatingin pala...,tl,-1,1
207,nakikinig opinyon,tl,1,0
...,...,...,...,...
51,patas magbigay grado,tl,1,1
54,maitim balat,tl,0,0
55,masarap pakinggan yung boses,tl,1,1
58,nakaupo lang sya madals pagnagtututro,tl,-1,0


In [24]:
accuracy_lexicon = accuracy_score(tl_train_test['Sentiment'], tl_train_test['Lexicon Sentiment'])
correct_lexicon = accuracy_score(tl_train_test['Sentiment'], tl_train_test['Lexicon Sentiment'], normalize=False)

print('Lexicon Sentiments')
print('Accuracy Score: ', accuracy_lexicon)
print('Count of Correctly Classified Comments: ', correct_lexicon, '/ 231')

Lexicon Sentiments
Accuracy Score:  0.5263157894736842
Count of Correctly Classified Comments:  80 / 231


In [25]:
print(classification_report(tl_train_test['Sentiment'], tl_train_test['Lexicon Sentiment']))

              precision    recall  f1-score   support

          -1       0.79      0.44      0.57        86
           0       0.12      0.50      0.20         8
           1       0.54      0.66      0.59        58

    accuracy                           0.53       152
   macro avg       0.48      0.53      0.45       152
weighted avg       0.66      0.53      0.56       152

