In [1]:
import pandas as pd
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import warnings
warnings.filterwarnings('ignore')

### Bi-gram Model Training

In [2]:
one_meelyun_sentences = pd.read_csv('one_meelyun_sentences.csv', header=None)

In [3]:
one_meelyun_sentences.columns = ["sentences"] #sets column name to "sentences"
one_meelyun_sentences_train = one_meelyun_sentences.sample(frac = 0.5, random_state = 2) #halves the dataset
one_meelyun_sentences_train['sentences'] = one_meelyun_sentences_train['sentences'].str.lower() #makes all sentences lowercase
one_meelyun_sentences_train['sentences'] = one_meelyun_sentences_train['sentences'].str.replace('\d+', '', regex=True) #removes numbers
one_meelyun_sentences_train['sentences'] = one_meelyun_sentences_train['sentences'].str.replace('[^\w\s]', ' ', regex=True) #removes punctuation marks

In [4]:
one_meelyun_sentences_train

Unnamed: 0,sentences
309190,these are the most accurate and precise pipettes
240953,this is followed by highschool years
609687,several new plants were introduced to europe i...
757042,others work steadily for a week or a month and...
788110,colossal cyclone swirling near martian north p...
...,...
306066,over were captured including general bevilaq...
396337,the aircraft is now under long term restoratio...
855606,republished in part by dover in
558290,he sat there and storyboarded the whole movie ...


In [5]:
def basic_clean(text):
  """
  A simple function to clean up the data. All the words that
  are not designated as a stop word is then lemmatized after
  encoding and basic regex parsing are performed.
  """
  wnl = nltk.stem.WordNetLemmatizer()
  stopwords = nltk.corpus.stopwords.words('english')
  text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
  words = re.sub(r'[^\w\s]', '', text).split()
  return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [6]:
words = basic_clean(''.join(str(one_meelyun_sentences_train['sentences'].tolist())))

In [7]:
bigram_model = (pd.DataFrame(nltk.ngrams(words, 2))) # makes the bigrams (2 cols)
bigram_model['bigrams'] = bigram_model.iloc[:, 0] + ' ' + bigram_model.iloc[:, 1] # combines col1 col2 to one col
bigram_model = bigram_model.rename({0: "x", 1: "y"}, axis='columns') # renames col1 col2
bigram_model.drop(["x", "y"], axis='columns', inplace=True) #drops col1 col2

In [8]:
bigram_model.drop_duplicates(inplace=True) #drops duplicate rows
bigram_model

Unnamed: 0,bigrams
0,accurate precise
1,precise pipette
2,pipette followed
3,followed highschool
4,highschool year
...,...
6521824,meeting obiang
6521825,obiang extremely
6521827,pleased hopeful
6521828,hopeful relationship


### Unigram Model

In [9]:
unigram_model = pd.read_csv('unigram.csv', header=None)
unigram_model.rename(columns = {0:'unigrams'}, inplace = True)
unigram_model

Unnamed: 0,unigrams
0,the
1,of
2,and
3,to
4,a
...,...
9995,varieties
9996,arbor
9997,mediawiki
9998,configurations


### Bigram and Unigram Scoring

get bigrams of the comments

In [10]:
# separate comments by language
comments_train = pd.read_csv('cleaned_train_data.csv')
comments_test = pd.read_csv('cleaned_test_data.csv')

tlen_comments_train = comments_train[comments_train['Language'] =='tlen']
tlen_comments_test = comments_test[comments_test['Language'] =='tlen']

tlen_comments_train

Unnamed: 0,Comment,Language,Sentiment
110,favorite line competitive guys,tlen,1
129,puro freecut hehehe saktong freecut lang,tlen,-1
131,wala akong tiwala opinion not supported claims...,tlen,-1
133,knowledgeable maraming visual aids slides,tlen,1
153,mahilig phrases play ear word cascade :),tlen,0
...,...,...,...
660,malimit siyang magtanong bagsak recitation,tlen,-1
661,tas quiz multiple choice kaso wala projector i...,tlen,-1
662,lagi sineset mood classroom ambiance,tlen,1
663,malabo magbigay requirements,tlen,-1


In [11]:
tlen_comments_train['Comment'] = tlen_comments_train['Comment'].str.replace(":\)", "h1a2p3p4y5", regex=True)
tlen_comments_train['Comment'] = tlen_comments_train['Comment'].str.replace(":\(", "s6a7d8", regex=True)

tlen_comments_test['Comment'] = tlen_comments_test['Comment'].str.replace(":\)", "h1a2p3p4y5", regex=True)
tlen_comments_test['Comment'] = tlen_comments_test['Comment'].str.replace(":\(", "s6a7d8", regex=True)

In [12]:
# gets the bigrams of the comment
def comment_bigrams(text):
    n_grams = ngrams(word_tokenize(text), 2)
    return [ ' '.join(grams) for grams in n_grams]     

In [13]:
tlen_comments_train['Bigrams'] = tlen_comments_train['Comment'].apply(comment_bigrams)
tlen_comments_test['Bigrams'] = tlen_comments_test['Comment'].apply(comment_bigrams)
tlen_comments_train

Unnamed: 0,Comment,Language,Sentiment,Bigrams
110,favorite line competitive guys,tlen,1,"[favorite line, line competitive, competitive ..."
129,puro freecut hehehe saktong freecut lang,tlen,-1,"[puro freecut, freecut hehehe, hehehe saktong,..."
131,wala akong tiwala opinion not supported claims...,tlen,-1,"[wala akong, akong tiwala, tiwala opinion, opi..."
133,knowledgeable maraming visual aids slides,tlen,1,"[knowledgeable maraming, maraming visual, visu..."
153,mahilig phrases play ear word cascade h1a2p3p4y5,tlen,0,"[mahilig phrases, phrases play, play ear, ear ..."
...,...,...,...,...
660,malimit siyang magtanong bagsak recitation,tlen,-1,"[malimit siyang, siyang magtanong, magtanong b..."
661,tas quiz multiple choice kaso wala projector i...,tlen,-1,"[tas quiz, quiz multiple, multiple choice, cho..."
662,lagi sineset mood classroom ambiance,tlen,1,"[lagi sineset, sineset mood, mood classroom, c..."
663,malabo magbigay requirements,tlen,-1,"[malabo magbigay, magbigay requirements]"


In [14]:
def bigram_unigram_scoring(comment_bigrams, comment, language):    
        
    #finds if comment bigram is in bigram model
    frequency = [] # if the bigram can be found in the model; 1 if frequent, 0 if infrequent; places them in list
    split_comment_bigrams = [] # separates each bigram into strings
    
    for bigram in comment_bigrams:
        if bigram_model['bigrams'].str.contains(bigram).any():
            frequency.append(1)
        else: 
            frequency.append(0)
        split_comment_bigrams.append(bigram.split())
      
    
    #scores words in bigram        
    word_score = [0] * (len(frequency)+1)
    
    for i in range(len(frequency) - 1):
        if frequency[i] == 1 and frequency[i+1] == 1:
            word_score[i] = word_score[i] + 2
            word_score[i+1] = word_score[i+1] + 2
            word_score[i+2] = word_score[i+2] + 2
        elif frequency[i] == 1 and frequency[i+1] == 0:
            word_score[i] = word_score[i] + 1
            word_score[i+1] = word_score[i+1] + 1
            word_score[i+2] = word_score[i+2] - 1
        elif frequency[i] == 0 and frequency[i+1] == 1:
            word_score[i] = word_score[i] - 1 # -1 dito pero -2 sa model 2 sa infrequent part
            word_score[i+1] = word_score[i+1] + 1
            word_score[i+2] = word_score[i+2] + 1
        else:
            word_score[i] = word_score[i] - 1
            word_score[i+1] = word_score[i+1] - 1
            word_score[i+2] = word_score[i+2] - 1  
                 

    #scores unigrams
    comment_unigrams = comment.split()
    
    for x in range(len(word_score)): #goes through the word_scores
        if word_score[x] <= 0: 
            if unigram_model['unigrams'].str.contains(comment_unigrams[x]).any():
                word_score[x] = 1
            else:
                word_score[x] = 0
    
    en_words = []
    tl_words = []
    
    for x in range(len(word_score)):
        if word_score[x] > 0: #if word is english
            en_words.append(comment_unigrams[x])
        elif word_score[x] == 0:
            tl_words.append(comment_unigrams[x])
            

    #returns word list for each language
    if language == 'en':
        return en_words
    elif language == 'tl':
        return tl_words

Saves English words

In [15]:
tlen_comments_train['En_words'] = tlen_comments_train.apply(lambda x: bigram_unigram_scoring(x['Bigrams'], x['Comment'], 'en'), axis=1)
tlen_comments_test['En_words'] = tlen_comments_test.apply(lambda x: bigram_unigram_scoring(x['Bigrams'], x['Comment'], 'en'), axis=1)

Saves Filipino words

In [16]:
tlen_comments_train['Tl_words'] = tlen_comments_train.apply(lambda x: bigram_unigram_scoring(x['Bigrams'], x['Comment'], 'tl'), axis=1)
tlen_comments_test['Tl_words'] = tlen_comments_test.apply(lambda x: bigram_unigram_scoring(x['Bigrams'], x['Comment'], 'tl'), axis=1)
tlen_comments_train

Unnamed: 0,Comment,Language,Sentiment,Bigrams,En_words,Tl_words
110,favorite line competitive guys,tlen,1,"[favorite line, line competitive, competitive ...","[favorite, line, competitive, guys]",[]
129,puro freecut hehehe saktong freecut lang,tlen,-1,"[puro freecut, freecut hehehe, hehehe saktong,...",[lang],"[puro, freecut, hehehe, saktong, freecut]"
131,wala akong tiwala opinion not supported claims...,tlen,-1,"[wala akong, akong tiwala, tiwala opinion, opi...","[opinion, not, supported, claims, studies]","[wala, akong, tiwala]"
133,knowledgeable maraming visual aids slides,tlen,1,"[knowledgeable maraming, maraming visual, visu...","[visual, aids, slides]","[knowledgeable, maraming]"
153,mahilig phrases play ear word cascade h1a2p3p4y5,tlen,0,"[mahilig phrases, phrases play, play ear, ear ...","[phrases, play, ear, word]","[mahilig, cascade, h1a2p3p4y5]"
...,...,...,...,...,...,...
660,malimit siyang magtanong bagsak recitation,tlen,-1,"[malimit siyang, siyang magtanong, magtanong b...",[],"[malimit, siyang, magtanong, bagsak, recitation]"
661,tas quiz multiple choice kaso wala projector i...,tlen,-1,"[tas quiz, quiz multiple, multiple choice, cho...","[tas, quiz, multiple, choice, projector, ident...","[kaso, wala]"
662,lagi sineset mood classroom ambiance,tlen,1,"[lagi sineset, sineset mood, mood classroom, c...","[mood, classroom]","[lagi, sineset, ambiance]"
663,malabo magbigay requirements,tlen,-1,"[malabo magbigay, magbigay requirements]",[requirements],"[malabo, magbigay]"


In [28]:
tlen_comments_train.to_csv('tlen_comments_cspd_output_train.csv', index=False)

In [29]:
tlen_comments_test.to_csv('tlen_comments_cspd_output_test.csv', index=False)