In [None]:
# This command installs a Python package directly from a GitHub repository.
# It uses pip to install the Tagalog Stemmer package from the repository hosted at the given URL.
!pip install git+https://github.com/andrianllmm/tagalog-stemmer.git@main
!pip install langdetect
!pip install googletrans==4.0.0-rc1


Collecting git+https://github.com/andrianllmm/tagalog-stemmer.git@main
  Cloning https://github.com/andrianllmm/tagalog-stemmer.git (to revision main) to /tmp/pip-req-build-mbm9ws0n
  Running command git clone --filter=blob:none --quiet https://github.com/andrianllmm/tagalog-stemmer.git /tmp/pip-req-build-mbm9ws0n
  Resolved https://github.com/andrianllmm/tagalog-stemmer.git to commit b5babfd4caebf8a8f480f8adab9f1c97f42a3baa
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: tglstemmer
  Building wheel for tglstemmer (pyproject.toml) ... [?25l[?25hdone
  Created wheel for tglstemmer: filename=tglstemmer-0.0.1-py3-none-any.whl size=146663 sha256=2407416b9856af2477a88f468adfcd879f6bac9c64ad3d898d883fe1105f2a3e
  Stored in directory: /tmp/pip-ephem-wheel-cache-pd19p0_j/wheels/8b/a1/58/0ac74f560df0e1894833f3d6d2b6efcf76b8cf603ebd8

In [None]:
# Importing necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis

# Importing visualization libraries
import matplotlib.pyplot as plt  # For plotting and data visualization
from matplotlib import style  # For setting plot styles
import seaborn as sns  # For advanced data visualization (heatmaps, categorical plots)

# Importing Natural Language Toolkit (nltk) for NLP operations
import nltk
nltk.download('punkt')  # Downloading 'punkt' tokenizer for sentence and word tokenization

# Importing functions from nltk for stopwords and tokenization
from nltk.corpus import stopwords  # For using stopwords (commonly removed words like 'and', 'the')
from nltk.tokenize import word_tokenize, sent_tokenize  # Functions for word and sentence tokenization

# Importing stemmers (for reducing words to their root form)
from nltk.stem import PorterStemmer, LancasterStemmer  # Porter and Lancaster stemming algorithms

# Importing WordNet lemmatizer (for converting words to their base form using linguistic rules)
from nltk.stem import WordNetLemmatizer

# Importing Tagalog stemmer from the 'tglstemmer' package (assumed installed in earlier command)
from tglstemmer import stemmer  # Tagalog language stemmer

# Importing vectorizers from scikit-learn for creating document-term matrices (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  # TF-IDF and Count vectorization

#Downloading 'stopwords'
nltk.download('stopwords')

# Defining a set of stop words for use in text processing
stop_words = set(nltk.corpus.stopwords.words('english'))  # Stop words in English

# Downloading 'wordnet' corpus for lemmatization
nltk.download('wordnet')

# Importing regular expression library for text manipulation and pattern matching
import re  # For regex operations such as cleaning text data

from langdetect import detect

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Reading Tagalog stopwords from a text file
# 'tagalog_stop_words.txt' is assumed to be a file containing a list of stopwords in Tagalog.
# The file is read as a DataFrame using pandas.
tagalog_stopwords = pd.read_csv("tagalog_stop_words.txt")

# Extracting the 'stopwords' column from the DataFrame and converting it to a Python list
# Assuming the file has a column named 'stopwords' that contains the actual stop words.
tagalog_stopwords = tagalog_stopwords['stopwords'].tolist()

# Output the list of Tagalog stopwords
tagalog_stopwords


['ako',
 'sa',
 'akin',
 'ko',
 'aking',
 'sarili',
 'kami',
 'atin',
 'ang',
 'aming',
 'amin',
 'ating',
 'ka',
 'iyong',
 'iyo',
 'inyong',
 'siya',
 'kanya',
 'mismo',
 'ito',
 'nito',
 'kanyang',
 'sila',
 'nila',
 'kanila',
 'kanilang',
 'kung',
 'ano',
 'alin',
 'sino',
 'kanino',
 'na',
 'mga',
 'iyon',
 'am',
 'ay',
 'maging',
 'naging',
 'mayroon',
 'may',
 'nagkaroon',
 'pagkakaroon',
 'gumawa',
 'ginagawa',
 'ginawa',
 'paggawa',
 'ibig',
 'dapat',
 'maaari',
 'marapat',
 'kong',
 'ikaw',
 'tayo',
 'hindi',
 'namin',
 'gusto',
 'nais',
 'niyang',
 'nilang',
 'niya',
 'huwag',
 'ginawang',
 'gagawin',
 'maaaring',
 'sabihin',
 'narito',
 'kapag',
 'ni',
 'nasaan',
 'bakit',
 'paano',
 'kailangan',
 'walang',
 'katiyakan',
 'isang',
 'at',
 'pero',
 'o',
 'dahil',
 'bilang',
 'hanggang',
 'habang',
 'ng',
 'pamamagitan',
 'para',
 'tungkol',
 'laban',
 'pagitan',
 'panahon',
 'bago',
 'pagkatapos',
 'itaas',
 'ibaba',
 'mula',
 'pataas',
 'pababa',
 'palabas',
 'ibabaw',
 'il

In [None]:
# Combining English and Tagalog stopwords into a single list
# The stopwords from nltk (in English) are combined with the Tagalog stopwords previously loaded from a file.
# This allows you to remove both English and Tagalog stopwords from text in one step.
all_stopwords = stopwords.words('english') + tagalog_stopwords

# The resulting 'all_stopwords' will contain stopwords from both languages.


In [None]:
# Reading the English dataset from a CSV file
# 'AI in edu dataset - Sheet1.csv' is assumed to contain text data in English.
# The dataset is loaded as a pandas DataFrame.
dataset = pd.read_csv('AI in edu dataset - Sheet1.csv')

dataset = dataset.drop(dataset[dataset['Sentiment'] == 'Neutral' ].index)
dataset = dataset.drop(dataset[dataset['Sentiment'] == 'neutral' ].index)

dataset.dropna(subset = ['Sentiment'], inplace=True)

# Converting all data in the dataset to string type
# Ensures that all values in the DataFrame are treated as text, regardless of their original format.
dataset = dataset.astype(str)

In [None]:
dataset['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Positive,208
Negative,162


In [None]:
import random
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')
# Function to replace words with synonyms
def synonym_replacement(sentence, n=2):
    words = sentence.split()
    new_sentence = words.copy()

    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))  # Only consider words that have synonyms

    # Randomly select 'n' words to replace with their synonyms
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if len(synonyms) > 0:
            synonym = synonyms[0].lemmas()[0].name()  # Choose the first synonym
            new_sentence = [synonym if word == random_word else word for word in new_sentence]
            num_replaced += 1
        if num_replaced >= n:
            break

    print('Done SR')
    return ' '.join(new_sentence)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from googletrans import Translator

# Function for back-translation
def back_translate(sentence, lang='fr'):
    translator = Translator()

    # Translate the sentence to the target language
    translated = translator.translate(sentence, src='en', dest=lang).text

    # Translate it back to English
    back_translated = translator.translate(translated, src=lang, dest='en').text

    print('Done Translate')
    return back_translated


In [None]:
# Create a copy of the original dataset
augmented_dataset = dataset.copy()

# Apply synonym replacement to each row and append to the 'Content' column
augmented_dataset['Content_Augmented_Synonym'] = augmented_dataset['Content'].apply(lambda x: synonym_replacement(x, n=1))

# Apply back-translation and append to the 'Content' column
augmented_dataset['Content_Augmented_BackTranslate'] = augmented_dataset['Content'].apply(lambda x: back_translate(x, lang='fr'))


Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR
Done SR


In [None]:
df1 = augmented_dataset[['Content_Augmented_Synonym','Sentiment']]

df2 = augmented_dataset[['Content_Augmented_BackTranslate','Sentiment']]
df1.rename(columns = {'Content_Augmented_Synonym':'Content'}, inplace = True)
df2.rename(columns = {'Content_Augmented_BackTranslate':'Content'}, inplace = True)

dataset = pd.concat([df1, df2, dataset])
dataset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.rename(columns = {'Content_Augmented_Synonym':'Content'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.rename(columns = {'Content_Augmented_BackTranslate':'Content'}, inplace = True)


Unnamed: 0,Content,Sentiment
0,The education system that AI will destroy will...,Negative
1,AI tool sare very helpful but you need to use ...,Positive
2,AI and teachers should join forces in angstrom...,Positive
3,Kids that learn how to use the AI will realize...,Negative
4,"But here’s the thing, if we embrace the AI, ho...",Negative
...,...,...
413,"AI be useful, pero dapat may balance sa paggam...",Negative
414,"Hindi pa tayo masyadong advanced sa Pilipinas,...",Negative
415,Kailangan lang sodium maayos ang paggamit ng A...,Negative
429,AI can be a powerful tool to enhance learning ...,Negative


In [None]:
dataset = pd.concat([df1, df2, dataset])
dataset

Unnamed: 0,Content,Sentiment,Platform
0,The education system that AI will destroy will...,Negative,
1,AI tool sare very helpful but you need to use ...,Positive,
2,AI and teachers should join forces in angstrom...,Positive,
3,Kids that learn how to use the AI will realize...,Negative,
4,"But here’s the thing, if we embrace the AI, ho...",Negative,
...,...,...,...
413,"AI is useful, pero dapat may balance sa paggam...",Negative,
414,"Hindi pa tayo masyadong advanced sa Pilipinas,...",Negative,
415,Kailangan lang na maayos ang paggamit ng AI pa...,Negative,
429,AI can be a powerful tool to enhance learning ...,Negative,


In [None]:
dataset.to_csv('augmented.csv',index=0) ##WE WILL SAVE AUGMENTED DATASET IN OUR LOCAL STORAGE TO MAKE REPLICATION OR TESTING EASIER IN THE FUTURE SO THAT AUGMENTATION WILL NOT BE NEEDED TO RUN AGAIN

In [None]:
dataset.count()

Unnamed: 0,0
Content,1110
Sentiment,1110
Platform,370


In [None]:
def clean_text(text):

    #Determine the language of the sentence
    lang = detect(text)


    # Remove any character that is not a letter or whitespace using regex
    # This step removes punctuation, numbers, and special characters, leaving only letters and spaces.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Initialize the WordNet Lemmatizer (for reducing words to their base form)
    le = WordNetLemmatizer()

    # Tokenize the text into individual words
    word_tokens = word_tokenize(text)

    # Lemmatize each word token and remove stopwords (words in 'all_stopwords') and tokens shorter than 3 characters
    tokens = [le.lemmatize(w) for w in word_tokens if w not in all_stopwords and len(w) > 3]

    # Stemming based on language
    if lang == 'en':
        # For other languages (assumed to be English), use the PorterStemmer for stemming
        ps = PorterStemmer()
        stemmed_tokens = [ps.stem(token) for token in tokens]

    else:
        # If the language is English, use the Tagalog stemmer ('stemmer' assumed to be the English stemmer)
        stemmed_tokens = [stemmer.get_stem(token) for token in tokens]

    # Join the processed tokens back into a single string
    cleaned_text = " ".join(stemmed_tokens)

    # Return the cleaned and processed text
    return cleaned_text


In [None]:
# Applying the 'clean_text' function to the 'Content' column of the dataset
# For each row in the 'Content' column, it calls the 'clean_text' function, specifying "english" as the language.
dataset['Content'] = dataset['Content'].apply(lambda x: clean_text(x))


In [None]:
# Initializing the TfidfVectorizer
# This vectorizer converts the text into a matrix of TF-IDF features.
# 'stop_words=all_stopwords' specifies that the TF-IDF vectorizer should ignore both English and Tagalog stopwords.
# 'max_features=1000' limits the number of features (terms) to 1000, keeping only the most important ones based on TF-IDF scores.
vect = TfidfVectorizer(stop_words=all_stopwords, max_features=1000)

# Applying the vectorizer to the 'Content' column of the English dataset
# The fit_transform method learns the vocabulary and creates the document-term matrix (DTM) based on TF-IDF scores.
vect_text = vect.fit_transform(dataset['Content'])


In [None]:
# Printing the shape of the TF-IDF matrix
# This will show the dimensions of the matrix, where the first number is the number of documents and the second number is the number of features (terms).
print(vect_text.shape)

# Printing the TF-IDF matrix
# This will display the sparse matrix representation of the TF-IDF features.
# Each row corresponds to a document, and each column corresponds to a term, with values representing the TF-IDF scores.
print(vect_text)


(1110, 1000)
  (0, 991)	0.3514417421535532
  (0, 47)	0.40093425990231
  (0, 140)	0.33681888659415954
  (0, 13)	0.31056237756196853
  (0, 230)	0.41526267129221534
  (0, 897)	0.5526392159704392
  (0, 247)	0.16730594486754566
  (1, 939)	0.3191882648887706
  (1, 874)	0.2930710561317415
  (1, 102)	0.33466371656565197
  (1, 382)	0.43096010153929615
  (1, 788)	0.26477061942498026
  (1, 350)	0.26477061942498026
  (1, 737)	0.2853731479103198
  (1, 409)	0.3263713979783511
  (1, 592)	0.36139130735985536
  (1, 370)	0.17046267447413638
  (1, 935)	0.1633325139055952
  (2, 111)	0.3210613031038277
  (2, 883)	0.1296342118111935
  (2, 255)	0.37605929251133546
  (2, 800)	0.33309573562451394
  (2, 251)	0.3248190178593697
  (2, 155)	0.32881932905164113
  (2, 318)	0.3856140619566219
  :	:
  (1106, 678)	0.6873471346327641
  (1107, 84)	0.5330813107041159
  (1107, 387)	0.5626451705640386
  (1107, 952)	0.3891941441390911
  (1107, 336)	0.303934430503286
  (1107, 902)	0.39421632175929705
  (1108, 260)	0.491591444

In [None]:
##Text Classification MODEL COMPARISON

In [None]:
#Splitting of dataset
X_data, X_unseen, y_data, y_unseen = train_test_split(vect_text, dataset['Sentiment'], test_size=0.1, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=0)

In [None]:
#Training and evaluation of Model (Decision Tree)
from sklearn import tree
tree = tree.DecisionTreeClassifier()
tree.fit(X_train,y_train)

y_pred = tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
y_pred_unseen = tree.predict(X_unseen)
accuracy_unseen = accuracy_score(y_unseen,y_pred_unseen)
print(f"Test Accuracy: {accuracy}")
print(f"Unseen Accuracy: {accuracy_unseen}")
print(classification_report(y_test, y_pred))
print(classification_report(y_unseen, y_pred_unseen))

Test Accuracy: 0.905
Unseen Accuracy: 0.8918918918918919
              precision    recall  f1-score   support

    Negative       0.93      0.86      0.89        92
    Positive       0.89      0.94      0.91       108

    accuracy                           0.91       200
   macro avg       0.91      0.90      0.90       200
weighted avg       0.91      0.91      0.90       200

              precision    recall  f1-score   support

    Negative       0.93      0.83      0.88        52
    Positive       0.86      0.95      0.90        59

    accuracy                           0.89       111
   macro avg       0.90      0.89      0.89       111
weighted avg       0.90      0.89      0.89       111



In [None]:
#Training and evaluation of Model (Logistic Regression)
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression().fit(X_train,y_train)

y_pred = LR.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
y_pred_unseen = LR.predict(X_unseen)
accuracy_unseen = accuracy_score(y_unseen,y_pred_unseen)
print(f"Test Accuracy: {accuracy}")
print(f"Unseen Accuracy: {accuracy_unseen}")
print(classification_report(y_test, y_pred))
print(classification_report(y_unseen, y_pred_unseen))

Test Accuracy: 0.925
Unseen Accuracy: 0.8648648648648649
              precision    recall  f1-score   support

    Negative       0.94      0.89      0.92        92
    Positive       0.91      0.95      0.93       108

    accuracy                           0.93       200
   macro avg       0.93      0.92      0.92       200
weighted avg       0.93      0.93      0.92       200

              precision    recall  f1-score   support

    Negative       0.85      0.87      0.86        52
    Positive       0.88      0.86      0.87        59

    accuracy                           0.86       111
   macro avg       0.86      0.86      0.86       111
weighted avg       0.87      0.86      0.86       111



In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

y_pred = rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
y_pred_unseen = rfc.predict(X_unseen)
accuracy_unseen = accuracy_score(y_unseen,y_pred_unseen)
print(f"Test Accuracy: {accuracy}")
print(f"Unseen Accuracy: {accuracy_unseen}")
print(classification_report(y_test, y_pred))
print(classification_report(y_unseen, y_pred_unseen))

Test Accuracy: 0.94
Unseen Accuracy: 0.918918918918919
              precision    recall  f1-score   support

    Negative       0.97      0.90      0.93        92
    Positive       0.92      0.97      0.95       108

    accuracy                           0.94       200
   macro avg       0.94      0.94      0.94       200
weighted avg       0.94      0.94      0.94       200

              precision    recall  f1-score   support

    Negative       0.91      0.92      0.91        52
    Positive       0.93      0.92      0.92        59

    accuracy                           0.92       111
   macro avg       0.92      0.92      0.92       111
weighted avg       0.92      0.92      0.92       111

