<a href="https://colab.research.google.com/github/hannaharham/Sentiment-Analysis-SDG-UM/blob/main/FYP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1) Data Preparation

##1.1 : Data Integration : Merge both datasets

In [1]:
import pandas as pd

In [2]:
# Load Instagram OCR data
ig_df = pd.read_csv("instagram_confessions_ocr.csv")

# Load Google Form responses
survey_df = pd.read_csv("UM Student Responses - Form Responses 1.csv")

In [3]:
#inspect columns
print(ig_df.columns)
print(survey_df.columns)

Index(['image_name', 'raw_text'], dtype='object')
Index(['Timestamp', 'Which faculty are you from ',
       'What is your current year of study?',
       'Have you heard of the Sustainable Development Goals (SDGs) before?  ',
       'How would you rate your awareness of sustainability and SDGs? ',
       'Have you ever participated in any sustainability-related event/program at UM? ',
       'If yes, which type of event?  ',
       'How do you feel about UM's sustainability efforts on campus? (Good? Bad?  Enough/Not Enough? Why?) ',
       'Have you joined any SUSTAINABILITY ACTIVITY/INITIATIVES in UM? How was your experience? (eg : recycling programs, awareness campaigns, energy-saving practices, or any SDG-related events)',
       'What do you think UM is doing WELL in terms of sustainability and SDG efforts?  (strengths)',
       'What areas do you think UM needs to IMPROVE to become a more sustainable campus?  ',
       'Do you feel sustainability and SDG elements are incorporated 

In [4]:
#prepare ig dataset
ig_clean = ig_df[["raw_text"]].copy()
ig_clean.rename(columns={"raw_text": "text"}, inplace=True)
ig_clean["source"] = "Instagram"
ig_clean.head()

Unnamed: 0,text,source
0,#UM019783\n\nUm bila nak buat esport centre. T...,Instagram
1,#UM019634\nAfter a few months aku kenal sorang...,Instagram
2,#UM019766\nU deserve someone better than him t...,Instagram
3,"#UM019787\n\nSaje jer nak cakap, fakulti medic...",Instagram
4,"#UM019748\n\nCeyy terpaling tegur orang lain, ...",Instagram


In [7]:
#check column names in survey csv
print(survey_df.columns.tolist())

['Timestamp', 'Which faculty are you from ', 'What is your current year of study?', 'Have you heard of the Sustainable Development Goals (SDGs) before?  ', 'How would you rate your awareness of sustainability and SDGs? ', 'Have you ever participated in any sustainability-related event/program at UM? ', 'If yes, which type of event?  ', "How do you feel about UM's sustainability efforts on campus? (Good? Bad?  Enough/Not Enough? Why?) ", 'Have you joined any SUSTAINABILITY ACTIVITY/INITIATIVES in UM? How was your experience? (eg : recycling programs, awareness campaigns, energy-saving practices, or any SDG-related events)', 'What do you think UM is doing WELL in terms of sustainability and SDG efforts?  (strengths)', 'What areas do you think UM needs to IMPROVE to become a more sustainable campus?  ', 'Do you feel sustainability and SDG elements are incorporated into your LEARNING EXPERIENCE (e.g., lectures, coursework, activities)? Why or why not?  ', 'Any additional comments/suggestio

In [8]:
#Standardize column names (removes hidden spaces/newlines)
survey_df.columns = (
    survey_df.columns
    .astype(str)
    .str.replace(r"\s+", " ", regex=True)   # collapse weird whitespace/newlines
    .str.strip()                             # trim leading/trailing spaces
)

In [9]:
#recheck
print(survey_df.columns.tolist())

['Timestamp', 'Which faculty are you from', 'What is your current year of study?', 'Have you heard of the Sustainable Development Goals (SDGs) before?', 'How would you rate your awareness of sustainability and SDGs?', 'Have you ever participated in any sustainability-related event/program at UM?', 'If yes, which type of event?', "How do you feel about UM's sustainability efforts on campus? (Good? Bad? Enough/Not Enough? Why?)", 'Have you joined any SUSTAINABILITY ACTIVITY/INITIATIVES in UM? How was your experience? (eg : recycling programs, awareness campaigns, energy-saving practices, or any SDG-related events)', 'What do you think UM is doing WELL in terms of sustainability and SDG efforts? (strengths)', 'What areas do you think UM needs to IMPROVE to become a more sustainable campus?', 'Do you feel sustainability and SDG elements are incorporated into your LEARNING EXPERIENCE (e.g., lectures, coursework, activities)? Why or why not?', 'Any additional comments/suggestions regarding s

In [10]:
#prepare survey dataset
open_ended_cols = [
    "If yes, which type of event?",
    "How do you feel about UM's sustainability efforts on campus? (Good? Bad? Enough/Not Enough? Why?)",
    "Have you joined any SUSTAINABILITY ACTIVITY/INITIATIVES in UM? How was your experience? (eg : recycling programs, awareness campaigns, energy-saving practices, or any SDG-related events)",
    "What do you think UM is doing WELL in terms of sustainability and SDG efforts? (strengths)",
    "What areas do you think UM needs to IMPROVE to become a more sustainable campus?",
    "Do you feel sustainability and SDG elements are incorporated into your LEARNING EXPERIENCE (e.g., lectures, coursework, activities)? Why or why not?",
    "Any additional comments/suggestions regarding sustainability at UM?"
]


In [11]:
survey_long = survey_df[open_ended_cols] \
    .melt(value_name="text") \
    .dropna(subset=["text"])

survey_long["source"] = "Survey"

In [13]:
#merge survey + instagram
master_df = pd.concat([ig_clean, survey_long], ignore_index=True)

Unnamed: 0,text,source,variable
0,#UM019783\n\nUm bila nak buat esport centre. T...,Instagram,
1,#UM019634\nAfter a few months aku kenal sorang...,Instagram,
2,#UM019766\nU deserve someone better than him t...,Instagram,
3,"#UM019787\n\nSaje jer nak cakap, fakulti medic...",Instagram,
4,"#UM019748\n\nCeyy terpaling tegur orang lain, ...",Instagram,


In [15]:
#remove variable column
master_df = master_df.drop(columns=["variable"], errors="ignore")
master_df.head()

Unnamed: 0,text,source
0,#UM019783\n\nUm bila nak buat esport centre. T...,Instagram
1,#UM019634\nAfter a few months aku kenal sorang...,Instagram
2,#UM019766\nU deserve someone better than him t...,Instagram
3,"#UM019787\n\nSaje jer nak cakap, fakulti medic...",Instagram
4,"#UM019748\n\nCeyy terpaling tegur orang lain, ...",Instagram


##1.2 : Data Cleaning

In [43]:
#confirm structure
master_df.info()
master_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 357 entries, 0 to 440
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   text                 357 non-null    object
 1   source               357 non-null    object
 2   matched_keywords     357 non-null    object
 3   is_relevant          357 non-null    bool  
 4   is_relevant_refined  357 non-null    bool  
dtypes: bool(2), object(3)
memory usage: 11.9+ KB


Unnamed: 0,text,source,matched_keywords,is_relevant,is_relevant_refined
0,Um bila nak buat esport centre. Tengok ukm pun...,Instagram,[support],True,True
1,After a few months aku kenal sorang lelaki ni ...,Instagram,[],False,False
2,"U deserve someone better than him tbh, yes you...",Instagram,[kk*],True,False
3,"Saje jer nak cakap, fakulti medic tu ada cours...",Instagram,"[course, fakulti, it]",True,True
4,"Ceyy terpaling tegur orang lain, lahh rupanya ...",Instagram,[],False,False


In [44]:
#Drop empty & useless rows
# Remove rows where text is NaN
master_df = master_df.dropna(subset=["text"])

# Remove rows that are only spaces or very short
master_df["text"] = master_df["text"].astype(str).str.strip()
master_df = master_df[master_df["text"].str.len() > 10]

print("Rows after removing empty/junk:", len(master_df))

Rows after removing empty/junk: 357


In [45]:
#Remove OCR artefacts (hashtags, line breaks)
import re

def clean_basic(text):
    text = re.sub(r"#UM\d+", "", text)      # remove UM hashtags
    text = re.sub(r"\n+", " ", text)        # remove line breaks
    text = re.sub(r"\s+", " ", text)        # normalize spaces
    return text.strip()

master_df["text"] = master_df["text"].apply(clean_basic)
master_df.head()

Unnamed: 0,text,source,matched_keywords,is_relevant,is_relevant_refined
0,Um bila nak buat esport centre. Tengok ukm pun...,Instagram,[support],True,True
1,After a few months aku kenal sorang lelaki ni ...,Instagram,[],False,False
2,"U deserve someone better than him tbh, yes you...",Instagram,[kk*],True,False
3,"Saje jer nak cakap, fakulti medic tu ada cours...",Instagram,"[course, fakulti, it]",True,True
4,"Ceyy terpaling tegur orang lain, lahh rupanya ...",Instagram,[],False,False


In [46]:
#Filter for sustainability relevance
#Keyword-based relevance filter

keywords = [
    # Environment / waste / cleanliness
    "sustainability","sustainable","sdg","green","environment",
    "sampah","trash","rubbish","litter","waste",
    "bersih","kotor","dirty","clean",
    "bau","busuk","hapak","smell",
    "recycle","recycling","recyclable",
    "paper","plastic","bins","tong",
    "zero waste","campaign",
    "trees","planting","energy","water","electricity",
    "flood","cuaca","panas","weather",

    # Campus & facilities
    "campus","kolej","hostel","tandas","toilet",
    "parking","bus","transport","facility","facilities","maintenance","management",
    "wifi","internet","it","teknikal","website","system","maya",
    "library","cafe","kafe","bonggol",

    # Education / learning
    "education","learning","class","course","assignment","exam","result",
    "faculty","fac","fakulti","online","on9","hybrid","odl","register","registration",
    "staf","staff","lecturer",

    # Health / wellbeing / safety
    "health","healthy","stress","stres","ppum","klinik",
    "merokok","rokok","vape","smoking","harassment","safety","mental","wellbeing",

    # Cost / access
    "mahal","murah","harga","yuran","fees","zero balance policy","poverty","injustice",
    "part-time","job","work","sapu","rider",

    # Awareness / engagement
    "awareness","efforts","improve","support","initiative",
    "program","programme","event","forum","talk","volunteer","sukarelawan","project","projek",
]


In [47]:
# Define exclusion keywords
exclude_keywords = [
    "boyfriend","girlfriend","bf","gf",
    "ex","crush","situationship","hts",
    "dating","broke up","breakup","dump",
    "aku suka","aku cinta","cinta","suka",
    "rindu","sayang",
    "curang","cheat","selingkuh",
    "couple","relationship",
    "hormone","perasaan"
]

In [48]:
#define anchor keywords
anchor_keywords = [
    "sdg","sustainability","sustainable","environment",
    "recycle","recycling","waste","zero waste","green",
    "sampah","litter","plastic","paper","bin","bins",
    "energy","water","flood","banjir"
]

In [49]:
#Special patterns (handles KK10, kk12, KK8 etc.)
kk_pattern = re.compile(r"\bkk\d*\b", re.IGNORECASE)

In [50]:
#record which keywords matched each row
def matched_keywords(text: str):
    t = str(text).lower()
    matches = [k for k in keywords if k in t]
    if kk_pattern.search(text):
        matches.append("kk*")
    return sorted(set(matches))

master_df["matched_keywords"] = master_df["text"].apply(matched_keywords)

In [51]:
#final relevance logic
def is_relevant_refined(text: str) -> bool:
    t = str(text).lower()

    has_inclusion = any(k in t for k in keywords) or (kk_pattern.search(text) is not None)
    has_anchor = any(a in t for a in anchor_keywords)
    has_exclusion = any(e in t for e in exclude_keywords)

    # If it has strong sustainability anchor â†’ keep
    if has_anchor:
        return True

    # Otherwise apply normal rule
    return bool(has_inclusion and not has_exclusion)

In [52]:
master_df["is_relevant_refined"] = master_df["text"].apply(is_relevant_refined)
print(master_df["is_relevant_refined"].value_counts())

is_relevant_refined
True     249
False    108
Name: count, dtype: int64


In [53]:
#check
print("\nSample TRUE (kept):")
display(master_df[master_df["is_relevant_refined"] == True][["source","text","matched_keywords"]].sample(10, random_state=42))


Sample TRUE (kept):


Unnamed: 0,source,text,matched_keywords
237,Survey,Environment day,[environment]
9,Instagram,"Heads up, mungkin UM boleh buat sistem maya ni...","[it, maya]"
166,Instagram,"Kalau kita fikir secara rasional, tak tercabar...",[it]
105,Instagram,#UMO019683 Bila masih ada segelintir students ...,"[it, lecturer, online]"
193,Instagram,"Dengan identiti yg dirahsiakan atau pon ""anony...",[it]
334,Survey,UM is active in campus sustainability initiati...,"[campus, initiative, it, sustainability]"
358,Survey,Lots of awareness campaigns,"[awareness, campaign]"
338,Survey,They are improving the policies such as bannin...,[plastic]
15,Instagram,hi mein boleh suggest kerja part time area um-...,[rider]
183,Instagram,"frenster memang nak hypekan tahun ni punya, bu...","[it, program]"


In [54]:
#check
print("\nSample FALSE (removed):")
display(master_df[master_df["is_relevant_refined"] == False][["source","text"]].sample(10, random_state=42))


Sample FALSE (removed):


Unnamed: 0,source,text
196,Instagram,pernah â€˜kawanâ€™ sekali dgn diorang ni. tapi act...
22,Instagram,Kepada abang yang jadi hantu kat seruan. Saya ...
11,Instagram,re â€” os * a ih
210,Instagram,Stoplah backup kroni korang tu bila nama naik ...
157,Instagram,"UM, uni yang patutnya gah dan membentuk pelaja..."
170,Instagram,Get the chance to talk with this humble guy. E...
85,Instagram,"Shame on Anak-Anak Sarawak, benda free baru be..."
115,Instagram,Result exam kluar bila ya? Sbb daftar subjek p...
292,Survey,Not promoted enough and does not sound interes...
30,Instagram,"masuk sem baru ni, moga dijauhkan daripada fre..."
