# Instagram Sentiment Analysis POC - Transformer pretrained

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import plotly.express as px
import torch
import time

In [None]:
#Tokenizer setup
nltk.download('stopwords')
nltk.download('punkt')
DetectorFactory.seed = 0

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shyam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shyam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Load

In [None]:
#Dataset taken from https://github.com/luminati-io/Social-media-dataset-samples
df = pd.read_csv(r"C:\Users\shyam\Downloads\Project_NLP_Insta\Instagram-datasets.csv")
df = df[['comment', 'comment_date', 'likes_number', 'post_url']].copy()
df.dropna(subset=['comment'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,comment,comment_date,likes_number,post_url
0,👏👏👏,2024-11-13T20:01:57.000Z,1,https://www.instagram.com/p/DCUcCAHvEli
1,😍😍😍,2024-11-13T17:11:39.000Z,1,https://www.instagram.com/p/DCUcCAHvEli
2,Meu querido @euwanderson7 que foi nosso dele...,2024-11-13T23:00:46.000Z,1,https://www.instagram.com/p/DCT93_2p7bG
3,"Com o professor Matheus Carvalho, realmente s...",2024-11-14T00:16:19.000Z,3,https://www.instagram.com/p/DCUhYQ4Pn2y
4,@rafael.fonteles todo ano é isso. O único muni...,2024-11-14T17:33:11.000Z,1,https://www.instagram.com/p/DCRWettOTUB


## Preprocessing

In [None]:
#Cleaning Data Functions
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|@\w+|#\w+|[^\x00-\x7F]+", " ", text)  # Removing url using regex on HTTP, @ regex mentions, and emoji code regex patterns
    text = re.sub(r"[^a-z\s]", "", text)                          # Letter Only Regex
    text = re.sub(r"\s+", " ", text).strip()                     # Space Trim Logic
    return text

#English Language Detection
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"
    
#Tokenize our comments and also flag the records that are english
df['clean_comment'] = df['comment'].apply(clean_text)
df['language'] = df['clean_comment'].apply(detect_language)
df = df[df['language'] == 'en'].copy()
df['token_count'] = df['clean_comment'].apply(lambda x: len(word_tokenize(x)))
print("English-only comments retained:", len(df))
df.head()

English-only comments retained: 318


Unnamed: 0,comment,comment_date,likes_number,post_url,clean_comment,language,token_count
33,I have never peed in a race car too,2024-11-04T14:01:53.000Z,4,https://www.instagram.com/reel/DB4HUISvG3V,i have never peed in a race car too,en,9
46,A The Best de todo o litoral 👏🏻📸☀️🌴🌊🌅✨ @raiane...,2024-10-27T18:15:39.000Z,2,https://www.instagram.com/p/DBovSOkvRTd,a the best de todo o litoral,en,7
48,Do you sign up at the show if you want to be a...,2024-11-01T18:29:38.000Z,1,https://www.instagram.com/p/DBY88T2p-vx,do you sign up at the show if you want to be a...,en,14
49,You guys forgot the apron in the women section...,2024-11-16T14:06:28.000Z,3,https://www.instagram.com/p/DCW8m-GtW4B,you guys forgot the apron in the women section...,en,15
70,How do we sign up if we're interested in perfo...,2024-10-27T08:50:48.000Z,2,https://www.instagram.com/p/DBY88T2p-vx,how do we sign up if were interested in perfor...,en,10


In [None]:
#Used the Hugging Face DistilBERT Transformer to classify my text
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

Device set to use cpu


In [None]:
#Batching my data and Adding Confidence and Sentiments
sentiments = []
confidences = []
batch_size = 32
total = len(df)

print("Running sentiment predictions...")

for i in range(0, total, batch_size):
    batch = df['clean_comment'].iloc[i:i + batch_size].tolist()
    preds = sentiment_model(batch)

    sentiments.extend([1 if p['label'] == 'POSITIVE' else 0 for p in preds])
    confidences.extend([p['score'] for p in preds])

    if i % (total // 10) == 0:
        print(f"Processed {i} / {total} rows...")

print("Sentiment prediction complete.")

df['sentiment'] = sentiments
df['confidence'] = confidences

Running sentiment predictions...
Processed 0 / 318 rows...
Sentiment prediction complete.


In [7]:
df.to_csv("instagram_labeled_output.csv", index=False)

## Sample Model Presentations

In [None]:
fig_sent = px.histogram(df, x='sentiment', color='sentiment',
color_discrete_map={0: 'red', 1: 'green'},
title='Sentiment Distribution (0=Negative, 1=Positive)',
labels={'sentiment': 'Sentiment'})
fig_sent.show()

fig_conf = px.histogram(df, x='confidence', nbins=30,
title='Model Confidence Distribution',
labels={'confidence': 'Prediction Confidence'})
fig_conf.show()