# Sentiment Analysis Kaoutar lakdim

In [None]:
!pip install -q vaderSentiment


In [None]:

import pandas as pd
import re, string, nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
# --- VADER: choose ONE of the next two imports --------------------
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer      # (offline-friendly)
# from nltk.sentiment import SentimentIntensityAnalyzer                  # <- use this if you prefer NLTK's copy
# ------------------------------------------------------------------

nltk.download("stopwords", quiet=True)   # only ~30 KB; no "punkt" needed

# ------------------------------------------------------------------
# 2) Load your tweets CSV (change the filename/column names as needed)
# ------------------------------------------------------------------
SAMPLE_SIZE = 500                        # matches “take a sample of 500 rows”

df = pd.read_csv("tweets-data.csv")      # or the file you exported earlier
# Cap the request at the number of rows actually available: DataFrame.sample
# raises ValueError when n is larger than the frame (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# If your tweets column is named something else (e.g. "text"), edit here:
TWEET_COL = "Tweets"

# ------------------------------------------------------------------
# 3) Cleaning helper
# ------------------------------------------------------------------
URL_RX  = re.compile(r'https?://\S+|www\.\S+')
MENT_RX = re.compile(r'@\w+')
HASH_RX = re.compile(r'#(\w+)')          # keeps the word, drops the #

tokenizer = TweetTokenizer()

# NLTK's English stop-word list contains these negation words. Dropping them
# would turn "not good" into "good" before the text reaches the sentiment
# scorer, so they are deliberately kept in the cleaned text.
NEGATIONS = {"no", "nor", "not"}
STOP = (set(stopwords.words("english")) | {"rt"}) - NEGATIONS  # add extra stop-words if you like

def clean_tweet(text: str) -> str:
    """Reduce a raw tweet to a single lowercase, space-separated sentence.

    URLs and @mentions are removed, hashtags keep their word but lose the
    '#', ASCII punctuation is stripped, and the remainder is tokenised.
    Only purely alphabetic tokens that are not in ``STOP`` survive.

    Args:
        text: The raw tweet text.

    Returns:
        The surviving tokens, lower-cased and joined by single spaces.
    """
    # Order matters: URLs go first so their punctuation never reaches later steps.
    without_noise = HASH_RX.sub(r"\1", MENT_RX.sub("", URL_RX.sub("", text)))
    without_punct = without_noise.translate(str.maketrans("", "", string.punctuation))

    kept = []
    for token in tokenizer.tokenize(without_punct):
        lowered = token.lower()
        if token.isalpha() and lowered not in STOP:
            kept.append(lowered)
    return " ".join(kept)                # <- single “sentence” required by the brief

# ------------------------------------------------------------------
# 4) VADER wrapper → returns (label, compound score)
# ------------------------------------------------------------------
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(sentence: str, neutral_threshold: float = 0.05):
    """Score a sentence with VADER and bucket it into a sentiment label.

    Args:
        sentence: Text to score.
        neutral_threshold: Half-width of the neutral band around zero. A
            compound score at or above it is "Positive", at or below its
            negative is "Negative", anything in between is "Neutral".

    Returns:
        A two-element ``pd.Series``: the label, then the compound score.
    """
    compound = analyzer.polarity_scores(sentence)["compound"]

    # Start from the neutral default and override only outside the band.
    label = "Neutral"
    if compound >= neutral_threshold:
        label = "Positive"
    elif compound <= -neutral_threshold:
        label = "Negative"
    return pd.Series([label, compound])

# ------------------------------------------------------------------
# 5) Apply the pipeline and create the two new columns
# ------------------------------------------------------------------
# Missing tweets are NaN; astype(str) alone would turn them into the literal
# word "nan", which would then be cleaned and scored as if it were tweet text.
# Blank them out first so they end up as empty strings instead.
df["cleaned"] = df[TWEET_COL].fillna("").astype(str).apply(clean_tweet)                     # step ②
df[["sentiment_label", "sentiment_score"]] = df["cleaned"].apply(vader_sentiment)  # step ③

# ------------------------------------------------------------------
# 6) Quick sanity-check
# ------------------------------------------------------------------
print(df[[TWEET_COL, "cleaned", "sentiment_label", "sentiment_score"]].head())


                                                 Tweets  \
2899  Le #DessinDePresse de Sanaga : ls sont morts c...   
594   #Russia #Wagner #RussiaCivilWar https://t.co/P...   
2870  Exclusive content -https://t.co/oEiSIIB2Z1\n.\...   
52    Auch heute geht die politische Nachricht des T...   
1391  @crazyclipsonly Same type that would take a ho...   

                                                cleaned sentiment_label  \
2899  le dessindepresse de sanaga ls sont morts comm...        Positive   
594                        russia wagner russiacivilwar         Neutral   
2870  exclusive content cosplay japan titan titanics...        Negative   
52    auch heute geht die politische nachricht des t...        Negative   
1391  type would take homemade playstationcontrolled...         Neutral   

      sentiment_score  
2899           0.4767  
594            0.0000  
2870          -0.4404  
52            -0.5994  
1391           0.0000  


In [None]:
## Transformers

In [None]:
!pip install transformers torch pandas nltk




In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from transformers import pipeline
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean_tweet(text, stopwords_to_drop=None):
    """Normalise a raw tweet into a lowercase, space-separated string.

    URLs, @mentions and '#' signs are removed, every run of whitespace
    (newlines, tabs, ...) is collapsed to a single space, all remaining
    characters other than ASCII letters, digits and spaces are dropped,
    and stop-words are filtered out.

    Args:
        text: The raw tweet text.
        stopwords_to_drop: Collection of lowercase words to remove. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned tweet as a single string.
    """
    if stopwords_to_drop is None:
        stopwords_to_drop = stop_words
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    # Convert newlines/tabs to spaces BEFORE the character filter below;
    # otherwise the filter deletes them and glues neighbouring words
    # together ("good\nmorning" -> "goodmorning").
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    tokens = text.lower().split()
    return " ".join(word for word in tokens if word not in stopwords_to_drop)

# Blank out missing tweets first: astype(str) alone would turn NaN into the
# literal word "nan", which would then be classified as if it were tweet text.
df['cleaned_tweet'] = df['Tweets'].fillna("").astype(str).apply(clean_tweet)

In [None]:
# Build the sentiment-analysis pipeline from the named DistilBERT checkpoint.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Score every cleaned tweet in one call, with truncation enabled.
cleaned_texts = df['cleaned_tweet'].tolist()
results = sentiment_pipeline(cleaned_texts, truncation=True)

In [None]:
# Unpack each prediction dict once, then store the lower-cased labels and
# the scores as two columns.
label_score_pairs = [(res['label'].lower(), res['score']) for res in results]
df['sentiment_label'] = [label for label, _ in label_score_pairs]
df['sentiment_score'] = [score for _, score in label_score_pairs]

In [None]:
# Show the first rows: raw tweet, cleaned text, and the label/score columns.
print(df[['Tweets', 'cleaned_tweet', 'sentiment_label', 'sentiment_score']].head())

                                                 Tweets  \
2899  Le #DessinDePresse de Sanaga : ls sont morts c...   
594   #Russia #Wagner #RussiaCivilWar https://t.co/P...   
2870  Exclusive content -https://t.co/oEiSIIB2Z1\n.\...   
52    Auch heute geht die politische Nachricht des T...   
1391  @crazyclipsonly Same type that would take a ho...   

                                          cleaned_tweet sentiment_label  \
2899  le dessindepresse de sanaga ls sont morts comm...        negative   
594                        russia wagner russiacivilwar        negative   
2870  exclusive content cosplay japan titan titanics...        negative   
52    auch heute geht die politische nachricht des t...        negative   
1391  type would take homemade playstationcontrolled...        negative   

      sentiment_score  
2899         0.981537  
594          0.962062  
2870         0.961531  
52           0.975570  
1391         0.993684  


In [None]:
# Rebind the pipeline to the Cardiff NLP Twitter RoBERTa sentiment checkpoint.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
)

Device set to use cpu


In [None]:
# `results` still holds the predictions produced before `sentiment_pipeline`
# was rebound to a different model, so score the tweets again with the
# pipeline that is currently loaded.
results = sentiment_pipeline(df['cleaned_tweet'].tolist(), truncation=True)

# cardiffnlp/twitter-roberta-base-sentiment reports LABEL_0 / LABEL_1 / LABEL_2
# (negative / neutral / positive per its model card). The upper-case word
# labels are kept so the mapping also works for models that emit those.
label_map = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
    "NEGATIVE": "negative",
    "NEUTRAL": "neutral",
    "POSITIVE": "positive"
}
# Fall back to the lower-cased raw label instead of raising KeyError if a
# model emits a label name that is not listed above.
df['sentiment_label'] = [label_map.get(res['label'], res['label'].lower()) for res in results]
df['sentiment_score'] = [res['score'] for res in results]

In [None]:
# Peek at the first three raw prediction dicts to check the label format.
print(results[:3])

[{'label': 'NEGATIVE', 'score': 0.9815365672111511}, {'label': 'NEGATIVE', 'score': 0.9620620608329773}, {'label': 'NEGATIVE', 'score': 0.9615312218666077}]
