In [5]:
import csv
import html
import json
import os
import re
import time
from datetime import datetime
from urllib.parse import parse_qs

import nltk
import numpy as np
import pandas as pd
import requests
from flickrapi import FlickrAPI
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from requests_oauthlib import OAuth1

In [6]:
# Export the stock VADER lexicon so its words and valence scores can be inspected.

# The analyzer loads the 'vader_lexicon' resource when it is constructed, so make
# sure the resource is present first; without this the cell fails on a fresh
# environment (restart kernel -> run all).
nltk.download('vader_lexicon', quiet=True)

# Initialize VADER
sid = SentimentIntensityAnalyzer()

# Get the lexicon (mapping of word -> valence score)
vader_lexicon = sid.lexicon

# Convert to DataFrame for easier viewing: one row per word, single 'score' column
lexicon_df = pd.DataFrame.from_dict(vader_lexicon, orient='index', columns=['score'])

# Sort by score so the most positive words come first and the most negative last
lexicon_df_sorted = lexicon_df.sort_values('score', ascending=False)

# Show the words with their scores and keep a copy on disk for reference
print(lexicon_df_sorted)
lexicon_df_sorted.to_csv('vader_lexicon.csv')

               score
ilu              3.4
ily              3.4
aml              3.4
magnificently    3.4
sweetheart       3.3
...              ...
rape            -3.7
kill            -3.7
raping          -3.8
slavery         -3.8
rapist          -3.9

[7502 rows x 1 columns]


In [2]:
# Flickr credentials are read from the environment instead of being committed
# with the notebook. Export FLICKR_API_KEY and FLICKR_API_SECRET before starting
# the kernel. NOTE(review): the key/secret that used to be hard-coded here have
# been published with the notebook and should be revoked and re-issued.
API_KEY = os.environ.get('FLICKR_API_KEY')
API_SECRET = os.environ.get('FLICKR_API_SECRET')
if not API_KEY or not API_SECRET:
    raise RuntimeError(
        "Flickr credentials missing: set the FLICKR_API_KEY and "
        "FLICKR_API_SECRET environment variables."
    )

# REST endpoint used for the raw comment requests
API_END = 'https://api.flickr.com/services/rest/'

# flickrapi client used for the photo search; responses come back as parsed JSON
flickr = FlickrAPI(API_KEY, API_SECRET, format='parsed-json')

# Fetch the VADER lexicon (no-op when already installed) and build the analyzer
# that the sentiment helpers in this notebook use.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joeyared/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Domain-specific vocabulary merged into the analyzer's lexicon.
# dict.update() replaces the score of any word that is already present.
# NOTE(review): the built-in lexicon printed earlier in this notebook spans
# roughly -3.9 .. 3.4, so the magnitudes below are comparatively small;
# confirm this is the intended scale.

# Terms that should pull a comment towards negative sentiment
negative_terms = {
    'casualty': -0.6, 'casualties': -0.6,
    'death': -0.8, 'deaths': -0.8,
    'killed': -0.8, 'killing': -0.8,
    'injured': -0.6,
    'wound': -0.6, 'wounded': -0.6,
    'displaced': -0.5, 'displacement': -0.5,
    'refugee': -0.4, 'refugees': -0.4,
    'destruction': -0.7, 'destroyed': -0.7,
    'damage': -0.5, 'damaged': -0.5,
    'crisis': -0.6,
    'conflict': -0.4,
    'violence': -0.7, 'violent': -0.7,
    'attack': -0.6, 'attacks': -0.6,
    'siege': -0.6,
    'blockade': -0.5,
    'suffering': -0.7,
    'hostage': -0.8, 'hostages': -0.8,
}

# Terms that should pull a comment towards positive sentiment
positive_terms = {
    'peace': 0.8, 'peaceful': 0.7,
    'ceasefire': 0.6, 'truce': 0.6,
    'negotiation': 0.5, 'negotiations': 0.5,
    'diplomatic': 0.5, 'diplomacy': 0.5,
    'agreement': 0.6, 'resolution': 0.6, 'dialogue': 0.6,
    'humanitarian': 0.5,
    'aid': 0.6, 'assistance': 0.5, 'support': 0.4, 'relief': 0.5,
    'reconciliation': 0.7,
    'stability': 0.6, 'stable': 0.5,
    'protect': 0.5, 'protection': 0.5,
    'safety': 0.6, 'safe': 0.6,
    'rebuild': 0.5, 'rebuilding': 0.5,
}

# Contextually important terms registered with a score of exactly zero
neutral_terms = dict.fromkeys(
    [
        'civilian', 'civilians',
        'hospital',
        'infrastructure',
        'border', 'borders',
        'territory',
        'security',
        'government',
        'military',
    ],
    0.0,
)

# Same insertion order as before: negative, then positive, then neutral
new_words = {**negative_terms, **positive_terms, **neutral_terms}
sid.lexicon.update(new_words)

In [3]:
def search_for_photos(keywords, start_date, end_date, num_images=400):
    """Search Flickr by tag and return the ids of the matching photos.

    Args:
        keywords: value passed as the ``tags`` argument of the search
            (sent together with ``tag_mode='all'``).
        start_date: lower bound of the upload date, formatted ``YYYY-MM-DD``.
        end_date: upper bound of the upload date, formatted ``YYYY-MM-DD``.
        num_images: value passed as ``per_page`` (results requested per page).

    Returns:
        list: the ``id`` field of every photo in the returned page, in the
        order given by the API (requested sort: newest upload first).

    Raises:
        ValueError: if either date does not match ``YYYY-MM-DD``.
    """
    def _to_epoch(day):
        # A naive datetime's timestamp() is interpreted in the machine's local time zone.
        return int(datetime.strptime(day, '%Y-%m-%d').timestamp())

    query = {
        'tags': keywords,
        'tag_mode': 'all',
        'min_upload_date': _to_epoch(start_date),
        'max_upload_date': _to_epoch(end_date),
        'per_page': num_images,
        'sort': 'date-posted-desc',
        'extras': 'date_upload',
    }
    result = flickr.photos.search(**query)

    photo_ids = []
    for entry in result['photos']['photo']:
        photo_ids.append(entry['id'])
    return photo_ids

In [4]:
def analyze_sentiment(text):
    """Label ``text`` as Positive, Negative or Neutral using the shared analyzer.

    The label is derived from the ``compound`` value of the analyzer's
    polarity scores: 0.05 or above is Positive, -0.05 or below is Negative,
    anything in between is Neutral.

    Args:
        text: the text to score.

    Returns:
        tuple: ``(label, scores)`` where ``scores`` is the full dictionary
        returned by ``sid.polarity_scores``.
    """
    polarity = sid.polarity_scores(text)
    compound = polarity['compound']

    if compound >= 0.05:
        return 'Positive', polarity
    if compound <= -0.05:
        return 'Negative', polarity
    return 'Neutral', polarity

In [5]:
# Seconds to wait on a single Flickr request before giving up on that photo.
_REQUEST_TIMEOUT_SECONDS = 10

# Cleanup passes applied, in order, to every unescaped comment body.
_COMMENT_CLEANUP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # Tags go first: if URLs were removed first, the URL inside href="..." would
    # swallow the tag's closing '>' and leave fragments such as 'a href="' behind.
    r'<[^>]+>',
    r'https?://\S+',                      # http(s) URLs
    r'www\.\S+',                          # www URLs
    r'[\w\-]+(\.[\w\-]+)+\.\w+\S*',       # other URL formats
    r'\[.*?\]$',                          # [...] at end
    r'\(.*?\)$',                          # (...) at end
    r'<[^>]+>',                           # tags exposed by the passes above
    r'</?\w+[^>]*>',                      # remaining tags
    r'&[a-z]+;',                          # HTML entities
    r'[<>]',                              # stray brackets
    r'alt=[\'"]\S+[\'"]',                 # alt attributes
    r'/a>',                               # closing a tags
    r'[^\x00-\x7F]+',                     # non-ASCII
))


def _clean_comment_text(raw_text):
    """Return a comment body with markup, URLs and non-ASCII characters removed."""
    text = html.unescape(raw_text)
    for pattern in _COMMENT_CLEANUP_PATTERNS:
        text = pattern.sub('', text)
    # Collapse runs of whitespace; this also trims both ends.
    return ' '.join(text.split())


def fetch_and_display_comments(image_ids):
    """Download, clean and score the comments of the given Flickr photos.

    For each photo id the comment list is requested from the REST endpoint.
    Every comment is cleaned, and comments that end up with three characters
    or fewer, or that do not start with a letter, are skipped. The rest are
    scored with ``analyze_sentiment``.

    A photo whose request fails (network/HTTP error, timeout, undecodable
    body) or for which the API reports ``stat == 'fail'`` is reported on
    stdout and skipped; the remaining photos are still processed.

    Args:
        image_ids: iterable of Flickr photo ids.

    Returns:
        pandas.DataFrame: one row per kept comment with the columns
        ``photo_id``, ``author``, ``date`` (``YYYY-MM-DD`` string, missing when
        the creation time cannot be parsed), ``comment_text``, ``sentiment``
        and ``sentiment_score`` (the compound score). The frame is empty, with
        the same columns, when nothing was collected.
    """
    comment_data = []
    for photo_id in image_ids:
        params = {
            'method': 'flickr.photos.comments.getList',
            'photo_id': photo_id,
            'api_key': API_KEY,
            'format': 'json',
            'nojsoncallback': 1
        }
        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.get(API_END, params=params, timeout=_REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Error fetching comments for photo ID {photo_id}: {e}")
            continue

        # An application-level failure would otherwise look like "no comments".
        if data.get('stat') == 'fail':
            print(f"Flickr API error for photo ID {photo_id}: {data.get('message', 'unknown error')}")
            continue

        for comment in data.get('comments', {}).get('comment', []):
            comment_text = _clean_comment_text(comment.get('_content') or '')

            # Keep only comments longer than three characters that start with a letter.
            if len(comment_text) <= 3 or not comment_text[0].isalpha():
                continue

            sentiment, score = analyze_sentiment(comment_text)
            comment_data.append({
                'photo_id': photo_id,
                'author': comment.get('authorname', ''),
                'date': comment.get('datecreate', ''),
                'comment_text': comment_text,
                'sentiment': sentiment,
                'sentiment_score': score['compound']
            })

    if not comment_data:
        print("No comments found for any of the photos")

    comment_df = pd.DataFrame(
        comment_data,
        columns=['photo_id', 'author', 'date', 'comment_text', 'sentiment', 'sentiment_score'],
    )
    # Convert the creation time to numbers explicitly before interpreting it as
    # epoch seconds, so string values are not handed to to_datetime(unit='s').
    epoch_seconds = pd.to_numeric(comment_df['date'], errors='coerce')
    comment_df['date'] = pd.to_datetime(epoch_seconds, unit='s', errors='coerce').dt.strftime('%Y-%m-%d')
    return comment_df

In [6]:
def main(keywords=None, start_date='2023-01-01', end_date='2024-11-01',
         output_path="flickr_comments_combined.csv"):
    """Collect and score Flickr comments for a list of search terms.

    Each term is searched with ``search_for_photos``; the comments of the
    photos found are fetched and scored with ``fetch_and_display_comments``.
    A photo returned by more than one search term is fetched only once, so its
    comments are not duplicated in the combined output. All results are
    concatenated and written to ``output_path`` as CSV.

    Args:
        keywords: search terms to run; defaults to the project's term list.
        start_date: earliest upload date, ``YYYY-MM-DD``.
        end_date: latest upload date, ``YYYY-MM-DD``.
        output_path: destination of the combined CSV file.

    Returns:
        None. Progress is printed and the CSV file is written as a side effect.
    """
    if keywords is None:
        # NOTE(review): 'isreal' looks like a misspelling of 'israel'. It is kept
        # unchanged in case it deliberately targets misspelled tags -- confirm.
        keywords = ['gaza', 'palestine', 'palestinian refugees', 'israel palestine conflict', 'isreal', 'hamas', 'gaza protests']

    all_data = []
    seen_photo_ids = set()  # photos already fetched for an earlier search term
    for search_word in keywords:
        print(f"Searching for: {search_word}")
        image_ids = search_for_photos(search_word, start_date, end_date)

        if not image_ids:
            print(f"No images found for search term: {search_word}")
            continue

        new_ids = [photo_id for photo_id in image_ids if photo_id not in seen_photo_ids]
        seen_photo_ids.update(new_ids)
        if not new_ids:
            print(f"All photos for search term '{search_word}' were already processed")
            continue

        all_data.append(fetch_and_display_comments(new_ids))

    if all_data:
        # Leave empty frames out of the concat; if every frame is empty, fall back
        # to the full list so a header-only CSV is still produced.
        non_empty = [frame for frame in all_data if not frame.empty]
        combined_df = pd.concat(non_empty or all_data, ignore_index=True)
        combined_df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")
    else:
        print("No data collected for any search terms.")

In [7]:
# Entry point: run the whole collection pipeline when this cell (or the exported
# script) is executed directly; main() prints progress and writes the combined CSV.
if __name__ == "__main__":
    main()

Searching for: gaza
Searching for: palestine
Searching for: palestinian refugees
No comments found for any of the photos
Searching for: israel palestine conflict
No comments found for any of the photos
Searching for: isreal
Searching for: hamas
Searching for: gaza protests
No comments found for any of the photos
Results saved to flickr_comments_combined.csv
