# Midterm - Israel-Palestine Conflict

## Setup

In [138]:
# Import packages and load model
import spacy
from collections import Counter
from spacy.matcher import Matcher
from spacy.language import Language
from spacy.tokens import Doc, Token
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

# SentimentIntensityAnalyzer needs the VADER lexicon. Fetch it only when it is
# missing, so a fresh environment can run the notebook top to bottom without
# triggering a download on every run.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Medium English spaCy pipeline used to process every headline and description
nlp = spacy.load("en_core_web_md")

# Load Dataset

In [58]:
# Read the news articles into a DataFrame. Later cells read its 'headline',
# 'description' and 'date' columns.
news = pd.read_csv("data/news_data.csv")

# View Shape
# Prints (rows, columns) as a quick sanity check on the load
print(news.shape)

# Helper: keep only the content-bearing tokens of a Doc
def get_clean_tokens(doc):
    """Return the tokens of ``doc`` that are neither stop words nor punctuation.

    Args:
        doc: An iterable of token-like objects exposing ``is_stop`` and
            ``is_punct`` attributes.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    kept = []
    for tok in doc:
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok)
    return kept

# Strip U+00AD soft hyphens
def remove_hyphen(text):
    """Return ``text`` with every soft-hyphen character (U+00AD) deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without soft hyphens; all other characters,
        including ordinary hyphens, are left untouched.
    """
    soft_hyphen = "\xad"
    cleaned = text.replace(soft_hyphen, "")
    return cleaned


(3338, 3)


In [133]:
# Add Type, Date, and Sentiment Extensions
# Doc.set_extension raises ValueError when the name is already registered, so
# guard each registration to keep this cell safe to re-run in a live kernel.
for extension_name in ("type", "date", "sentiment"):
    if not Doc.has_extension(extension_name):
        Doc.set_extension(extension_name, default=None)

In [96]:
# Build [text, date] rows for headlines and for descriptions, as two separate lists
headline_texts = news[["headline", "date"]].to_numpy().tolist()
descriptions = news[["description", "date"]].to_numpy().tolist()

In [100]:
# Strip U+00AD soft hyphens from the text (first) element of every row, in place
for rows in (headline_texts, descriptions):
    for row in rows:
        row[0] = remove_hyphen(row[0])

In [134]:
# Run every headline, then every description, through the spaCy pipeline and
# tag each resulting Doc with its type, date, and sentiment scores
sentiment_analyzer = SentimentIntensityAnalyzer()
docs = []
for doc_type, rows in (("headline", headline_texts), ("description", descriptions)):
    # as_tuples=True makes nlp.pipe yield (doc, context); the context here is the date
    for doc, date in nlp.pipe(rows, as_tuples=True):
        doc._.type = doc_type
        doc._.date = date
        doc._.sentiment = sentiment_analyzer.polarity_scores(doc.text)
        docs.append(doc)

# Get Word Counts

In [135]:
# Tally the surface text of every token that survives stop-word and
# punctuation filtering, across headlines and descriptions alike
word_counts = Counter()

for doc in docs:
    for token in get_clean_tokens(doc):
        word_counts[token.text] += 1

word_counts.most_common(20)

[('Israel', 2475),
 ('Gaza', 2208),
 ('Israeli', 1759),
 ('Palestinian', 1043),
 ('Palestinians', 811),
 ('war', 794),
 ('Hamas', 627),
 ('killed', 519),
 ('says', 505),
 ('West', 427),
 ('Bank', 407),
 ('forces', 354),
 ('occupied', 330),
 ('Al', 318),
 ('attack', 290),
 ('Palestine', 288),
 ('UN', 282),
 ('attacks', 260),
 ('day', 229),
 ('amid', 221)]

# #1.1 - Looking for Bias: Word Usage

In [115]:
# Tabulate the word counts, one row per distinct word
bias_words = pd.DataFrame(word_counts.items(), columns=["word", "count"])

# Words whose frequencies are compared against each other
filter_list = [
    'UN', 'Israel', 'Israeli', 'Palestine', 'Palestinian', 'Palestinians',
    'Jewish', 'Hamas', 'Gaza', 'Netanyahu', 'Jerusalem', 'West', 'Bank', 'IDF',
]

# Keep only the listed words, most frequent first
is_tracked = bias_words["word"].isin(filter_list)
bias_filtered = bias_words.loc[is_tracked].sort_values(by="count", ascending=False)
bias_filtered

Unnamed: 0,word,count
4,Israel,2475
0,Gaza,2208
25,Israeli,1759
97,Palestinian,1043
10,Palestinians,811
92,Hamas,627
48,West,427
348,Bank,407
135,Palestine,288
39,UN,282


# #1.2 - Looking for Bias: Adjectives 

In [119]:
# Build a matcher for two-token adjective pairings
adj_matcher = Matcher(nlp.vocab)

# Each pattern pairs an adjective with the proper noun or common noun directly
# beside it, in either order (matching is on POS tags)
pattern = []
for noun_pos in ("PROPN", "NOUN"):
    pattern.append([{"POS": "ADJ"}, {"POS": noun_pos}])
    pattern.append([{"POS": noun_pos}, {"POS": "ADJ"}])

adj_matcher.add("ADJ_PATTERN", pattern)

# Count the text of every matched two-token span across all docs
adj_matches_found = Counter()

for doc in docs:
    # An empty match list simply skips the inner loop
    for _match_id, start, end in adj_matcher(doc):
        adj_matches_found[doc[start:end].text] += 1

adj_matches_found.most_common(20)


[('Israeli forces', 320),
 ('Israeli army', 102),
 ('Israeli air', 94),
 ('Israeli attacks', 69),
 ('key events', 68),
 ('Israeli police', 54),
 ('Israeli attack', 53),
 ('occupied West', 43),
 ('Israeli raid', 40),
 ('Palestinian prisoners', 39),
 ('main developments', 39),
 ('southern Gaza', 36),
 ('Palestinian man', 36),
 ('Israeli raids', 34),
 ('Israeli troops', 33),
 ('Israeli soldiers', 33),
 ('Israeli bombardment', 33),
 ('Israeli settlers', 33),
 ('Palestinian children', 32),
 ('northern Gaza', 30)]

# #2 - Analyzing Average Sentiment

In [139]:
# Collect the compound sentiment score of every doc, grouped by doc type
compound_by_type = {"headline": [], "description": []}
for doc in docs:
    scores = compound_by_type.get(doc._.type)
    if scores is not None:
        scores.append(doc._.sentiment["compound"])

headline_compound_sentiment = compound_by_type["headline"]
description_compound_sentiment = compound_by_type["description"]

# Arithmetic mean of the compound scores for each type
avg_headline_sentiment = sum(headline_compound_sentiment) / len(headline_compound_sentiment)
avg_description_sentiment = sum(description_compound_sentiment) / len(description_compound_sentiment)

print(f"Average Headline Sentiment: {avg_headline_sentiment}")
print(f"Average Description Sentiment: {avg_description_sentiment}")


Average Headline Sentiment: -0.2993361593768712
Average Description Sentiment: -0.3042775913720772


In [142]:
# Plot sentiment over time
# NOTE(review): this cell is disabled. As written it would draw a single line
# through every doc's compound score in `docs` order (headlines first, then
# descriptions) without sorting or aggregating by date. Confirm the intended
# plot before re-enabling.
# dates = [doc._.date for doc in docs]
# sentiments = [doc._.sentiment["compound"] for doc in docs]

# sentiment_df = pd.DataFrame({"date": dates, "sentiment": sentiments})

# sentiment_df["date"] = pd.to_datetime(sentiment_df["date"])
# sentiment_df = sentiment_df.set_index("date")

# fig, ax = plt.subplots()
# ax.plot(sentiment_df.index, sentiment_df["sentiment"])