In [1]:
from gnews import GNews
import pandas as pd
import numpy as np
from newspaper import Article
import re
import time

## Using GNews to collect news article headlines

https://github.com/ranahaani/GNews   

In [78]:
start = time.time()
# generate dates in 1 week intervals from March 01, 2024 to May 23rd, 2024
dates = pd.date_range(start='3/1/2024', end='5/23/2024', freq='1W')
# turn into tuples
dates = [(date.year, date.month, date.day) for date in dates]


# for each date interval, search and collect data
for i in range(len(dates) - 1): # stop before the last date
    
    # create gnews object 
    news = GNews(language='en', 
                    country='US', 
                    start_date= dates[i], 
                    end_date=dates[i + 1], 
                    max_results=100, 
                   )
    # get news 
    results = news.get_news('student protest')
    
    # if this is the first iteration
    if i == 0: 
        df = pd.DataFrame(results)
    
    
    # else add to existing df 
    else: 
        new_df = pd.DataFrame(results)
        df = pd.concat([df, new_df])
    
    # increment i 
    i += 1

end = time.time()
print('time elapsed: ', end - start, ' seconds')
df.head()

time elapsed:  146.72764205932617  seconds


Unnamed: 0,title,description,published date,url,publisher
0,How a student protest plan helped get a playgr...,How a student protest plan helped get a playgr...,"Mon, 04 Mar 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiaGh0d...,"{'href': 'https://ny1.com', 'title': 'Spectrum..."
1,Pro-Palestine student organizers face crackdow...,Pro-Palestine student organizers face crackdow...,"Tue, 05 Mar 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiW2h0d...,"{'href': 'https://prismreports.org', 'title': ..."
2,"Students, staff and faculty protest DEI decisi...","Students, staff and faculty protest DEI decisi...","Fri, 08 Mar 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMifmh0d...,"{'href': 'https://www.alligator.org', 'title':..."
3,Students protest DEI firings at the University...,Students protest DEI firings at the University...,"Fri, 08 Mar 2024 08:00:00 GMT",https://news.google.com/rss/articles/CBMiXGh0d...,"{'href': 'https://abcnews.go.com', 'title': 'A..."
4,"150 students take part in pro-Palestine, pro-I...","150 students take part in pro-Palestine, pro-I...","Fri, 08 Mar 2024 23:20:24 GMT",https://news.google.com/rss/articles/CBMipQFod...,"{'href': 'https://www.oudaily.com', 'title': '..."


### Data Cleaning

Our Approach: how do we determine if an article is about student protest?

#### Part 1: 

If it contains three components: 

1.) has student related keywords

2.) has protest related keywords

3.) has context related keyword

#### Part 2: 

For headlines with low score, get the first 250 characters of the article using the newspaper package to verify its topic. If the score is still low, we drop it. 

`student-related words:` 
- student*
- college*
- universit*
- school*
- campus*
- faculty

`protest-related words:`
- activis*
- protest*
- encampment
- demonstrat*
- clash
- divest*
- \*war\*

`context-related words:` 
- pro-palestin*
- Israel-Hamas
- Gaza
- pro-israel

In [6]:
keywords = pd.read_excel('news and keywords.xlsx')
keywords = keywords.loc[:,['Keywords (students)','Keywords (protest)', 'Keywords (context)']]
keywords

Unnamed: 0,Keywords (students),Keywords (protest),Keywords (context)
0,sjp,activis.*,palestin.*
1,student.*,protest.*,israel.*
2,universit.*,encamp.*,gaza
3,college.*,demonstrat.*,sjp
4,school.*,clash.*,
...,...,...,...
61,,,
62,,,
63,,,
64,,,


In [140]:
# read in dataset 
df = pd.read_csv('larger_test_data.csv')

In [101]:
keywords = pd.read_excel('news and keywords.xlsx')#['Keywords (v2)'].dropna().to_list()
keywords = keywords.loc[:,['Keywords (students)','Keywords (protest)', 'Keywords (context)']]
keywords.columns = ['student', 'protest', 'context']

# function to score each article
def get_score(text, url):
    
    # initialize dictionary to track subscores
    scores = {
        'student': False,
        'protest': False,
        'context': False
    }
    
    def preprocess(text): 
        # remove everything but words and hyphenated words from text, then turn into list of words
        text = text.strip().lower()
        text = re.sub(r'\W+', ' ', text).split(' ')
        return text
        
        
    def word_score(word): 
        for cat in keywords.columns:
            # get keywords
            keys = keywords[f'{cat}'].dropna().to_list()
            # match to a category 
            for key in keys: 
                pattern = re.compile(fr'{key}')
                match = pattern.fullmatch(word)
                if match: 
                    # check category and only change if it hasn't been checked before 
                    if scores[f'{cat}'] == False: 
                        scores[f'{cat}'] = True
    # preprocessing 
    text = preprocess(text)
    
    for word in text: 
        word_score(word)
        
    # finish this later
    curr_score = sum(scores.values())
    if curr_score > 0 and curr_score < 3:
        try: 
            # get first two sentences of article
            article = Article(url)

            # preprocessing
            article.download()
            article.parse()

            # get the first 250 characters
            new_text = article.text[:250]
            new_text = preprocess(new_text)

            for word in new_text:
                word_score(word)
        except: 
            print('issue with retrieving article: ', url)
        
         
    return sum(scores.values()) #student_score, protest_score, context_score

# create a score column onto the data we collected
df['score'] = df.apply(lambda row: get_score(row['title'], row['url']), axis=1)

issue with retrieving article:  https://news.google.com/rss/articles/CBMia2h0dHBzOi8vandlZWtseS5jb20vMjAyNC8wMy8wNS9zZW5zYXRpb25hbGl6aW5nLWNhbXB1cy1hbnRpc2VtaXRpc20taXNudC1zZXJ2aW5nLWpld2lzaC1zdHVkZW50cy1saWtlLW1pbmUv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMiWWh0dHBzOi8vd3d3Lm5ld3MxMC5jb20vdmlkZW8vc3R1ZGVudHMtcGFyZW50cy1wcm90ZXN0LW91dHNpZGUtc2Nob29sLWluLWNhdHNraWxsLzk0ODQ1MDkv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMid2h0dHBzOi8vd2dudHYuY29tL3dlc3Rlcm4tc3VidXJicy9zdHVkZW50cy1wcm90ZXN0LWF0LWZlbnRvbi1oaWdoLXNjaG9vbC1hbWlkLXNleHVhbC1hYnVzZS1hbGxlZ2F0aW9ucy1hZ2FpbnN0LXN0YWZmZXIv0gEA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vd3d3Lm5ld3MxMC5jb20vbmV3cy9jYXRza2lsbC1zdHVkZW50cy1hbmQtcGFyZW50cy1wcm90ZXN0LW91dHNpZGUtc2Nob29sL9IBVWh0dHBzOi8vd3d3Lm5ld3MxMC5jb20vbmV3cy9jYXRza2lsbC1zdHVkZW50cy1h

issue with retrieving article:  https://news.google.com/rss/articles/CBMiYGh0dHBzOi8vd3d3LnVzZm9yYWNsZS5jb20vMjAyNC8wMy8xOC91c2YtcHJvLXBhbGVzdGluZS1odW5nZXItc3RyaWtlcnMtcmVtb3ZlZC1mcm9tLWJvdC1tZWV0aW5nL9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMiQWh0dHBzOi8vd29ybGRjcnVuY2guY29tL2N1bHR1cmUtc29jaWV0eS95b3V0aHMtY2hhbmdpbmctdGhlLXdvcmxk0gEA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMiZWh0dHBzOi8vd3d3LmxvdWlzdmlsbGVjYXJkaW5hbC5jb20vMjAyNC8wMy9zdHVkZW50cy1wcm90ZXN0LWF0dGFjay1vbi1kaXZlcnNpdHktZXF1aXR5LWFuZC1pbmNsdXNpb24v0gEA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMifGh0dHBzOi8vd3d3LmNhbXB1c3JlZm9ybS5vcmcvYXJ0aWNsZS91bml2ZXJzaXR5LW1lbXBoaXMtcHJvdGVzdGVycy1ibG9jay1zdHVkZW50cy1sZWF2aW5nLWt5bGUtcml0dGVuaG91c2UtZXZlbnQtdmlkZW8vMjUwNTfSAQA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://new

issue with retrieving article:  https://news.google.com/rss/articles/CBMifGh0dHBzOi8vd3d3LnNlYXR0bGV0aW1lcy5jb20vb3Bpbmlvbi9lZGl0b3JpYWxzL3V3LW11c3QtdGFrZS1hY3Rpb24tYWZ0ZXItcHJvdGVzdGVycy12YW5kYWxpemUtcHJvcGVydHktaW50aW1pZGF0ZS1zdHVkZW50cy_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMigQFodHRwczovL2hvb2RsaW5lLmNvbS8yMDI0LzA0L2Jvc3Rvbi1lcnVwdHMtaW4tc3R1ZGVudC1wcm90ZXN0cy1hcy1idS1oYXJ2YXJkLWFuZC1taXQtcmFsbHktaW4tc3VwcG9ydC1vZi1hcnJlc3RlZC1jb2x1bWJpYS1wZWVycy_SAQA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMib2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL3VzL3NvbWUtY29sdW1iaWEtc3R1ZGVudHMtcHJvdGVzdC1lbmNhbXBtZW50LWlzLWxpdmluZy1oaXN0b3J5LWxlc3Nvbi0yMDI0LTA0LTI3L9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en
issue with retrieving article:  https://news.google.com/rss/articles/CBMiYGh0dHBzOi8vd3d3LnBvbGl0aWNvLmNvbS9uZXdzLzIwMjQvMDQvMjYvYmlkZW4tY29uZGVtbnMtY29sdW1iaWEtc3R1ZGVudC1wcm90ZXN0

In [108]:
threes = test[test['score'] == 3]
threes.groupby('url').first().reset_index()

In [144]:
print('filtered out', df.shape[0] - threes.shape[0], 'articles')

filtered out 583 articles


In [132]:
threes.to_csv('final_dataset.csv')

In [147]:
test.to_csv('unfiltered.csv')

# Collecting from Specific News Sites

In [3]:
sites = pd.read_excel('news and keywords.xlsx').loc[:,['News Sites (v2)', 'Score (v2)']].dropna()
sites.head()

Unnamed: 0,News Sites (v2),Score (v2)
0,cnn,-1.0
1,apnews,-0.5
2,wsj,0.5
3,ft,-0.5
4,nbcnews,-1.0


In [9]:
to_collect = sites.iloc[32:]['News Sites (v2)'].to_list()

In [20]:
start = time.time()
# generate dates in 1 week intervals from March 01, 2024 to May 23rd, 2024
dates = pd.date_range(start='4/1/2024', end='5/23/2024', freq='1W')
# turn into tuples
dates = [(date.year, date.month, date.day) for date in dates]


# for each date interval, search and collect data
for i in range(len(dates) - 1): # stop before the last date
    
    # create gnews object 
    news = GNews(language='en', 
                    country='US', 
                    start_date= dates[i], 
                    end_date=dates[i + 1], 
                    max_results=50, 
                   )
    # get news 
    results = news.get_news_by_site('breitbart.com')
    
    # if this is the first iteration
    if i == 0: 
        df = pd.DataFrame(results)
    
    
    # else add to existing df 
    else: 
        new_df = pd.DataFrame(results)
        df = pd.concat([df, new_df])
    
    # increment i 
    i += 1

end = time.time()
print('time elapsed: ', end - start, ' seconds')
df.head()

time elapsed:  43.586013317108154  seconds


Unnamed: 0,title,description,published date,url,publisher
0,Number One Scottie Scheffler Says Jesus Define...,Number One Scottie Scheffler Says Jesus Define...,"Sat, 13 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiaGh0d...,"{'href': 'https://www.breitbart.com', 'title':..."
1,"Gimme Shelter: Inflation Pushing Up Rents, Gar...","Gimme Shelter: Inflation Pushing Up Rents, Gar...","Wed, 10 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMidWh0d...,"{'href': 'https://www.breitbart.com', 'title':..."
2,Left Spent Decade 'Trying to Shut Women up' Sa...,Left Spent Decade 'Trying to Shut Women up' Sa...,"Thu, 11 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMia2h0d...,"{'href': 'https://www.breitbart.com', 'title':..."
3,‘Real Housewives’ Star Lauri Peterson’s Son Jo...,‘Real Housewives’ Star Lauri Peterson’s Son Jo...,"Sun, 07 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMic2h0d...,"{'href': 'https://www.breitbart.com', 'title':..."
4,Kidflation: Daycare and School Prices Soar - B...,Kidflation: Daycare and School Prices Soar Br...,"Wed, 10 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiV2h0d...,"{'href': 'https://www.breitbart.com', 'title':..."


In [25]:
keywords = pd.read_excel('news and keywords.xlsx')#['Keywords (v2)'].dropna().to_list()
keywords = keywords.loc[:,['Keywords (students)','Keywords (protest)', 'Keywords (context)']]
keywords.columns = ['student', 'protest', 'context']

# function to score each article
def get_score(text, url):
    
    # initialize dictionary to track subscores
    scores = {
        'student': False,
        'protest': False,
        'context': False
    }
    
    def preprocess(text): 
        # remove everything but words and hyphenated words from text, then turn into list of words
        text = text.strip().lower()
        text = re.sub(r'\W+', ' ', text).split(' ')
        return text
        
        
    def word_score(word): 
        for cat in keywords.columns:
            # get keywords
            keys = keywords[f'{cat}'].dropna().to_list()
            # match to a category 
            for key in keys: 
                pattern = re.compile(fr'{key}')
                match = pattern.fullmatch(word)
                if match: 
                    # check category and only change if it hasn't been checked before 
                    if scores[f'{cat}'] == False: 
                        scores[f'{cat}'] = True
    # preprocessing 
    text = preprocess(text)
    
    for word in text: 
        word_score(word)
        
#     # finish this later
#     curr_score = sum(scores.values())
#     if curr_score > 0 and curr_score < 3:
#         try: 
#             # get first two sentences of article
#             article = Article(url)

#             # preprocessing
#             article.download()
#             article.parse()

#             # get the first 250 characters
#             new_text = article.text[:250]
#             new_text = preprocess(new_text)

#             for word in new_text:
#                 word_score(word)
#         except: 
#             print('issue with retrieving article: ', url)
        
         
    return sum(scores.values()) #student_score, protest_score, context_score

# create a score column onto the data we collected
df['score'] = df.apply(lambda row: get_score(row['title'], row['url']), axis=1)

In [19]:
to_collect

['breitbart',
 'hannity',
 'theblaze',
 'heritage',
 'washingtonexaminer',
 'dailywire',
 'thefederalist',
 'thegatewaypundit',
 'dailycaller',
 'infowars',
 'stanfordreview',
 'thenewamerican',
 'prntly']

In [36]:
df[df['score'] > 1].reset_index().iloc[8]['title']

'Panic as Automatic Sprinklers Soak Harvard’s Anti-Israel Encampment - Breitbart'

In [29]:
df[df['score'] > 1].reset_index()

Unnamed: 0,index,title,description,published date,url,publisher,score
0,20,Iran’s General Salami Lauds ‘Successful’ Attac...,Iran’s General Salami Lauds ‘Successful’ Attac...,"Sun, 14 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiggFod...,"{'href': 'https://www.breitbart.com', 'title':...",2
1,16,WATCH: Anti-Israel Protesters at Columbia Chan...,WATCH: Anti-Israel Protesters at Columbia Chan...,"Sun, 21 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMicWh0d...,"{'href': 'https://www.breitbart.com', 'title':...",2
2,18,WATCH: Anti-Israel Protesters at Yale Tear Dow...,WATCH: Anti-Israel Protesters at Yale Tear Dow...,"Sun, 21 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMibGh0d...,"{'href': 'https://www.breitbart.com', 'title':...",2
3,32,Iran’s General Salami Lauds ‘Successful’ Attac...,Iran’s General Salami Lauds ‘Successful’ Attac...,"Sun, 14 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiggFod...,"{'href': 'https://www.breitbart.com', 'title':...",2
4,5,Police: Maryland High School Athletic Director...,Police: Maryland High School Athletic Director...,"Fri, 26 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiiwFod...,"{'href': 'https://www.breitbart.com', 'title':...",2
5,15,West Virginia Middle School Girls Walk Out of ...,West Virginia Middle School Girls Walk Out of ...,"Mon, 22 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMiggFod...,"{'href': 'https://www.breitbart.com', 'title':...",2
6,18,Argentina Requests Arrest of Iranian Interior ...,Argentina Requests Arrest of Iranian Interior ...,"Wed, 24 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMijQFod...,"{'href': 'https://www.breitbart.com', 'title':...",2
7,19,Sen. John Fetterman Slams ‘A**Hole’ Anti-Israe...,Sen. John Fetterman Slams ‘A**Hole’ Anti-Israe...,"Mon, 22 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMidGh0d...,"{'href': 'https://www.breitbart.com', 'title':...",2
8,20,Panic as Automatic Sprinklers Soak Harvard’s A...,Panic as Automatic Sprinklers Soak Harvard’s A...,"Thu, 25 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMicmh0d...,"{'href': 'https://www.breitbart.com', 'title':...",2
9,28,Columbia Moves to Virtual Classes as Pro-Pales...,Columbia Moves to Virtual Classes as Pro-Pales...,"Mon, 22 Apr 2024 07:00:00 GMT",https://news.google.com/rss/articles/CBMidWh0d...,"{'href': 'https://www.breitbart.com', 'title':...",2
