In [1]:
from gnews import GNews
import pandas as pd
import numpy as np
from newspaper import Article
import re
import time

## Using GNews to collect news article headlines

https://github.com/ranahaani/GNews   

In [None]:
start = time.time()
# generate dates in 1 week intervals from March 01, 2024 to May 23rd, 2024
dates = pd.date_range(start='3/1/2024', end='5/23/2024', freq='1W')
# turn into tuples
dates = [(date.year, date.month, date.day) for date in dates]


# for each date interval, search and collect data
for i in range(len(dates) - 1): # stop before the last date
    
    # create gnews object 
    news = GNews(language='en', 
                    country='US', 
                    start_date= dates[i], 
                    end_date=dates[i + 1], 
                    max_results=100, 
                   )
    # get news 
    results = news.get_news('student protest')
    
    # if this is the first iteration
    if i == 0: 
        df = pd.DataFrame(results)
    
    
    # else add to existing df 
    else: 
        new_df = pd.DataFrame(results)
        df = pd.concat([df, new_df])
    
    # increment i 
    i += 1

end = time.time()
print('time elapsed: ', end - start, ' seconds')
df.head()

### Data Cleaning

Our Approach: how do we determine if an article is about student protest?

#### Part 1: 

If it contains three components: 

1.) has student related keywords

2.) has protest related keywords

3.) has context related keyword

#### Part 2: 

For headlines with low score, get the first 250 characters of the article using the newspaper package to verify its topic. If the score is still low, we drop it. 

`student-related words:` 
- student*
- college*
- universit*
- school*
- campus*
- faculty

`protest-related words:`
- activis*
- protest*
- encampment
- demonstrat*
- clash
- divest*
- \*war\*

`context-related words:` 
- pro-palestin*
- Israel-Hamas
- Gaza
- pro-israel

In [None]:
keywords = pd.read_excel('news and keywords.xlsx')
keywords = keywords.loc[:,['Keywords (students)','Keywords (protest)', 'Keywords (context)']]
keywords

In [None]:
# read in dataset 
df = pd.read_csv('larger_test_data.csv')

In [None]:
keywords = pd.read_excel('news and keywords.xlsx')#['Keywords (v2)'].dropna().to_list()
keywords = keywords.loc[:,['Keywords (students)','Keywords (protest)', 'Keywords (context)']]
keywords.columns = ['student', 'protest', 'context']

# function to score each article
def get_score(text, url):
    
    # initialize dictionary to track subscores
    scores = {
        'student': False,
        'protest': False,
        'context': False
    }
    
    def preprocess(text): 
        # remove everything but words and hyphenated words from text, then turn into list of words
        text = text.strip().lower()
        text = re.sub(r'\W+', ' ', text).split(' ')
        return text
        
        
    def word_score(word): 
        for cat in keywords.columns:
            # get keywords
            keys = keywords[f'{cat}'].dropna().to_list()
            # match to a category 
            for key in keys: 
                pattern = re.compile(fr'{key}')
                match = pattern.fullmatch(word)
                if match: 
                    # check category and only change if it hasn't been checked before 
                    if scores[f'{cat}'] == False: 
                        scores[f'{cat}'] = True
    # preprocessing 
    text = preprocess(text)
    
    for word in text: 
        word_score(word)
        
    # finish this later
    curr_score = sum(scores.values())
    if curr_score > 0 and curr_score < 3:
        try: 
            # get first two sentences of article
            article = Article(url)

            # preprocessing
            article.download()
            article.parse()

            # get the first 250 characters
            new_text = article.text[:250]
            new_text = preprocess(new_text)

            for word in new_text:
                word_score(word)
        except: 
            print('issue with retrieving article: ', url)
        
         
    return sum(scores.values()) #student_score, protest_score, context_score

# create a score column onto the data we collected
df['score'] = df.apply(lambda row: get_score(row['title'], row['url']), axis=1)

In [None]:
threes = test[test['score'] == 3]
threes.groupby('url').first().reset_index()

In [None]:
print('filtered out', df.shape[0] - threes.shape[0], 'articles')

In [None]:
threes.to_csv('final_dataset.csv')

In [None]:
test.to_csv('unfiltered.csv')

# Collecting from Specific News Sites

In [2]:
sites = pd.read_excel('news and keywords.xlsx').loc[:,['News Sites (v2)', 'Score (v2)']].dropna()
sites.head()

Unnamed: 0,News Sites (v2),Score (v2)
0,cnn,-1.0
1,apnews,-0.5
2,wsj,0.5
3,ft,-0.5
4,nbcnews,-1.0


In [10]:
to_collect

['breitbart',
 'hannity',
 'theblaze',
 'heritage',
 'washingtonexaminer',
 'dailywire',
 'thefederalist',
 'thegatewaypundit',
 'dailycaller',
 'infowars',
 'stanfordreview',
 'thenewamerican',
 'prntly']

In [None]:
['breitbart',
 'hannity',
 'theblaze',
 'washingtonexaminer',
 'dailywire',
 'thefederalist',
 'thegatewaypundit',
 'dailycaller',
 'infowars',
 'thenewamerican',
 'prntly']

In [11]:
keywords = pd.read_excel('news and keywords.xlsx')
keywords = keywords.loc[:,['Keywords (students)','Keywords (protest)', 'Keywords (context)']]
keywords.columns = ['student', 'protest', 'context']

to_collect = sites.iloc[32:]['News Sites (v2)'].to_list()
to_collect = ['breitbart',
 'hannity',
 'theblaze',
 'washingtonexaminer',
 'dailywire',
 'thefederalist',
 'thegatewaypundit',
 'dailycaller',
 'infowars',
 'thenewamerican',
 'prntly']

# generate dates in 1 week intervals from April 01, 2024 to May 23rd, 2024
dates = pd.date_range(start='4/1/2024', end='5/23/2024', freq='1W')
# turn into tuples
dates = [(date.year, date.month, date.day) for date in dates]


# function to score each article
def simple_score(text):
    
    # initialize dictionary to track subscores
    scores = {
        'student': False,
        'protest': False,
        'context': False
    }
    
    def preprocess(text): 
        # remove everything but words and hyphenated words from text, then turn into list of words
        text = text.strip().lower()
        text = re.sub(r'\W+', ' ', text).split(' ')
        return text
        
        
    def word_score(word): 
        for cat in keywords.columns:
            # get keywords
            keys = keywords[f'{cat}'].dropna().to_list()
            # match to a category 
            for key in keys: 
                pattern = re.compile(fr'{key}')
                match = pattern.fullmatch(word)
                if match: 
                    # check category and only change if it hasn't been checked before 
                    if scores[f'{cat}'] == False: 
                        scores[f'{cat}'] = True
    # preprocessing 
    text = preprocess(text)
    
    for word in text: 
        word_score(word)
        
         
    return sum(scores.values())


def collect_news(news_site): 
    print('starting ', news_site)
    # for each date interval, search and collect data
    for i in range(len(dates) - 1): # stop before the last date

        # create gnews object 
        news = GNews(language='en', 
                        country='US', 
                        start_date= dates[i], 
                        end_date=dates[i + 1], 
                        max_results=50, 
                       )
        # get news 
        results = news.get_news_by_site(f'{news_site}.com')
        
        # if this is the first iteration
        if i == 0: 
            to_return = pd.DataFrame(results)


        # else add to existing df 
        else: 
            sub_results = pd.DataFrame(results)
            to_return = pd.concat([to_return, sub_results])

        # increment i 
        i += 1
        
    to_return['score'] = to_return['title'].apply(simple_score)
    to_return = to_return[to_return['score'] == 3]
    
    
    return to_return

for i in range(len(to_collect)): 
    site_df = collect_news(to_collect[i])

    if i == 0: 
        final_df = site_df
    
    else: 
        final_df = pd.concat([final_df, site_df])
    
    
final_df.head()

starting  breitbart
starting  hannity
starting  theblaze
starting  washingtonexaminer
starting  dailywire
starting  thefederalist
starting  thegatewaypundit
starting  dailycaller
starting  infowars
starting  thenewamerican
starting  prntly


KeyError: 'title'

In [19]:
old_df = pd.read_csv('final_dataset.csv', index_col=0)
new_df = pd.concat([old_df, final_df])
new_df.to_csv('final_dataset.csv')