# Data Collection
This part contains the extraction and cleaning of data from three Twitter news accounts. 

In [None]:
import tweepy

In [None]:
import pandas as pd 

### Extracting tweets from Twitter API using tweepy

In [None]:
import os

# Read the Twitter API bearer token from the environment rather than embedding it in the
# notebook: a token stored in source is exposed to everyone who can read the file.
# NOTE(review): the token that used to be hard-coded on this line should be revoked.
client = tweepy.Client(bearer_token=os.environ["TWITTER_BEARER_TOKEN"])

In [None]:
# Twitter user id of the CNN Breaking News account (passed as `id` to get_users_tweets)
cnn_breaking_news_id = '428333'

In [None]:
# Twitter user id of the BBC Breaking News account (passed as `id` to get_users_tweets)
bbc_breaking_news_id = '5402612'

In [None]:
# Twitter user id of the Reuters account (passed as `id` to get_users_tweets)
reuters_id = '1652541'

#### Use paginator to retrieve the 3200 tweets for the account
Populate the tweet and the information collected into a list 

In [None]:
    # Page through the CNN timeline 100 tweets per request, stopping after 3200 tweets
    cnn_tweets = tweepy.Paginator(
        client.get_users_tweets,
        id=cnn_breaking_news_id,
        max_results=100,
        tweet_fields=['context_annotations', 'created_at', 'public_metrics'],
    ).flatten(limit=3200)

In [1]:
# Page through the BBC timeline 100 tweets per request, stopping after 3200 tweets
bbc_tweets = tweepy.Paginator(
    client.get_users_tweets,
    id=bbc_breaking_news_id,
    max_results=100,
    tweet_fields=['context_annotations', 'created_at', 'public_metrics'],
).flatten(limit=3200)

NameError: name 'tweepy' is not defined

In [None]:
# Page through the Reuters timeline 100 tweets per request, stopping after 3200 tweets
reu_tweets = tweepy.Paginator(
    client.get_users_tweets,
    id=reuters_id,
    max_results=100,
    tweet_fields=['context_annotations', 'created_at', 'public_metrics'],
).flatten(limit=3200)

In [None]:
# Materialise the paginated CNN tweets into a list so they can be turned into a dataframe
cnn_all_tweets = list(cnn_tweets)

In [None]:
# Materialise the paginated BBC tweets into a list so they can be turned into a dataframe
bbc_all_tweets = list(bbc_tweets)

In [None]:
# Materialise the paginated Reuters tweets into a list so they can be turned into a dataframe
reu_all_tweets = list(reu_tweets)

#### Create dataframe that saves the information of the tweet

In [None]:
"""
This function creates dataframe that takes in the list of all tweets collected using the Tweepy Paginator. 

It has columns that contain different information from the tweet, including
"tweet text" - the text of the Tweet
'# likes' - # of likes the Tweet had (at the time of retrieval)
'# retweets' - # of retweets the Tweet had (at the time of retrieval)
'# replies' - # of replies the Tweet had (at the time of retrieval)
'# quotes' - # of quote retweets the Tweet had (at the time of retrieval)
'links' - the hyperlink of the tweet that links to the full news article
"""

def create_df(all_tweets):
    """Build a dataframe from the list of collected tweets and add derived columns.

    The raw fields are read by name ("text" and "public_metrics") rather than by
    column position, so the result does not depend on the order in which the
    fields happen to appear.

    Added columns, in this order:
        "tweet text"  - the tweet text with newlines removed, cut at the first "http"
        "# likes", "# retweets", "# replies", "# quotes"
                      - counts taken from the tweet's "public_metrics"
                        (0 when a count or the whole metrics entry is missing)
        "links"       - empty-string placeholder; it is not filled in here
        "# Reactions" - sum of the four counts

    Parameters:
        all_tweets: list of tweets (mapping-like objects or dicts).

    Returns:
        pandas.DataFrame with one row per tweet; an empty input yields an empty
        dataframe that still carries the added columns.
    """
    df = pd.DataFrame(all_tweets)

    # output column -> key inside the tweet's "public_metrics" mapping
    metric_columns = {
        "# likes": "like_count",
        "# retweets": "retweet_count",
        "# replies": "reply_count",
        "# quotes": "quote_count",
    }

    if df.empty:
        # nothing to derive from; only create the columns the other steps rely on
        for column in ["tweet text", *metric_columns, "links", "# Reactions"]:
            df[column] = ""
        return df

    # tweet text - everything before the href link
    df["tweet text"] = [
        text.replace('\n', '').split("http")[0] if isinstance(text, str) else ""
        for text in df["text"]
    ]

    for column, key in metric_columns.items():
        df[column] = [
            metrics.get(key, 0) if isinstance(metrics, dict) else 0
            for metrics in df["public_metrics"]
        ]

    df["links"] = ""
    df["# Reactions"] = df[list(metric_columns)].sum(axis=1)

    return df

In [None]:
# Build the CNN dataframe (tweet text, reaction counts, placeholder link column) from the collected tweets
cnn_df = create_df(cnn_all_tweets)

In [None]:
# Build the BBC dataframe (tweet text, reaction counts, placeholder link column) from the collected tweets
bbc_df = create_df(bbc_all_tweets)

In [None]:
# Build the Reuters dataframe (tweet text, reaction counts, placeholder link column) from the collected tweets
reu_df = create_df(reu_all_tweets)

### After every section, save the newest dataframe to csv to ensure that files are being saved, so we don't need to run the code again all the time. 

In [None]:
# Checkpoint the CNN dataframe; the file name must match the one read back with pd.read_csv
cnn_df.to_csv("CNN Breaking News.csv")

In [None]:
# Checkpoint the BBC dataframe so the collection step does not have to be re-run
bbc_df.to_csv("BBC Breaking News.csv")

In [None]:
# Checkpoint the Reuters dataframe so the collection step does not have to be re-run
reu_df.to_csv("Reuters.csv")

### Open Saved CSV Files

In [None]:
# Reload the CNN checkpoint from disk
cnn_df = pd.read_csv("CNN Breaking News.csv")

In [None]:
# Reload the BBC checkpoint from disk
bbc_df = pd.read_csv("BBC Breaking News.csv")

In [None]:
# Reload the Reuters checkpoint from disk
reu_df = pd.read_csv("Reuters.csv")

### Extract the URL Links from the tweet text
The news tweet text contains two parts: <br> 1) The tweet text itself (string format) <br> 2) A hyperlink (in string format) that takes you to the full news article <br> To extract the hyperlink from the tweet text, we use the urlextract library, which finds hyperlinks directly. We collect all of the links and add them to the dataframe.

In [None]:
pip install urlextract

In [None]:
from urlextract import URLExtract

In [None]:
# Drop the 'Unnamed: 0' column that appears when importing the csv.
# Each dataframe is cleaned from itself; deriving cnn_df from reu_df would overwrite the CNN data.
bbc_df = bbc_df.drop(columns=['Unnamed: 0'])
reu_df = reu_df.drop(columns=['Unnamed: 0'])
cnn_df = cnn_df.drop(columns=['Unnamed: 0'])

In [None]:
"""This function extract the url from the tweet text, from an input of dataframe
This doesn't work if the tweet text is a retweet, as it won't trace back to the original tweet. 
But can still collect countries mentioned in the text."""

def extractURL(dataframe):
    """Store the first URL found in each tweet in the "links" column.

    The URL is searched in the raw "text" column when it exists, because
    "tweet text" is cut off at the first "http" when the dataframe is created
    and therefore no longer contains the link. "tweet text" is only used as a
    fallback.

    Rows whose text is missing or contains no URL get None, which the later
    download steps treat as "no hyperlink".

    Parameters:
        dataframe: pandas.DataFrame with a "text" and/or "tweet text" column.

    Returns:
        The same dataframe, modified in place.
    """
    source_column = "text" if "text" in dataframe.columns else "tweet text"

    # an empty "links" column may be numeric (all NaN); make it object so it can hold strings
    if "links" in dataframe.columns:
        dataframe["links"] = dataframe["links"].astype(object)
    else:
        dataframe["links"] = None

    extractor = URLExtract()  # built once; it does not depend on the row

    for index in dataframe.index:
        text = dataframe.at[index, source_column]
        urls = extractor.find_urls(text) if isinstance(text, str) else []
        # one bad row no longer stops the remaining rows from being processed
        dataframe.at[index, "links"] = urls[0] if urls else None

    return dataframe

In [None]:
"""This functions is unique for the tweets from Retuers. 
For the other two news account, their tweets always end with the hyperlink. 
However, Reuters' format for tweet is different, it sometimes contain an emoji after the hyperlink, 
therefore we have to break it down.

Extract the url from the tweet text
Doesn't work if the tweet text is a retweet, as it won't trace back to the original tweet. 
But can still collect countries mentioned in the text."""

def extractREU_URL(dataframe):
    """Store the first URL found in each Reuters tweet in the "links" column.

    Reuters tweets can carry extra content after the hyperlink, so only the
    first URL that is found is kept. A tweet with exactly one URL stores that
    URL itself (not the string form of a one-element list), and a tweet without
    any URL stores None so that the later `pd.isna` checks recognise it.

    The URL is searched in the raw "text" column when it exists, because
    "tweet text" is cut off at the first "http" when the dataframe is created;
    "tweet text" is only used as a fallback.

    Parameters:
        dataframe: pandas.DataFrame with a "text" and/or "tweet text" column.

    Returns:
        The same dataframe, modified in place.
    """
    source_column = "text" if "text" in dataframe.columns else "tweet text"

    # an empty "links" column may be numeric (all NaN); make it object so it can hold strings
    if "links" in dataframe.columns:
        dataframe["links"] = dataframe["links"].astype(object)
    else:
        dataframe["links"] = None

    extractor = URLExtract()  # built once; it does not depend on the row

    for index in dataframe.index:
        text = dataframe.at[index, source_column]
        urls = extractor.find_urls(text) if isinstance(text, str) else []
        dataframe.at[index, "links"] = urls[0] if urls else None

    return dataframe

In [None]:
# Fill the "links" column of the BBC dataframe (the dataframe is modified in place)
extractURL(bbc_df)

In [None]:
# Fill the "links" column of the CNN dataframe (the dataframe is modified in place)
extractURL(cnn_df)

In [None]:
# Fill the "links" column of the Reuters dataframe with the Reuters-specific extractor
extractREU_URL(reu_df)

In [None]:
cnn_df.to_csv("CNN Breaking News.csv") # checkpoint: overwrite the CNN csv with the newly added links

In [None]:
bbc_df.to_csv("BBC Breaking News.csv") # checkpoint: overwrite the BBC csv with the newly added links

In [None]:
reu_df.to_csv("Reuters.csv") # checkpoint: overwrite the Reuters csv with the newly added links

### Extract countries from the text

**Part 1**: After collecting the text of the tweet using Twitter API, we have to find the countries mentioned in the tweet. For the first part, we only look at the geographic locations mentioned in the text. 

In [None]:
# Open the saved CSV files: reload all three dataframes from their checkpoints
cnn_df = pd.read_csv("CNN Breaking News.csv")
bbc_df = pd.read_csv("BBC Breaking News.csv")
reu_df = pd.read_csv("Reuters.csv")

In [None]:
# Use the spaCy library, which applies natural language processing to find geographic locations
import spacy
nlp = spacy.load("en_core_web_sm")

#### Example for spacy 
If we parse the text _"United States, Bethoven, Science, BBC, dogs, Bella"_ through nlp, 
it would return  
>    [('United States', 0, 13, 'GPE'),  
     ('Bethoven', 15, 23, 'GPE'),  
     ('Science', 25, 32, 'ORG'),  
     ('BBC', 34, 37, 'ORG'),  
     ('Bella', 45, 50, 'PERSON')]
     
The array includes the phrase, start_char, end_char, and label.

In [None]:
# Create an empty "Countries from text" column in each dataframe; find_places populates it later
cnn_df["Countries from text"] = ""
bbc_df["Countries from text"] = ""
reu_df["Countries from text"] = ""

In [None]:
"""
This function takes in the dataframe, looks at the text collected previously, 
and return a new dataframe with places mentioned in the tweet text

It uses the library spacy that retrieve information of all the words, using natural language processing

"""
def _clean_place_name(place):
    """Normalise a place name into a form the geocoding step recognises."""
    place = place.split("'")[0]  # "China's" -> "China"
    # only a leading article is removed, in either case ("the United States", "The Hague");
    # a "the" in the middle of a name ("Republic of the Congo") is part of the name
    if place.lower().startswith("the "):
        place = place[len("the "):]
    # we don't want locations like "Zhejiang province"
    place = place.replace("province", "")
    return place.strip()


def find_places(dataframe):
    """Fill "Countries from text" with the places named in each row's "tweet text".

    Every entity that spaCy labels "GPE" is cleaned with `_clean_place_name` and
    collected; repeated mentions are kept. Rows whose text is not a string
    (e.g. missing values) are left untouched.

    Parameters:
        dataframe: pandas.DataFrame with a "tweet text" column.

    Returns:
        The same dataframe, modified in place.
    """
    # the column has to exist and be able to hold list objects
    if "Countries from text" in dataframe.columns:
        dataframe["Countries from text"] = dataframe["Countries from text"].astype(object)
    else:
        dataframe["Countries from text"] = ""

    for index in dataframe.index:
        text = dataframe.at[index, "tweet text"]
        if not isinstance(text, str):
            continue

        final_places = []
        for ent in nlp(text).ents:
            if ent.label_ != "GPE":
                continue
            place = _clean_place_name(ent.text)
            if place:  # cleaning can leave nothing behind
                final_places.append(place)

        dataframe.at[index, "Countries from text"] = final_places

    return dataframe

In [None]:
# Populate "Countries from text" for CNN and preview the first rows
find_places(cnn_df).head()

In [None]:
# Populate "Countries from text" for BBC and preview the first rows
find_places(bbc_df).head()

In [None]:
# Populate "Countries from text" for Reuters and preview the first rows
find_places(reu_df).head()

In [None]:
cnn_df.to_csv("CNN Breaking News.csv") # checkpoint: save the CNN dataframe with the places found in the tweet text

In [None]:
bbc_df.to_csv("BBC Breaking News.csv") # checkpoint: save the BBC dataframe with the places found in the tweet text

In [None]:
reu_df.to_csv("Reuters.csv") # checkpoint: save the Reuters dataframe with the places found in the tweet text

### Extract the news article text from the URL Links via web scraping for CNN
**Part 2**: Extract the text from the news article that was in a hyperlink from the Tweet. <br>
Since each news website has a different format for its articles, we have to inspect each website.

This doesn't work under a few circumstances: <br>
1. Tweets that are retweeted, we cannot trace back to the original link 
2. "Live Update" tweets on CNN Breaking News, as there is no hyperlink for these tweets

Step 1: Access the URL text via link using Beautiful Soup

In [None]:
from bs4 import BeautifulSoup as BS
import requests

In [None]:
def getHTMLPage(url):
    """Given a url, get the HTML page content.

    Returns:
        The response body as text when the server answers with status 200.
        None when the request cannot be made (invalid url, connection error,
        timeout) or the server answers with any other status; the reason is
        printed in both cases.
    """
    try:
        # without a timeout an unresponsive server would block the loop indefinitely
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print("Error: request for {} failed: {}".format(url, err))
        return None

    if response.status_code == 200:
        return response.text

    print("Error: {}. Failure resaon: {}".format(response.status_code,
                                                 response.reason))
    return None

In [None]:
"""This functions takes in the url from CNN tweets and gets the text of the article from the CNN Website"""

def getURLText_CNN(url):
    """Return the article text behind a CNN link, or "" when the page could not be fetched."""
    html = getHTMLPage(url)
    if html is None:
        return ""

    soup = BS(html, 'html.parser')

    # the opening paragraph is a plain 'p' element recognisable by its "(CNN" dateline
    opening = [node.text for node in soup.find_all('p') if "(CNN" in node.text]
    # the remaining paragraphs live in 'div' elements of this class
    remainder = [node.text for node in soup.find_all('div', {'class': 'zn-body__paragraph'})]

    return "".join(opening + remainder)
   

In [None]:
"""Use getURLText_CNN to find the concatenated paragraphs and save them to a text file"""

directory = "/Users/jennychan/Desktop/CS 234 Final Project/CNN Breaking News/" #set file path

for index in cnn_df.index:
    ID = cnn_df.at[index,"id"]
    name = "{}{}.txt".format(directory, ID) #save name as tweetid
    url = cnn_df.at[index,"links"]

    if isinstance(url, (list, tuple)): # links still held as a list: use the first one
        url = url[0] if url else None

    # A missing link is NaN once the dataframe has been through CSV, which `== None`
    # does not catch, so test for "is a non-empty string" instead.
    if isinstance(url, str) and url:
        try:
            text = getURLText_CNN(url) or "" #get the text from the hyperlink
        except Exception as err:
            # keep going: one unreachable article should not stop the whole run
            print("Could not fetch article for tweet {}: {}".format(ID, err))
            text = ""
    else:
        text = "None" #there is no hyperlink in the dataframe

    with open(name, 'w', encoding="utf-8") as f:
        f.write(text)

### Extract the news article text from the URL Links via web scraping for BBC
**Part 2**: Extract the text from the news article that was in a hyperlink from the Tweet. <br>
Since each news website has a different format for its articles, we have to inspect each website.

This doesn't work under a few circumstances: <br>
1. Tweets that are retweeted, we cannot trace back to the original link 
2. "Live Update" tweets on BBC Breaking News, as there is no hyperlink for these tweets

In [None]:
"""This functions takes in the url from BBC tweets and gets the text of the article from the BBC Website"""

def getURLText_BBC(url):
    """Return the article text behind a BBC link.

    Parameters:
        url: address of the BBC article.

    Returns:
        The text of all matching paragraphs joined into one string, or "" when
        the page could not be fetched (broken link) or has no matching paragraph.
    """
    page = getHTMLPage(url)
    if page is None:
        # broken link: there is nothing to parse
        return ""

    domTree = BS(page, 'html.parser')
    ps = domTree.find_all('p', {'class': 'ssrcss-1q0x1qg-Paragraph eq5iqo00'}) #under this class in 'p'

    return "".join(p.text for p in ps) #joins all paragraphs together

In [None]:
"""Use getURLText_BBC to find the concatenated paragraphs and save them to a text file"""

directory = "/Users/jennychan/Desktop/CS 234 Final Project/BBC Breaking News/" #set file path

for index in bbc_df.index:
    ID = bbc_df.at[index,"id"]
    name = "{}{}.txt".format(directory, ID)
    url = bbc_df.at[index,"links"]

    if isinstance(url, (list, tuple)): # links still held as a list: use the first one
        url = url[0] if url else None

    if isinstance(url, str) and url:
        try:
            text = getURLText_BBC(url) or "" #get the text from the hyperlink
        except Exception as err:
            # keep going, but say which article failed instead of skipping it silently
            print("Could not fetch article for tweet {}: {}".format(ID, err))
            text = ""
    else:
        text = "None" #there is no hyperlink in the dataframe

    # always write the file so the later step that opens it finds one for every tweet
    with open(name, 'w', encoding="utf-8") as f:
        f.write(text)

### Extract the news article text from the URL Links via web scraping for Reuters
**Part 2**: Extract the text from the news article that was in a hyperlink from the Tweet. <br>
Since each news website has a different format for its articles, we have to inspect each website.

This doesn't work under a few circumstances: <br>
1. Tweets that are retweeted, we cannot trace back to the original link 

In [None]:
def getURLText_Reu(url):
    """Return the article text behind a Reuters link.

    Parameters:
        url: address of the Reuters article.

    Returns:
        The text of all matching paragraphs joined with single spaces, or ""
        when the page could not be fetched (broken link) or has no matching
        paragraph.
    """
    page = getHTMLPage(url)
    if page is None:
        # broken link: there is nothing to parse
        return ""

    domTree = BS(page, 'html.parser')

    #paragraphs are under this class under 'p'
    ps = domTree.find_all('p',{'class':'Text__text___3eVx1j Text__dark-grey___AS2I_p Text__regular___Bh17t- Text__large___1i0u1F Body__base___25kqPt Body__large_body___3g04wK ArticleBody__element___3UrnEs'})

    return " ".join(p.text for p in ps)

In [None]:
"""Use getURLText_Reu to find the concatenated paragraphs and save them to a text file"""

directory = "/Users/jennychan/Desktop/CS 234 Final Project/Reuters/" #set file path

for index in reu_df.index:
    ID = reu_df.at[index,"id"]
    name = "{}{}.txt".format(directory, ID)
    url = reu_df.at[index,"links"]

    if isinstance(url, (list, tuple)): # links still held as a list: use the first one
        url = url[0] if url else None

    if isinstance(url, str) and url:
        try:
            text = getURLText_Reu(url) or "" #get the text from the hyperlink
        except Exception as err:
            # keep going, but say which article failed instead of skipping it silently
            print("Could not fetch article for tweet {}: {}".format(ID, err))
            text = ""
    else:
        text = "None" #there is no hyperlink in the dataframe

    # always write the file so the later step that opens it finds one for every tweet
    with open(name, 'w', encoding="utf-8") as f:
        f.write(text)


### Extract country/countries mentioned from the news article in the save text file
If the tweet didn't mention any countries in the text, we check for the countries mentioned in the news article that we have saved. People tend to look at the tweet itself first and react based on it. Therefore, we save the countries mentioned in the tweet and don't look into the article if the tweet already mentions countries. Otherwise, we look at the article for the countries mentioned. Sometimes the tweet is also about a country/place that the NLP model didn't pick up, so we look into the article for further context.

In [None]:
from collections import Counter

In [None]:
# Create an empty "Final Countries" column in each dataframe; it is filled in by the
# per-account loops further down
cnn_df["Final Countries"] = ""
bbc_df["Final Countries"] = ""
reu_df["Final Countries"] = ""

In [None]:
"""
This function takes in all the countries mentioned (can repeat) in the text that was stored in a list, 
it uses to return a list that contains the two countries that are most frequently in the text.

"""
def top_two_places(all_countries):
    """Return a list with the (at most) two most frequent entries, most frequent first."""
    ranked = Counter(all_countries).most_common(2)  # list of (entry, count) pairs
    return [entry for entry, _count in ranked]

In [None]:
from geopy.geocoders import Nominatim

In [None]:
"""
This function uses the library geopy.
It takes in a city/state/location mentioned and returns the country of the location

"""
# Results of earlier lookups, keyed by place name, so that a place mentioned in many
# tweets is only sent to the geocoding service once.
_COUNTRY_CACHE = {}


def country_from_city(place):
    """Return the country a city/state/location belongs to.

    Parameters:
        place: name of the location.

    Returns:
        The last comma-separated part of the address reported by the geocoder
        (the country, in English), or None when `place` is not a non-empty
        string, the geocoder finds nothing, or the lookup fails.
    """
    if not isinstance(place, str) or not place.strip():
        return None
    if place in _COUNTRY_CACHE:
        return _COUNTRY_CACHE[place]

    from geopy.exc import GeopyError

    try:
        geolocator = Nominatim(user_agent = "geoapiExercises") #initiate the geolocator that looks for the country using the 'geoapi'
        location = geolocator.geocode(place, language="en") #language is english, or else it returns country names in their local language
    except GeopyError as err:
        # not cached: a timeout or service error may succeed on a later attempt
        print("Could not geocode {!r}: {}".format(place, err))
        return None

    if location is None: #the geocoder does not know this place
        final_country = None
    else:
        full_address = location.address #gives the full address of the place (including city, states, provinces etc)
        final_country = full_address.split(",")[-1].strip() #only want the last part, which is the country

    _COUNTRY_CACHE[place] = final_country
    return final_country

In [None]:
"""Extract Countries for all tweet based on the text or from the article in the hyperlink"""

import ast

directory = "/Users/jennychan/Desktop/CS 234 Final Project/CNN Breaking News/"


def _tweet_places(value):
    """Return the places held in a "Countries from text" cell as a list.

    The cell holds a list while the dataframe is in memory, and the string form
    of that list (e.g. "['France']") after the dataframe has been reloaded from
    CSV. Anything else (empty string, missing value) means "no places".
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.strip().startswith("["):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
        return parsed if isinstance(parsed, list) else []
    return []


def _article_places(path):
    """Return the places named in the article saved at `path` ([] if it cannot be read)."""
    try:
        with open(path, encoding="utf-8") as f:
            article = f.read()
    except (OSError, UnicodeDecodeError):
        return []
    # find_places works on a dataframe, so the article is wrapped in a one-row frame
    scratch = pd.DataFrame({"tweet text": [article], "Countries from text": [""]})
    places = find_places(scratch).at[0, "Countries from text"]
    return places if isinstance(places, list) else []


for index in cnn_df.index:
    places = _tweet_places(cnn_df.at[index, "Countries from text"])

    if not places: #the tweet text mentions no place: look into the saved article
        tweetID = cnn_df.at[index, "id"]
        places = top_two_places(_article_places('{}{}.txt'.format(directory, tweetID)))

    countries_mentioned = []
    for place in places:
        country = country_from_city(place)
        #skip places that could not be resolved, and don't double add
        if country is not None and country not in countries_mentioned:
            countries_mentioned.append(country)

    cnn_df.at[index, "Final Countries"] = countries_mentioned

In [None]:
"""Same as above, but for BBC
Extract Countries for all tweet based on the text or from the article in the hyperlink or the tweet text"""

import ast

bbc_directory = "/Users/jennychan/Desktop/CS 234 Final Project/BBC Breaking News/"


def _tweet_places(value):
    """Return the places held in a "Countries from text" cell as a list.

    The cell holds a list while the dataframe is in memory, and the string form
    of that list (e.g. "['France']") after the dataframe has been reloaded from
    CSV. Anything else (empty string, missing value) means "no places".
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.strip().startswith("["):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
        return parsed if isinstance(parsed, list) else []
    return []


def _article_places(path):
    """Return the places named in the article saved at `path` ([] if it cannot be read)."""
    try:
        with open(path, encoding="utf-8") as f:
            article = f.read()
    except (OSError, UnicodeDecodeError):
        return []
    # find_places works on a dataframe, so the article is wrapped in a one-row frame;
    # this looks at the article just read rather than re-scanning the whole tweet dataframe
    scratch = pd.DataFrame({"tweet text": [article], "Countries from text": [""]})
    places = find_places(scratch).at[0, "Countries from text"]
    return places if isinstance(places, list) else []


for index in bbc_df.index:
    places = _tweet_places(bbc_df.at[index, "Countries from text"])

    if not places: #the tweet text mentions no place: look into the saved article
        tweetID = bbc_df.at[index, "id"]
        places = top_two_places(_article_places('{}{}.txt'.format(bbc_directory, tweetID)))

    countries_mentioned = []
    for place in places:
        country = country_from_city(place)
        #skip places that could not be resolved, and don't double add
        if country is not None and country not in countries_mentioned:
            countries_mentioned.append(country)

    bbc_df.at[index, "Final Countries"] = countries_mentioned

In [None]:
"""Same as above, but for Reuter.
Extract Countries for all tweet based on the text or from the article in the hyperlink or the tweet text"""

import ast

directory =  "/Users/jennychan/Desktop/CS 234 Final Project/Reuters/"


def _tweet_places(value):
    """Return the places held in a "Countries from text" cell as a list.

    The cell holds a list while the dataframe is in memory, and the string form
    of that list (e.g. "['France']") after the dataframe has been reloaded from
    CSV. Anything else (empty string, missing value) means "no places".
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.strip().startswith("["):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
        return parsed if isinstance(parsed, list) else []
    return []


def _article_places(path):
    """Return the places named in the article saved at `path` ([] if it cannot be read)."""
    try:
        with open(path, encoding="utf-8") as f:
            article = f.read()
    except (OSError, UnicodeDecodeError):
        return []
    # find_places works on a dataframe, so the article is wrapped in a one-row frame
    scratch = pd.DataFrame({"tweet text": [article], "Countries from text": [""]})
    places = find_places(scratch).at[0, "Countries from text"]
    return places if isinstance(places, list) else []


for index in reu_df.index:
    places = _tweet_places(reu_df.at[index, "Countries from text"])

    if not places: #the tweet text mentions no place: open the saved Reuters article
        tweetID = reu_df.at[index, "id"]
        places = top_two_places(_article_places('{}{}.txt'.format(directory, tweetID)))

    countries_mentioned = []
    for place in places:
        country = country_from_city(place)
        #skip places that could not be resolved, and don't double add
        if country is not None and country not in countries_mentioned:
            countries_mentioned.append(country)

    reu_df.at[index, "Final Countries"] = countries_mentioned

In [None]:
cnn_df.to_csv("CNN Breaking News.csv") # save the CNN dataframe, now including "Final Countries", to csv

In [None]:
bbc_df.to_csv("BBC Breaking News.csv") # save the BBC dataframe, now including "Final Countries", to csv

In [None]:
reu_df.to_csv("Reuters.csv") # save the Reuters dataframe, now including "Final Countries", to csv

## Extract the income level of the countries
This part uses the world bank wbdata library based on the World Bank Data. It takes in the country and finds the income level according to the World Bank's metrics on GNI per capita of the country. 

In [None]:
import wbdata 

In [None]:
"""This function takes in the dataframe and finds all the unique countries mentioned in the 'Final Countries' column"""

def countries_set(dataframe):
    """Collect every distinct entry of the "Final Countries" column into one set."""
    unique_countries = set()

    for row_label in range(len(dataframe)):
        # each cell is iterated member by member and folded into the running set
        unique_countries.update(dataframe.at[row_label, "Final Countries"])

    return unique_countries

In [None]:
cnn_set = countries_set(cnn_df)
cnn_set.discard('') # drop the empty name if present; remove() would raise KeyError when it is absent
country_cnn = pd.DataFrame() #populate to new dataframe
len(cnn_set)

In [None]:
# Unique countries mentioned across the BBC tweets, plus an empty dataframe to hold the per-country results
bbc_set = countries_set(bbc_df)
country_bbc = pd.DataFrame()
len(bbc_set)

In [None]:
# Unique countries mentioned across the Reuters tweets, plus an empty dataframe to hold
# the per-country results (the CNN and BBC cells create theirs the same way)
reu_set = countries_set(reu_df)
country_reu = pd.DataFrame()
len(reu_set)

In [49]:
"""
This function takes in the set of unique countries and the original dataframe from Twitter. 
For each country in the unique countries set, 
it finds the average number of responses of each post that mentioned the country, 
and returns a dictionary that has the country as the key, and the mean number as the value. 
"""
def countries_reactions(countries_set, dataframe):
    """Return the mean number of reactions per tweet for every country.

    Parameters:
        countries_set: the countries to report on.
        dataframe: tweet dataframe with a "Final Countries" column (a list of
            countries per tweet) and a numeric "# Reactions" column.

    Returns:
        dict mapping each country to the mean "# Reactions" of the tweets that
        mention it, rounded to 0 decimals. A country that no tweet mentions
        maps to NaN instead of raising ZeroDivisionError.
    """
    totals = {country: 0 for country in countries_set}
    counts = dict.fromkeys(totals, 0)

    # one pass over the tweets instead of one pass per country
    for countries, reactions in zip(dataframe["Final Countries"], dataframe["# Reactions"]):
        if not isinstance(countries, (list, tuple, set)):
            continue # no usable country list for this tweet
        for country_mentioned in countries:
            if country_mentioned in totals:
                counts[country_mentioned] += 1 #count the number of tweets that include this country
                totals[country_mentioned] += reactions #add the reactions to the total number of responses

    return {
        country: round(totals[country] / counts[country], 0) if counts[country] else float("nan")
        for country in totals
    }

In [50]:
"""
The function takes in three parameters:
1. countries_set - the set of unique countries from the dataframe
2. df_name - create a new dataframe named df_name
3. in_dataframe - the original dataframe that contains information for twitter (use to run the countries_reaction function)
"""

def countries_level(countries_set, df_name, in_dataframe):
    """Build a per-country dataframe of mean reactions and World Bank income level.

    Parameters:
        countries_set: the set of unique countries from the dataframe.
        df_name: kept for backward compatibility; the value passed in is not
            used, a new dataframe is always created.
        in_dataframe: the tweet dataframe handed to countries_reactions.

    Returns:
        pandas.DataFrame with the columns "Countries", "# of Reactions" and
        "Income Level". "Income Level" is None for countries whose lookup
        failed; the column exists even when every lookup fails.
    """
    coun_dic = countries_reactions(countries_set, in_dataframe) #dictionary country -> mean reactions
    df_name = pd.DataFrame(list(coun_dic.items()), columns=['Countries', '# of Reactions'])
    df_name["Income Level"] = None

    for index in df_name.index: #iterate through all countries in the dataframe
        country = df_name.at[index, "Countries"]

        try:
            # the first search result is taken as the match for this country
            country_income = wbdata.search_countries(country)[0]["incomeLevel"]["value"]
        except Exception as err:
            # best effort: report the country that could not be resolved and move on
            print("No income level found for {!r}: {}".format(country, err))
            continue

        df_name.at[index, "Income Level"] = country_income

    return df_name

In [None]:
# Keep the result: it is written to "Reuters Countries.csv" below. The second argument is
# only a placeholder, because countries_level builds a new dataframe itself.
country_reu = countries_level(reu_set, pd.DataFrame(), reu_df)

In [51]:
# Per-country mean reactions and income level for the CNN tweets
country_cnn = countries_level(cnn_set,country_cnn,cnn_df)

In [None]:
# Per-country mean reactions and income level for the BBC tweets
country_bbc = countries_level(bbc_set, country_bbc, bbc_df)

In [None]:
country_reu.to_csv("Reuters Countries.csv") # save the per-country Reuters results to a new csv used by other parts

In [None]:
country_bbc.to_csv("BBC Countries.csv") # save the per-country BBC results to a new csv used by other parts

In [52]:
country_cnn.to_csv("CNN Countries.csv") # save the per-country CNN results to a new csv used by other parts