### Collect data for mapping

In [1]:
import datetime as dt
import pandas as pd
import re
from twitterscraper import query_tweets
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Get tweets based on keywords
def get_tweets(list_of_keywords, limit, begindate):
    """Scrape English-language tweets for each keyword and flatten them into records.

    Parameters
    ----------
    list_of_keywords : iterable of str
        Search queries; each one is passed to ``query_tweets`` in turn.
    limit : int
        Forwarded unchanged to ``query_tweets`` as ``limit`` for every keyword.
    begindate : datetime.date
        Forwarded unchanged to ``query_tweets`` as ``begindate``.

    Returns
    -------
    list of dict
        One ``{'user', 'text', 'time'}`` dict per returned tweet, grouped in
        keyword order. Tweets returned for more than one keyword are not
        de-duplicated here.
    """
    records = []

    for query in list_of_keywords:
        found = query_tweets(query, limit=limit, begindate=begindate, lang='en')
        # Keep only the three attributes used downstream
        records.extend(
            {'user': item.username, 'text': item.text, 'time': item.timestamp}
            for item in found
        )

    return records

In [3]:
tweets_list = get_tweets(
    # Search terms related to wildfires in California
    list_of_keywords=['#applefire', 'hogfire', 'goldfire', 'wildfire near:california'],
    limit=100,
    begindate=dt.date(year=2020, month=8, day=1),
)

In [4]:
# Build one row per scraped tweet; the columns come from the dict keys (user, text, time)
tweets_df = pd.DataFrame(tweets_list)

In [173]:
# Preview the scraped tweets
tweets_df

Unnamed: 0,user,text,time
0,The Desert Way,Glad we attended Pioneertown's meeting on 8/6....,2020-08-08 23:43:05
1,Warning Watch | San Diego East County,#RT @NWSSanDiego: RT @SouthCoastAQMD: SMOKE AD...,2020-08-08 23:39:18
2,tayagrayphoto,Today marks my one-year anniversary of full ti...,2020-08-08 23:27:10
3,South Coast AQMD,SMOKE ADVISORY UPDATE: #AppleFire Advisory ext...,2020-08-08 23:18:16
4,Rob ®,This @MyDesert photo of the #AppleFire as seen...,2020-08-08 22:54:08
...,...,...,...
853,High Desert People,Shot from Apple Valley. Smoke covers an almost...,2020-08-03 03:18:52
854,Laurian Green,The desert is on fire. #wildfire #california #...,2020-08-03 02:27:28
855,Lioness pride | 🏳️‍🌈 ✊🏿✊🏾✊🏽✊🏼✊🏻,"Aw heck, be safe. \nWildfires/brush fires in g...",2020-08-03 01:16:42
856,IzzySkr33min,Thankful for these brave fire fighters who def...,2020-08-03 00:28:43


In [174]:
# Remove rows that are exact duplicates across all columns; this mutates tweets_df in place
tweets_df.drop_duplicates(inplace=True)

In [175]:
# Preview the frame again after de-duplication
tweets_df

Unnamed: 0,user,text,time
0,The Desert Way,Glad we attended Pioneertown's meeting on 8/6....,2020-08-08 23:43:05
1,Warning Watch | San Diego East County,#RT @NWSSanDiego: RT @SouthCoastAQMD: SMOKE AD...,2020-08-08 23:39:18
2,tayagrayphoto,Today marks my one-year anniversary of full ti...,2020-08-08 23:27:10
3,South Coast AQMD,SMOKE ADVISORY UPDATE: #AppleFire Advisory ext...,2020-08-08 23:18:16
4,Rob ®,This @MyDesert photo of the #AppleFire as seen...,2020-08-08 22:54:08
...,...,...,...
853,High Desert People,Shot from Apple Valley. Smoke covers an almost...,2020-08-03 03:18:52
854,Laurian Green,The desert is on fire. #wildfire #california #...,2020-08-03 02:27:28
855,Lioness pride | 🏳️‍🌈 ✊🏿✊🏾✊🏽✊🏼✊🏻,"Aw heck, be safe. \nWildfires/brush fires in g...",2020-08-03 01:16:42
856,IzzySkr33min,Thankful for these brave fire fighters who def...,2020-08-03 00:28:43


In [176]:
# Save the de-duplicated tweets; index=False leaves the row index out of the CSV
tweets_df.to_csv('../data/tweets_for_mapping_raw.csv', index=False)

#### We couldn't get geolocation data using twitterscraper, and we couldn't use the Twitter API. We decided to extract location data by reading the tweets instead. If a tweet in tweets_for_mapping_raw.csv contains clear location information, we add its latitude and longitude to the file using Google Sheets.

In [19]:
# Load the tweets CSV that carries the extra latitude / longitude / label columns
with_location = pd.read_csv('../data/tweets_for_mapping_with_location.csv')

In [20]:
# Preview the tweets together with their location columns
with_location

Unnamed: 0,user,text,time,latitude,longitude,label
0,The Desert Way,Glad we attended Pioneertown's meeting on 8/6....,2020-08-08 23:43:05,33.331967,-114.990289,
1,Warning Watch | San Diego East County,#RT @NWSSanDiego: RT @SouthCoastAQMD: SMOKE AD...,2020-08-08 23:39:18,,,
2,tayagrayphoto,Today marks my one-year anniversary of full ti...,2020-08-08 23:27:10,,,
3,South Coast AQMD,SMOKE ADVISORY UPDATE: #AppleFire Advisory ext...,2020-08-08 23:18:16,,,
4,Rob ®,This @MyDesert photo of the #AppleFire as seen...,2020-08-08 22:54:08,33.771839,-116.704031,
...,...,...,...,...,...,...
590,High Desert People,Shot from Apple Valley. Smoke covers an almost...,2020-08-03 3:18:52,33.979649,-116.987006,
591,Laurian Green,The desert is on fire. #wildfire #california #...,2020-08-03 2:27:28,,,
592,Lioness pride | 🏳️‍🌈 ✊🏿✊🏾✊🏽✊🏼✊🏻,"Aw heck, be safe. \nWildfires/brush fires in g...",2020-08-03 1:16:42,,,
593,IzzySkr33min,Thankful for these brave fire fighters who def...,2020-08-03 0:28:43,,,


#### Most of the tweets didn't contain location information. For mapping, we use only the rows that have it.

In [21]:
# Keep only text, time, latitude and longitude ('user' and 'label' are dropped in place)
with_location.drop(columns=['user', 'label'], inplace=True)

In [22]:
# Drop, in place, every row with a missing value in any remaining column;
# this removes the tweets that have no latitude/longitude
with_location.dropna(inplace=True)

In [23]:
# Check the first rows after filtering
with_location.head()

Unnamed: 0,text,time,latitude,longitude
0,Glad we attended Pioneertown's meeting on 8/6....,2020-08-08 23:43:05,33.331967,-114.990289
4,This @MyDesert photo of the #AppleFire as seen...,2020-08-08 22:54:08,33.771839,-116.704031
13,“As the Apple Fire continued to rage this week...,2020-08-08 20:55:21,33.948464,-116.814601
14,"#AppleFire near Baumont consumes more than 4,0...",2020-08-01 23:59:35,33.671772,-117.202858
15,#AppleFire: Voluntary evacuations are now in p...,2020-08-01 23:59:26,34.049474,-116.956559


In [24]:
# Parse the time strings into pandas datetimes
with_location['time'] = pd.to_datetime(with_location['time'])

In [25]:
# Keep only the calendar date. The vectorized .dt accessor replaces a per-row
# Python lambda; it requires 'time' to already be a datetime column.
with_location['time'] = with_location['time'].dt.date

In [26]:
# Confirm that the time column now holds dates only
with_location.head()

Unnamed: 0,text,time,latitude,longitude
0,Glad we attended Pioneertown's meeting on 8/6....,2020-08-08,33.331967,-114.990289
4,This @MyDesert photo of the #AppleFire as seen...,2020-08-08,33.771839,-116.704031
13,“As the Apple Fire continued to rage this week...,2020-08-08,33.948464,-116.814601
14,"#AppleFire near Baumont consumes more than 4,0...",2020-08-01,33.671772,-117.202858
15,#AppleFire: Voluntary evacuations are now in p...,2020-08-01,34.049474,-116.956559


### Cleaning

In [27]:
# Remove URL
def remove_url(text):
    """Return ``text`` with ``http(s)://...`` links and ``www....`` addresses deleted."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    return url_pattern.sub('', text)

# Remove HTML tags
def remove_html(text):
    """Strip HTML markup from ``text`` and return only its text content.

    The parser is named explicitly. When it is omitted, BeautifulSoup picks
    whichever parser happens to be installed and emits a warning, so results
    could differ between environments. ``html.parser`` ships with Python.
    """
    soup = BeautifulSoup(text, 'html.parser')

    return soup.get_text()

# Remove everything except letters
# If you want to keep specific characters, adjust the regular expression
def keep_only_words(text):
    """Replace every character that is not an ASCII letter (a-z, A-Z) with a space."""
    non_letters = re.compile('[^a-zA-Z]')

    return non_letters.sub(' ', text)

# Lower cases
def lower_words(text):
    """Return a lower-cased copy of ``text``."""
    return text.lower()

# Remove specific locations
def remove_specific_locations(text, locations=('canada', 'tokyo')):
    """Split ``text`` on whitespace and drop words that name unwanted locations.

    Parameters
    ----------
    text : str
        Text to filter. Matching is exact and case-sensitive, so lower-case
        the text first when ``locations`` holds lower-case words.
    locations : iterable of str, optional
        Words to remove. Defaults to ``('canada', 'tokyo')``, the values that
        were previously hard-coded. Pass a list/tuple/set of words, not a
        single string.

    Returns
    -------
    list of str
        The remaining words in their original order. A list (not a string)
        is returned because the next cleaning steps operate on word lists.
    """
    # A set gives constant-time membership tests however many locations are supplied
    blocked = set(locations)

    return [word for word in text.split() if word not in blocked]

# Remove stop words

# If you haven't downloaded the stopwords corpus yet, run this once:
#import nltk; nltk.download('stopwords')

def remove_stopwords(text):
    """Drop English stop words from a list of words.

    Parameters
    ----------
    text : list of str
        Words to filter (despite the name, this is a word list, not a string).

    Returns
    -------
    list of str
        The words that are not in NLTK's English stop-word list, order
        preserved. Still a list because the following step consumes a word list.
    """
    # Load the stop-word list once instead of once per word, and use a set
    # so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))

    return [word for word in text if word not in english_stopwords]

# Stem words
def stemming(text):
    """Porter-stem each word in ``text`` and join the stems with single spaces.

    ``text`` is an iterable of words; the return value is one string.
    """
    porter = PorterStemmer()

    return ' '.join(porter.stem(word) for word in text)

# Text cleaning
def clean_text(text):
    """Run the full cleaning pipeline on one raw tweet and return the cleaned string.

    Steps, in order: strip URLs, strip HTML, replace non-letters with spaces,
    lower-case, drop the unwanted location words, drop English stop words,
    stem. From the location step onward the intermediate value is a list of
    words; ``stemming`` joins it back into a single string.
    """
    pipeline = (
        remove_url,
        remove_html,
        keep_only_words,
        lower_words,
        remove_specific_locations,
        remove_stopwords,
        stemming,
    )

    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned

In [28]:
# Replace each raw tweet with its cleaned, stemmed form
with_location['text'] = with_location['text'].apply(clean_text)

In [30]:
# Inspect the cleaned text
with_location.head()

Unnamed: 0,text,time,latitude,longitude
0,glad attend pioneertown meet abt peopl gather ...,2020-08-08,33.331967,-114.990289
4,mydesert photo applefir seen palmspr int l air...,2020-08-08,33.771839,-116.704031
13,appl fire continu rage week spread mile remot ...,2020-08-08,33.948464,-116.814601
14,applefir near baumont consum acr hotanddri sum...,2020-08-01,33.671772,-117.202858
15,applefir voluntari evacu place oak glen forest...,2020-08-01,34.049474,-116.956559


In [32]:
# Save the cleaned tweets with coordinates; index=False leaves the row index out of the CSV
with_location.to_csv('../data/mapping_for_app.csv', index=False)