# Merge Tweets, News API, and MN511 datasets

# Import Libraries

In [52]:
import pandas as pd

# Load csv

In [53]:
tweets = pd.read_csv('./data/twitter_mn_official.csv')
mn511 = pd.read_csv('./data/mn_511.csv')
newsapi = pd.read_csv('./data/news_api.csv')

# Inspect DataFrames

### Twitter

In [54]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,date,geo,hashtags,text,type,username
0,0,2019-04-12 12:40:24+00:00,,,Crash at Hwy 12/25 south of Buffalo. Traffic i...,official,MnDOTcentral
1,1,2019-04-12 00:05:20+00:00,,,"WB I-94, west of Sauk Centre- closed due to du...",official,MnDOTcentral
2,2,2019-04-11 22:30:44+00:00,,,No travel advised on Hwy 169 along the west si...,official,MnDOTcentral
3,3,2019-04-11 22:17:09+00:00,,#Stearns #centralMN,No Travel Advised has been expanded to include...,official,MnDOTcentral
4,4,2019-04-11 19:49:47+00:00,,,No Travel Advisory issued for I-94 through @St...,official,MnDOTcentral


In [55]:
tweets.drop(['geo', 'Unnamed: 0', 'type'], axis=1, inplace=True)
tweets.head()

Unnamed: 0,date,hashtags,text,username
0,2019-04-12 12:40:24+00:00,,Crash at Hwy 12/25 south of Buffalo. Traffic i...,MnDOTcentral
1,2019-04-12 00:05:20+00:00,,"WB I-94, west of Sauk Centre- closed due to du...",MnDOTcentral
2,2019-04-11 22:30:44+00:00,,No travel advised on Hwy 169 along the west si...,MnDOTcentral
3,2019-04-11 22:17:09+00:00,#Stearns #centralMN,No Travel Advised has been expanded to include...,MnDOTcentral
4,2019-04-11 19:49:47+00:00,,No Travel Advisory issued for I-94 through @St...,MnDOTcentral


### MN511

In [56]:
mn511.head()

Unnamed: 0.1,Unnamed: 0,Header,Text,locations_and_times
0,0,MN 19: Flooding.,Between MN 93; North 5th Street and US 169 (ne...,"[('North 5th Street', 'GPE'), ('Henderson', 'G..."
1,1,MN 93: Flooding.,Between US 169 (Le Sueur) and MN 19; North 5th...,"[('US', 'GPE')]"
2,2,MN 95: Road closed.,At Fern Street North (Cambridge). The road is ...,[]
3,3,US 75 in both directions: Flooding.,Between 220th Avenue and 230th Avenue (Halstad...,[]
4,4,MN 1: Flooding.,Between North Dakota State Line (Oslo) and 470...,"[('North Dakota', 'GPE'), ('Oslo', 'GPE'), ('A..."


In [57]:
mn511.drop(['Unnamed: 0'], axis=1, inplace=True)

In [58]:
mn511['source'] = 'mn511'

In [59]:
mn511.head()

Unnamed: 0,Header,Text,locations_and_times,source
0,MN 19: Flooding.,Between MN 93; North 5th Street and US 169 (ne...,"[('North 5th Street', 'GPE'), ('Henderson', 'G...",mn511
1,MN 93: Flooding.,Between US 169 (Le Sueur) and MN 19; North 5th...,"[('US', 'GPE')]",mn511
2,MN 95: Road closed.,At Fern Street North (Cambridge). The road is ...,[],mn511
3,US 75 in both directions: Flooding.,Between 220th Avenue and 230th Avenue (Halstad...,[],mn511
4,MN 1: Flooding.,Between North Dakota State Line (Oslo) and 470...,"[('North Dakota', 'GPE'), ('Oslo', 'GPE'), ('A...",mn511


### News API

In [60]:
newsapi.head()

Unnamed: 0.1,Unnamed: 0,publishedAt,content,url,locations_and_times
0,0,2019-04-04T11:48:00Z,Spaceflight company SpaceX test-fired the engi...,https://gizmodo.com/spacex-successfully-test-f...,"[('Wednesday', 'DATE'), ('Boca Chica', 'GPE'),..."
1,1,2019-03-31T12:12:35Z,A forest fire in Burlington County has grown t...,https://news.yahoo.com/forest-fire-continues-b...,"[('Burlington County', 'GPE')]"
2,2,2019-04-07T06:36:00Z,The ban on civilian traffic on the national hi...,https://www.hindustantimes.com/india-news/two-...,"[('Jammu', 'GPE'), ('Srinagar', 'GPE'), ('toda..."
3,3,2019-04-06T22:24:00Z,Electricity has been mostly restored on Vancou...,https://www.timescolonist.com/news/local/power...,"[('Vancouver Island', 'GPE')]"
4,4,2019-04-07T20:06:52Z,Officials at Kruger National Park in South Afr...,https://www.npr.org/2019/04/07/710840965/suspe...,"[('South Africa', 'GPE'), ('2016', 'DATE')]"


In [61]:
newsapi.drop(['Unnamed: 0', 'url'], axis=1, inplace=True)
newsapi.head()

Unnamed: 0,publishedAt,content,locations_and_times
0,2019-04-04T11:48:00Z,Spaceflight company SpaceX test-fired the engi...,"[('Wednesday', 'DATE'), ('Boca Chica', 'GPE'),..."
1,2019-03-31T12:12:35Z,A forest fire in Burlington County has grown t...,"[('Burlington County', 'GPE')]"
2,2019-04-07T06:36:00Z,The ban on civilian traffic on the national hi...,"[('Jammu', 'GPE'), ('Srinagar', 'GPE'), ('toda..."
3,2019-04-06T22:24:00Z,Electricity has been mostly restored on Vancou...,"[('Vancouver Island', 'GPE')]"
4,2019-04-07T20:06:52Z,Officials at Kruger National Park in South Afr...,"[('South Africa', 'GPE'), ('2016', 'DATE')]"


# Align all columns

### Twitter

In [62]:
tweets.columns

Index(['date', 'hashtags', 'text', 'username'], dtype='object')

In [63]:
new_columns_dict = {
        'date':'date',
        'hashtags': 'keywords',
        'text': 'text',
        'username': 'source'
    }

tweets.rename(columns=new_columns_dict, inplace=True)
tweets.head(1)

Unnamed: 0,date,keywords,text,source
0,2019-04-12 12:40:24+00:00,,Crash at Hwy 12/25 south of Buffalo. Traffic i...,MnDOTcentral


In [64]:
tweets['location'] = 'NA'
tweets['source'] = 'twitter'


In [65]:
tweets = tweets[['date','text', 'keywords', 'location', 'source']]
tweets.head()

Unnamed: 0,date,text,keywords,location,source
0,2019-04-12 12:40:24+00:00,Crash at Hwy 12/25 south of Buffalo. Traffic i...,,,twitter
1,2019-04-12 00:05:20+00:00,"WB I-94, west of Sauk Centre- closed due to du...",,,twitter
2,2019-04-11 22:30:44+00:00,No travel advised on Hwy 169 along the west si...,,,twitter
3,2019-04-11 22:17:09+00:00,No Travel Advised has been expanded to include...,#Stearns #centralMN,,twitter
4,2019-04-11 19:49:47+00:00,No Travel Advisory issued for I-94 through @St...,,,twitter


### MN 511

In [66]:
mn511.columns

Index(['Header', 'Text', 'locations_and_times', 'source'], dtype='object')

In [67]:
new_columns_dict = {
        'Header':'keywords',
        'Text': 'text',
        'source': 'source',
        'locations_and_times': 'location'
}

mn511.rename(columns=new_columns_dict, inplace=True)
mn511.head(1)

Unnamed: 0,keywords,text,location,source
0,MN 19: Flooding.,Between MN 93; North 5th Street and US 169 (ne...,"[('North 5th Street', 'GPE'), ('Henderson', 'G...",mn511


In [68]:
mn511['date'] = 'date'


In [69]:
mn511 = mn511[['date','text', 'keywords', 'location', 'source']]
mn511.head()

Unnamed: 0,date,text,keywords,location,source
0,date,Between MN 93; North 5th Street and US 169 (ne...,MN 19: Flooding.,"[('North 5th Street', 'GPE'), ('Henderson', 'G...",mn511
1,date,Between US 169 (Le Sueur) and MN 19; North 5th...,MN 93: Flooding.,"[('US', 'GPE')]",mn511
2,date,At Fern Street North (Cambridge). The road is ...,MN 95: Road closed.,[],mn511
3,date,Between 220th Avenue and 230th Avenue (Halstad...,US 75 in both directions: Flooding.,[],mn511
4,date,Between North Dakota State Line (Oslo) and 470...,MN 1: Flooding.,"[('North Dakota', 'GPE'), ('Oslo', 'GPE'), ('A...",mn511


### News API

In [70]:
newsapi.columns

Index(['publishedAt', 'content', 'locations_and_times'], dtype='object')

In [71]:
new_columns_dict = {
        'publishedAt':'date',
        'content': 'text',
        'locations_and_times': 'location'    
}

newsapi.rename(columns=new_columns_dict, inplace=True)
newsapi.head(1)

Unnamed: 0,date,text,location
0,2019-04-04T11:48:00Z,Spaceflight company SpaceX test-fired the engi...,"[('Wednesday', 'DATE'), ('Boca Chica', 'GPE'),..."


In [72]:
newsapi['source'] = 'newsapi'

In [73]:
newsapi['location'] = 'NA'
newsapi['keywords'] = 'NA'

In [74]:
newsapi.head()

Unnamed: 0,date,text,location,source,keywords
0,2019-04-04T11:48:00Z,Spaceflight company SpaceX test-fired the engi...,,newsapi,
1,2019-03-31T12:12:35Z,A forest fire in Burlington County has grown t...,,newsapi,
2,2019-04-07T06:36:00Z,The ban on civilian traffic on the national hi...,,newsapi,
3,2019-04-06T22:24:00Z,Electricity has been mostly restored on Vancou...,,newsapi,
4,2019-04-07T20:06:52Z,Officials at Kruger National Park in South Afr...,,newsapi,


In [75]:
newsapi.columns

Index(['date', 'text', 'location', 'source', 'keywords'], dtype='object')

In [76]:
newsapi = newsapi[['date','text', 'keywords', 'location', 'source']]
newsapi.head()

Unnamed: 0,date,text,keywords,location,source
0,2019-04-04T11:48:00Z,Spaceflight company SpaceX test-fired the engi...,,,newsapi
1,2019-03-31T12:12:35Z,A forest fire in Burlington County has grown t...,,,newsapi
2,2019-04-07T06:36:00Z,The ban on civilian traffic on the national hi...,,,newsapi
3,2019-04-06T22:24:00Z,Electricity has been mostly restored on Vancou...,,,newsapi
4,2019-04-07T20:06:52Z,Officials at Kruger National Park in South Afr...,,,newsapi


# Merge all DataFrames

In [77]:
df = pd.concat([tweets, mn511, newsapi], axis=0)

In [78]:
df.shape

(14233, 5)

In [79]:
df.head()

Unnamed: 0,date,text,keywords,location,source
0,2019-04-12 12:40:24+00:00,Crash at Hwy 12/25 south of Buffalo. Traffic i...,,,twitter
1,2019-04-12 00:05:20+00:00,"WB I-94, west of Sauk Centre- closed due to du...",,,twitter
2,2019-04-11 22:30:44+00:00,No travel advised on Hwy 169 along the west si...,,,twitter
3,2019-04-11 22:17:09+00:00,No Travel Advised has been expanded to include...,#Stearns #centralMN,,twitter
4,2019-04-11 19:49:47+00:00,No Travel Advisory issued for I-94 through @St...,,,twitter


In [80]:
#df.to_csv('./data/alldata_update.csv')