**Note:** place file one level up in directory structure so it can run properly.

# Analyzing Baseball Fan Sentiment with Twitter Data (Bluejays) #

In [7]:
import glob
import pandas as pd
import numpy as np

# For working with JSON data.
try:
    import json
except ImportError:
    import simplejson as json
    
# For maps.
from mpl_toolkits.basemap import Basemap
import geocoder
import matplotlib.pyplot as plt
%matplotlib inline

#import warnings
#warnings.simplefilter(action = "ignore", category = FutureWarning)

## Intial Exploration (10,000 Tweets Version) ##

In [8]:
for filename in glob.iglob('data/bluejays/*.txt'):
     print(filename)

data/bluejays/bluejays_search_08_07_1000.txt
data/bluejays/bluejays_search_08_08_1000.txt
data/bluejays/bluejays_search_08_09_1000.txt
data/bluejays/bluejays_search_08_10_1000.txt
data/bluejays/bluejays_search_08_11_1000.txt
data/bluejays/bluejays_search_08_12_1000.txt
data/bluejays/bluejays_search_08_13_1000.txt
data/bluejays/bluejays_search_08_14_1000.txt
data/bluejays/bluejays_search_08_15_1000.txt
data/bluejays/bluejays_search_08_16_1000.txt


In [9]:
# Read in JSON data and store in pandas DataFrame.
tweets = []
locations = []
for filename in glob.iglob('data/bluejays/*.txt'):
    tweets_filename = filename
    tweets_file = open(tweets_filename, "r")
    for line in tweets_file:
        try:
            # Read in each line of file, convert to JSON object. 
            tweet = json.loads(line.strip())
            # Make sure tweet has text content.
            if 'text' in tweet:
                tweets.append(tweet)
                locations.append(tweet['user']['location'])
        except:
            # Skip any non-JSON-formatted data that may have been captured.
            continue

In [10]:
df_bluejays_all = pd.DataFrame(tweets)

In [11]:
df_bluejays_all['location'] = pd.Series(locations)

In [12]:
df_bluejays_all.shape

(10000, 31)

In [13]:
df_bluejays_all.head(3)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,id_str,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user,location
0,,,Sun Aug 07 23:59:53 +0000 2016,"{'urls': [], 'hashtags': [{'indices': [88, 98]...",,0,False,,762438141730959365,762438141730959365,...,,,0,False,,"<a href=""http://twitter.com/download/iphone"" r...","@bluejays winning record on the road trip, com...",False,"{'profile_use_background_image': True, 'time_z...",
1,,,Sun Aug 07 23:59:37 +0000 2016,"{'urls': [{'indices': [78, 101], 'display_url'...","{'media': [{'sizes': {'large': {'w': 1116, 'h'...",1,False,,762438076480102402,762438076480102402,...,,,0,False,,"<a href=""http://dlvr.it"" rel=""nofollow"">dlvr.i...",L #Toronto #BlueJays Toronto Blue Jays Beer St...,False,"{'profile_use_background_image': True, 'time_z...",
2,,,Sun Aug 07 23:59:18 +0000 2016,"{'urls': [{'indices': [72, 95], 'display_url':...","{'media': [{'sizes': {'large': {'w': 400, 'h':...",0,False,,762437995639115777,762437995639115777,...,,,0,False,,"<a href=""http://dlvr.it"" rel=""nofollow"">dlvr.i...",#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,False,"{'profile_use_background_image': True, 'time_z...",


In [190]:
#df_redsox_all['retweet_count'].value_counts()

In [14]:
df_bluejays = df_bluejays_all[['text', 'created_at', 'location', 'retweet_count', 'favorite_count']]

In [15]:
df_bluejays.shape

(10000, 5)

In [16]:
df_bluejays.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0
5,#BlueJays return home with a winning road reco...,Sun Aug 07 23:58:50 +0000 2016,,15,9
6,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:39 +0000 2016,"Gaming place, GAMING COUNTRY.",10,0
7,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:35 +0000 2016,,10,0
8,RT @JoseMeowtista: Tabby said the #BlueJays ha...,Sun Aug 07 23:57:02 +0000 2016,,11,0
9,#SS #TroyTulowitzki 50 #Card lot with no dupli...,Sun Aug 07 23:55:12 +0000 2016,,0,0


In [94]:
# DONT USE
lats = []
longs = []
for location in df_redsox['location']:
    g = geocoder.arcgis(location)
    if g.latlng != []:
        lats.append(g.latlng[0])
        longs.append(g.latlng[1])
    else:
        lats.append(np.nan)
        longs.append(np.nan)

In [17]:
unique_locations = df_bluejays['location'].unique()

In [18]:
len(unique_locations)
#unique_locations = unique_locations[:10]
#unique_locations

1682

In [19]:
# USE THIS
location_coords = {}
for location in unique_locations:
    location_coords[location] = geocoder.arcgis(location)

In [20]:
location_coords[''].latlng

[]

In [21]:
def get_lat(x):
    if x.latlng != []:
        return(x.latlng[0])
    else:
        return(np.nan)

In [22]:
def get_long(x):
    if x.latlng != []:
        return(x.latlng[1])
    else:
        return(np.nan)

In [23]:
get_lat(location_coords['Toronto, Ontario'])

43.7001099470005

In [24]:
df_bluejays['latitude'] = df_bluejays['location'].apply(lambda x: get_lat(location_coords[x]))
df_bluejays['longitude'] = df_bluejays['location'].apply(lambda x: get_long(location_coords[x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [25]:
df_bluejays.head(20)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0,,
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1,,
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0,,
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0,,
5,#BlueJays return home with a winning road reco...,Sun Aug 07 23:58:50 +0000 2016,,15,9,,
6,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:39 +0000 2016,"Gaming place, GAMING COUNTRY.",10,0,,
7,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:35 +0000 2016,,10,0,,
8,RT @JoseMeowtista: Tabby said the #BlueJays ha...,Sun Aug 07 23:57:02 +0000 2016,,11,0,,
9,#SS #TroyTulowitzki 50 #Card lot with no dupli...,Sun Aug 07 23:55:12 +0000 2016,,0,0,,


In [30]:
#g = geocoder.arcgis(df_redsox['location'][3])
#g.latlng

In [31]:
#df_redsox['location'][:10].apply(geocoder.arcgis)

In [32]:
#print(len(lats))
#print(len(longs))

In [33]:
#df_redsox['latitude'] = pd.Series(lats)
#df_redsox['longitude'] = pd.Series(longs)

In [26]:
df_bluejays['created_at'].dtypes

dtype('O')

In [27]:
df_bluejays['created_at'].head()

0    Sun Aug 07 23:59:53 +0000 2016
1    Sun Aug 07 23:59:37 +0000 2016
2    Sun Aug 07 23:59:18 +0000 2016
3    Sun Aug 07 23:59:16 +0000 2016
4    Sun Aug 07 23:59:01 +0000 2016
Name: created_at, dtype: object

In [28]:
# First, parse to datetime, then get YYYY-MM-DD format.
df_bluejays['time'] = pd.to_datetime(df_bluejays['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')
df_bluejays['time'] = pd.to_datetime(df_bluejays['time'].dt.date, format='%Y-%m-%d')
#date = datetime.strptime('Thu Apr 23 13:38:19 +0000 2009','%a %b %d %H:%M:%S +0000 %Y')
#date = datetime.strftime(date, '%Y %d %m')
#type(date)
#df_redsox['time'].dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [29]:
df_bluejays.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0,,,2016-08-07
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1,,,2016-08-07
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,,2016-08-07
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0,,,2016-08-07
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0,,,2016-08-07
5,#BlueJays return home with a winning road reco...,Sun Aug 07 23:58:50 +0000 2016,,15,9,,,2016-08-07
6,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:39 +0000 2016,"Gaming place, GAMING COUNTRY.",10,0,,,2016-08-07
7,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:35 +0000 2016,,10,0,,,2016-08-07
8,RT @JoseMeowtista: Tabby said the #BlueJays ha...,Sun Aug 07 23:57:02 +0000 2016,,11,0,,,2016-08-07
9,#SS #TroyTulowitzki 50 #Card lot with no dupli...,Sun Aug 07 23:55:12 +0000 2016,,0,0,,,2016-08-07


## Sentiment Analysis ##

In [30]:
df_bluejays.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0,,,2016-08-07
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1,,,2016-08-07
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,,2016-08-07
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0,,,2016-08-07
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0,,,2016-08-07


In [31]:
from textblob import TextBlob

In [32]:
tweets = df_bluejays['text']
sentiments = []
for tweet in tweets:
    wiki = TextBlob(tweet)
    sentiments.append(wiki.sentiment.polarity)

In [33]:
df_bluejays['sentiment'] = pd.Series(sentiments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [34]:
df_bluejays.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0,,,2016-08-07,0.5
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1,,,2016-08-07,-0.121212
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,,2016-08-07,-0.25
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0,,,2016-08-07,0.0
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0,,,2016-08-07,0.3
5,#BlueJays return home with a winning road reco...,Sun Aug 07 23:58:50 +0000 2016,,15,9,,,2016-08-07,0.5
6,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:39 +0000 2016,"Gaming place, GAMING COUNTRY.",10,0,,,2016-08-07,-0.15
7,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:35 +0000 2016,,10,0,,,2016-08-07,-0.15
8,RT @JoseMeowtista: Tabby said the #BlueJays ha...,Sun Aug 07 23:57:02 +0000 2016,,11,0,,,2016-08-07,-0.225
9,#SS #TroyTulowitzki 50 #Card lot with no dupli...,Sun Aug 07 23:55:12 +0000 2016,,0,0,,,2016-08-07,0.0


In [223]:
#df_redsox.to_csv('df_redsox.csv')

In [35]:
g = geocoder.arcgis('Toronto, Ontario')
toronto_lat = g.latlng[0]
toronto_long = g.latlng[1]
print(toronto_lat, toronto_long)

43.7001099470005 -79.41629882799964


In [37]:
import gpxpy.geo
import math
haversine_distances = []
for i in range(df_bluejays.shape[0]):
#dist = gpxpy.geo.haversine_distance(52.2296756, 21.0122287, 52.406374, 16.9251681)
#print(dist)
    if math.isnan(df_bluejays.iloc[i]['latitude']):
        haversine_distances.append(np.nan)
    else:
        distance = gpxpy.geo.haversine_distance(toronto_lat, toronto_long, df_bluejays.iloc[i]['latitude'], df_bluejays.iloc[i]['longitude'])
        haversine_distances.append(distance)

In [38]:
len(haversine_distances)

10000

In [39]:
df_bluejays['hav_distance'] = pd.Series(haversine_distances)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [43]:
df_bluejays.head(20)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment,hav_distance
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0,,,2016-08-07,0.5,
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1,,,2016-08-07,-0.121212,
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,,2016-08-07,-0.25,
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0,,,2016-08-07,0.0,
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0,,,2016-08-07,0.3,
5,#BlueJays return home with a winning road reco...,Sun Aug 07 23:58:50 +0000 2016,,15,9,,,2016-08-07,0.5,
6,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:39 +0000 2016,"Gaming place, GAMING COUNTRY.",10,0,,,2016-08-07,-0.15,
7,RT @Sportsnet: Road trip comes to sour end for...,Sun Aug 07 23:57:35 +0000 2016,,10,0,,,2016-08-07,-0.15,
8,RT @JoseMeowtista: Tabby said the #BlueJays ha...,Sun Aug 07 23:57:02 +0000 2016,,11,0,,,2016-08-07,-0.225,
9,#SS #TroyTulowitzki 50 #Card lot with no dupli...,Sun Aug 07 23:55:12 +0000 2016,,0,0,,,2016-08-07,0.0,


In [44]:
df_bluejays.rename(columns={'time': 'dates'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [45]:
df_bluejays.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0,,,2016-08-07,0.5,
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1,,,2016-08-07,-0.121212,
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,,2016-08-07,-0.25,
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0,,,2016-08-07,0.0,
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0,,,2016-08-07,0.3,


In [250]:
#df_mariners.to_csv('df_mariners.csv')

In [46]:
outcome_by_date = {
    '2016-08-07 00:00:00': 'Loss',
    '2016-08-08 00:00:00': 'Win',
    '2016-08-09 00:00:00': 'Loss',
    '2016-08-10 00:00:00': 'Win',
    '2016-08-11 00:00:00': 'No Game',
    '2016-08-12 00:00:00': 'Loss',
    '2016-08-13 00:00:00': 'Win',
    '2016-08-14 00:00:00': 'Win',
    '2016-08-15 00:00:00': 'Loss',
    '2016-08-16 00:00:00': 'Win'
}

In [47]:
# Apply outcome values to new column.
df_bluejays['outcome'] = df_bluejays['dates'].apply(lambda x: outcome_by_date[str(x)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [48]:
df_bluejays.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance,outcome
0,"@bluejays winning record on the road trip, com...",Sun Aug 07 23:59:53 +0000 2016,,0,0,,,2016-08-07,0.5,,Loss
1,L #Toronto #BlueJays Toronto Blue Jays Beer St...,Sun Aug 07 23:59:37 +0000 2016,,0,1,,,2016-08-07,-0.121212,,Loss
2,#Toronto #BlueJays TORONTO BLUE JAYS 1992 WORL...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,,2016-08-07,-0.25,,Loss
3,#MLB #Baseball TORONTO #BlueJays 1992 WORLD SE...,Sun Aug 07 23:59:16 +0000 2016,,0,0,,,2016-08-07,0.0,,Loss
4,RT @Wilnerness590: #Bluejays 4-3 road trip end...,Sun Aug 07 23:59:01 +0000 2016,,3,0,,,2016-08-07,0.3,,Loss


In [49]:
df_bluejays.to_csv('df_bluejays.csv')