**Note:** place file one level up in directory structure so it can run properly.

# Analyzing Baseball Fan Sentiment with Twitter Data (Mariners) #

In [1]:
import glob
import pandas as pd
import numpy as np

# For working with JSON data.
try:
    import json
except ImportError:
    import simplejson as json
    
# For maps.
from mpl_toolkits.basemap import Basemap
import geocoder
import matplotlib.pyplot as plt
%matplotlib inline

#import warnings
#warnings.simplefilter(action = "ignore", category = FutureWarning)

## Intial Exploration (10,000 Tweets Version) ##

In [2]:
for filename in glob.iglob('data/mariners/*.txt'):
     print(filename)

data/mariners/mariners_search_08_06_1000.txt
data/mariners/mariners_search_08_07_1000.txt
data/mariners/mariners_search_08_08_1000.txt
data/mariners/mariners_search_08_09_1000.txt
data/mariners/mariners_search_08_10_1000.txt
data/mariners/mariners_search_08_11_1000.txt
data/mariners/mariners_search_08_12_1000.txt
data/mariners/mariners_search_08_13_1000.txt
data/mariners/mariners_search_08_16_1000.txt
data/mariners/mariners_stream_08_14_1000.txt
data/mariners/mariners_stream_08_15_1000.txt


In [3]:
# Read in JSON data and store in pandas DataFrame.
tweets = []
locations = []
for filename in glob.iglob('data/mariners/*.txt'):
    tweets_filename = filename
    tweets_file = open(tweets_filename, "r")
    for line in tweets_file:
        try:
            # Read in each line of file, convert to JSON object. 
            tweet = json.loads(line.strip())
            # Make sure tweet has text content.
            if 'text' in tweet:
                tweets.append(tweet)
                locations.append(tweet['user']['location'])
        except:
            # Skip any non-JSON-formatted data that may have been captured.
            continue

In [4]:
df_mariners_all = pd.DataFrame(tweets)

In [5]:
df_mariners_all['location'] = pd.Series(locations)

In [6]:
df_mariners_all.shape

(11000, 33)

In [7]:
df_mariners_all.head(3)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,filter_level,geo,id,...,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user,location
0,,,Sat Aug 06 23:59:58 +0000 2016,"{'media': [{'indices': [126, 140], 'id': 76205...","{'media': [{'indices': [126, 140], 'id': 76205...",0,False,,,762075777093382144,...,,1039,False,"{'in_reply_to_status_id_str': None, 'contribut...","<a href=""http://twitter.com/download/android"" ...",RT @Mariners: Batting practice today is the ve...,,False,"{'favourites_count': 17788, 'geo_enabled': Tru...","Provo, UT"
1,,,Sat Aug 06 23:59:51 +0000 2016,"{'media': [{'indices': [126, 140], 'id': 76205...","{'media': [{'indices': [126, 140], 'id': 76205...",0,False,,,762075745506111488,...,,1039,False,"{'in_reply_to_status_id_str': None, 'contribut...","<a href=""http://twitter.com/download/iphone"" r...",RT @Mariners: Batting practice today is the ve...,,False,"{'favourites_count': 60394, 'geo_enabled': Tru...",„Åù„Åì„Çâ„Å∏„Çì
2,,,Sat Aug 06 23:59:40 +0000 2016,{'media': [{'media_url_https': 'https://pbs.tw...,{'media': [{'media_url_https': 'https://pbs.tw...,0,False,,,762075699955978240,...,,0,False,,"<a href=""http://dlvr.it"" rel=""nofollow"">dlvr.i...",Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,,False,"{'favourites_count': 0, 'geo_enabled': False, ...",All Round The World


In [190]:
#df_redsox_all['retweet_count'].value_counts()

In [8]:
df_mariners = df_mariners_all[['text', 'created_at', 'location', 'retweet_count', 'favorite_count']]

In [9]:
df_mariners.shape

(11000, 5)

In [10]:
df_mariners.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0
5,"Ken Griffey, Jr.'s number flies high above #Se...",Sat Aug 06 23:59:22 +0000 2016,"Seattle, WA",5,15
6,I flew home just in time to watch my hometown ...,Sat Aug 06 23:59:20 +0000 2016,,0,7
7,Safeco looking like not even a playoff atmosph...,Sat Aug 06 23:59:20 +0000 2016,RP,0,0
8,"MLB: *FREE* RISK $300 MARLINS -ML +116\n\n$1,0...",Sat Aug 06 23:59:16 +0000 2016,Incarceratedbob@Gmail.com,3,1
9,Family night at the #mariners! #24ever #baseba...,Sat Aug 06 23:59:13 +0000 2016,,0,0


In [94]:
# DONT USE
lats = []
longs = []
for location in df_redsox['location']:
    g = geocoder.arcgis(location)
    if g.latlng != []:
        lats.append(g.latlng[0])
        longs.append(g.latlng[1])
    else:
        lats.append(np.nan)
        longs.append(np.nan)

In [11]:
unique_locations = df_mariners['location'].unique()

In [12]:
len(unique_locations)
#unique_locations = unique_locations[:10]
#unique_locations

2459

In [13]:
# USE THIS
location_coords = {}
for location in unique_locations:
    location_coords[location] = geocoder.arcgis(location)

In [14]:
location_coords[''].latlng

[]

In [15]:
def get_lat(x):
    if x.latlng != []:
        return(x.latlng[0])
    else:
        return(np.nan)

In [16]:
def get_long(x):
    if x.latlng != []:
        return(x.latlng[1])
    else:
        return(np.nan)

In [17]:
get_lat(location_coords['Seattle, WA'])

47.60620882800049

In [18]:
df_mariners['latitude'] = df_mariners['location'].apply(lambda x: get_lat(location_coords[x]))
df_mariners['longitude'] = df_mariners['location'].apply(lambda x: get_long(location_coords[x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [19]:
df_mariners.head(20)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0,40.233838,-111.65853
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0,,
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0,,
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0,39.161997,-84.456887
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0,47.762318,-122.205396
5,"Ken Griffey, Jr.'s number flies high above #Se...",Sat Aug 06 23:59:22 +0000 2016,"Seattle, WA",5,15,47.606209,-122.332069
6,I flew home just in time to watch my hometown ...,Sat Aug 06 23:59:20 +0000 2016,,0,7,,
7,Safeco looking like not even a playoff atmosph...,Sat Aug 06 23:59:20 +0000 2016,RP,0,0,10.065077,-69.338678
8,"MLB: *FREE* RISK $300 MARLINS -ML +116\n\n$1,0...",Sat Aug 06 23:59:16 +0000 2016,Incarceratedbob@Gmail.com,3,1,,
9,Family night at the #mariners! #24ever #baseba...,Sat Aug 06 23:59:13 +0000 2016,,0,0,,


In [30]:
#g = geocoder.arcgis(df_redsox['location'][3])
#g.latlng

In [31]:
#df_redsox['location'][:10].apply(geocoder.arcgis)

In [32]:
#print(len(lats))
#print(len(longs))

In [33]:
#df_redsox['latitude'] = pd.Series(lats)
#df_redsox['longitude'] = pd.Series(longs)

In [20]:
df_mariners['created_at'].dtypes

dtype('O')

In [21]:
df_mariners['created_at'].head()

0    Sat Aug 06 23:59:58 +0000 2016
1    Sat Aug 06 23:59:51 +0000 2016
2    Sat Aug 06 23:59:40 +0000 2016
3    Sat Aug 06 23:59:35 +0000 2016
4    Sat Aug 06 23:59:30 +0000 2016
Name: created_at, dtype: object

In [22]:
# First, parse to datetime, then get YYYY-MM-DD format.
df_mariners['time'] = pd.to_datetime(df_mariners['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')
df_mariners['time'] = pd.to_datetime(df_mariners['time'].dt.date, format='%Y-%m-%d')
#date = datetime.strptime('Thu Apr 23 13:38:19 +0000 2009','%a %b %d %H:%M:%S +0000 %Y')
#date = datetime.strftime(date, '%Y %d %m')
#type(date)
#df_redsox['time'].dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [23]:
df_mariners.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0,40.233838,-111.65853,2016-08-06
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0,,,2016-08-06
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0,,,2016-08-06
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0,39.161997,-84.456887,2016-08-06
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0,47.762318,-122.205396,2016-08-06
5,"Ken Griffey, Jr.'s number flies high above #Se...",Sat Aug 06 23:59:22 +0000 2016,"Seattle, WA",5,15,47.606209,-122.332069,2016-08-06
6,I flew home just in time to watch my hometown ...,Sat Aug 06 23:59:20 +0000 2016,,0,7,,,2016-08-06
7,Safeco looking like not even a playoff atmosph...,Sat Aug 06 23:59:20 +0000 2016,RP,0,0,10.065077,-69.338678,2016-08-06
8,"MLB: *FREE* RISK $300 MARLINS -ML +116\n\n$1,0...",Sat Aug 06 23:59:16 +0000 2016,Incarceratedbob@Gmail.com,3,1,,,2016-08-06
9,Family night at the #mariners! #24ever #baseba...,Sat Aug 06 23:59:13 +0000 2016,,0,0,,,2016-08-06


## Sentiment Analysis ##

In [24]:
df_mariners.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0,40.233838,-111.65853,2016-08-06
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0,,,2016-08-06
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0,,,2016-08-06
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0,39.161997,-84.456887,2016-08-06
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0,47.762318,-122.205396,2016-08-06


In [25]:
from textblob import TextBlob

In [26]:
tweets = df_mariners['text']
sentiments = []
for tweet in tweets:
    wiki = TextBlob(tweet)
    sentiments.append(wiki.sentiment.polarity)

In [27]:
df_mariners['sentiment'] = pd.Series(sentiments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [31]:
df_mariners.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0,40.233838,-111.65853,2016-08-06,0.0
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0,,,2016-08-06,0.0
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0,,,2016-08-06,0.0
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0,39.161997,-84.456887,2016-08-06,0.0
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0,47.762318,-122.205396,2016-08-06,0.0
5,"Ken Griffey, Jr.'s number flies high above #Se...",Sat Aug 06 23:59:22 +0000 2016,"Seattle, WA",5,15,47.606209,-122.332069,2016-08-06,0.053333
6,I flew home just in time to watch my hometown ...,Sat Aug 06 23:59:20 +0000 2016,,0,7,,,2016-08-06,0.0
7,Safeco looking like not even a playoff atmosph...,Sat Aug 06 23:59:20 +0000 2016,RP,0,0,10.065077,-69.338678,2016-08-06,0.0
8,"MLB: *FREE* RISK $300 MARLINS -ML +116\n\n$1,0...",Sat Aug 06 23:59:16 +0000 2016,Incarceratedbob@Gmail.com,3,1,,,2016-08-06,0.366667
9,Family night at the #mariners! #24ever #baseba...,Sat Aug 06 23:59:13 +0000 2016,,0,0,,,2016-08-06,0.0


In [223]:
#df_redsox.to_csv('df_redsox.csv')

In [32]:
g = geocoder.arcgis('Seattle, WA')
seattle_lat = g.latlng[0]
seattle_long = g.latlng[1]
print(seattle_lat, seattle_long)

47.60620882800049 -122.33206857399972


In [33]:
import gpxpy.geo
import math
haversine_distances = []
for i in range(df_mariners.shape[0]):
#dist = gpxpy.geo.haversine_distance(52.2296756, 21.0122287, 52.406374, 16.9251681)
#print(dist)
    if math.isnan(df_mariners.iloc[i]['latitude']):
        haversine_distances.append(np.nan)
    else:
        distance = gpxpy.geo.haversine_distance(seattle_lat, seattle_long, df_mariners.iloc[i]['latitude'], df_mariners.iloc[i]['longitude'])
        haversine_distances.append(distance)

In [34]:
len(haversine_distances)

11000

In [35]:
df_mariners['hav_distance'] = pd.Series(haversine_distances)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [36]:
df_mariners.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment,hav_distance
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0,40.233838,-111.65853,2016-08-06,0.0,1182384.0
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0,,,2016-08-06,0.0,
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0,,,2016-08-06,0.0,
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0,39.161997,-84.456887,2016-08-06,0.0,3166099.0
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0,47.762318,-122.205396,2016-08-06,0.0,19779.7


In [37]:
df_mariners.rename(columns={'time': 'dates'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [38]:
df_mariners.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0,40.233838,-111.65853,2016-08-06,0.0,1182384.0
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0,,,2016-08-06,0.0,
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0,,,2016-08-06,0.0,
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0,39.161997,-84.456887,2016-08-06,0.0,3166099.0
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0,47.762318,-122.205396,2016-08-06,0.0,19779.7


In [250]:
#df_mariners.to_csv('df_mariners.csv')

In [40]:
outcome_by_date = {
    '2016-08-06 00:00:00': 'Win',
    '2016-08-07 00:00:00': 'Win',
    '2016-08-08 00:00:00': 'Win',
    '2016-08-09 00:00:00': 'Win',
    '2016-08-10 00:00:00': 'Win',
    '2016-08-11 00:00:00': 'No Game',
    '2016-08-12 00:00:00': 'Loss',
    '2016-08-13 00:00:00': 'Win',
    '2016-08-14 00:00:00': 'Win',
    '2016-08-15 00:00:00': 'Win',
    '2016-08-16 00:00:00': 'Loss'
}

In [41]:
# Apply outcome values to new column.
df_mariners['outcome'] = df_mariners['dates'].apply(lambda x: outcome_by_date[str(x)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [42]:
df_mariners.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance,outcome
0,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:58 +0000 2016,"Provo, UT",1039,0,40.233838,-111.65853,2016-08-06,0.0,1182384.0,Win
1,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:51 +0000 2016,„Åù„Åì„Çâ„Å∏„Çì,1039,0,,,2016-08-06,0.0,,Win
2,Tim Lincecum designated for‚Ä¶ https://t.co/NwUU...,Sat Aug 06 23:59:40 +0000 2016,All Round The World,0,0,,,2016-08-06,0.0,,Win
3,RT @Mariners: Batting practice today is the ve...,Sat Aug 06 23:59:35 +0000 2016,Cincinnati,1039,0,39.161997,-84.456887,2016-08-06,0.0,3166099.0,Win
4,RT @710ESPNSeattle: Tonight's the night.\n#24E...,Sat Aug 06 23:59:30 +0000 2016,"Bothell, WA",21,0,47.762318,-122.205396,2016-08-06,0.0,19779.7,Win


In [43]:
df_mariners.to_csv('df_mariners.csv')