**Note:** place file one level up in directory structure so it can run properly.

# Analyzing Baseball Fan Sentiment with Twitter Data (Yankees) #

In [82]:
import glob
import pandas as pd
import numpy as np

# For working with JSON data.
try:
    import json
except ImportError:
    import simplejson as json
    
# For maps.
from mpl_toolkits.basemap import Basemap
import geocoder
import matplotlib.pyplot as plt
%matplotlib inline

#import warnings
#warnings.simplefilter(action = "ignore", category = FutureWarning)

## Intial Exploration (10,000 Tweets Version) ##

In [83]:
for filename in glob.iglob('data/yankees/*.txt'):
     print(filename)

data/yankees/yankees_search_08_07_1000.txt
data/yankees/yankees_search_08_08_1000.txt
data/yankees/yankees_search_08_09_1000.txt
data/yankees/yankees_search_08_10_1000.txt
data/yankees/yankees_search_08_11_1000.txt
data/yankees/yankees_search_08_12_1000.txt
data/yankees/yankees_search_08_13_1000.txt
data/yankees/yankees_search_08_15_1000.txt
data/yankees/yankees_search_08_16_1000.txt
data/yankees/yankees_stream_08_14_1000.txt


In [84]:
# Read in JSON data and store in pandas DataFrame.
tweets = []
locations = []
for filename in glob.iglob('data/yankees/*.txt'):
    tweets_filename = filename
    tweets_file = open(tweets_filename, "r")
    for line in tweets_file:
        try:
            # Read in each line of file, convert to JSON object. 
            tweet = json.loads(line.strip())
            # Make sure tweet has text content.
            if 'text' in tweet:
                tweets.append(tweet)
                locations.append(tweet['user']['location'])
        except:
            # Skip any non-JSON-formatted data that may have been captured.
            continue

In [85]:
df_yankees_all = pd.DataFrame(tweets)

In [86]:
df_yankees_all['location'] = pd.Series(locations)

In [87]:
df_yankees_all.shape

(10000, 33)

In [88]:
df_yankees_all.head(3)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,filter_level,geo,id,...,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user,location
0,,,Sun Aug 07 23:59:49 +0000 2016,"{'user_mentions': [{'name': 'Ian Rapoport', 's...",,0,False,,,762438126497259520,...,7.62304316887687e+17,235,False,"{'in_reply_to_status_id': None, 'truncated': F...","<a href=""http://twitter.com/download/iphone"" r...",RT @RapSheet: Goodbye to one of the greatest t...,,False,"{'verified': False, 'follow_request_sent': Fal...",Ohio
1,,,Sun Aug 07 23:59:45 +0000 2016,"{'user_mentions': [], 'symbols': [], 'hashtags...",,2,False,,,762438110500093952,...,7.624375932334449e+17,0,False,,"<a href=""https://about.twitter.com/products/tw...",Might be the dumbest thing I’ve ever read. Exp...,,False,"{'verified': False, 'follow_request_sent': Fal...","ÜT: 41.271945,-73.737193"
2,,,Sun Aug 07 23:59:38 +0000 2016,"{'media': [{'indices': [114, 137], 'url': 'htt...","{'media': [{'indices': [114, 137], 'expanded_u...",1,False,,,762438082087915520,...,,0,False,,"<a href=""www.zeroslant.com"" rel=""nofollow"">Zer...",Quick Take: New York #Yankees defeat Cleveland...,,False,"{'verified': False, 'follow_request_sent': Fal...",SF & LA


In [89]:
#df_redsox_all['retweet_count'].value_counts()

In [90]:
df_yankees = df_yankees_all[['text', 'created_at', 'location', 'retweet_count', 'favorite_count']]

In [91]:
df_yankees.shape

(10000, 5)

In [92]:
df_yankees.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1
5,"RT @JeopardySports: ""Who is: Alex Rodriguez?""\...",Sun Aug 07 23:59:31 +0000 2016,,250,0
6,RT: *Mature Audiences Only!*\n#WWE #IronSheik ...,Sun Aug 07 23:59:26 +0000 2016,"Boston, MA",1,0
7,TIX https://t.co/xvncbRb1y0 2x New York #Yanke...,Sun Aug 07 23:59:19 +0000 2016,,0,0
8,TIX https://t.co/xvncbRb1y0 L L L NEW YORK #Ya...,Sun Aug 07 23:59:18 +0000 2016,,0,0
9,"RT @JeopardySports: ""Who is: Alex Rodriguez?""\...",Sun Aug 07 23:59:09 +0000 2016,"Hamilton,Ontario",250,0


In [94]:
# DONT USE
lats = []
longs = []
for location in df_redsox['location']:
    g = geocoder.arcgis(location)
    if g.latlng != []:
        lats.append(g.latlng[0])
        longs.append(g.latlng[1])
    else:
        lats.append(np.nan)
        longs.append(np.nan)

In [93]:
unique_locations = df_yankees['location'].unique()

In [94]:
len(unique_locations)
#unique_locations = unique_locations[:10]
#unique_locations

2955

In [38]:
# USE THIS
location_coords = {}
for location in unique_locations:
    location_coords[location] = geocoder.arcgis(location)

In [95]:
location_coords[''].latlng

[]

In [96]:
def get_lat(x):
    if x.latlng != []:
        return(x.latlng[0])
    else:
        return(np.nan)

In [97]:
def get_long(x):
    if x.latlng != []:
        return(x.latlng[1])
    else:
        return(np.nan)

In [98]:
get_lat(location_coords['New York, NY'])

40.71426940400045

In [99]:
df_yankees['latitude'] = df_yankees['location'].apply(lambda x: get_lat(location_coords[x]))
df_yankees['longitude'] = df_yankees['location'].apply(lambda x: get_long(location_coords[x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [100]:
df_yankees.head(20)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0,40.250338,-83.000177
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2,,
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1,14.133328,-89.816667
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0,,
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1,38.757069,-77.538489
5,"RT @JeopardySports: ""Who is: Alex Rodriguez?""\...",Sun Aug 07 23:59:31 +0000 2016,,250,0,,
6,RT: *Mature Audiences Only!*\n#WWE #IronSheik ...,Sun Aug 07 23:59:26 +0000 2016,"Boston, MA",1,0,42.358428,-71.059766
7,TIX https://t.co/xvncbRb1y0 2x New York #Yanke...,Sun Aug 07 23:59:19 +0000 2016,,0,0,,
8,TIX https://t.co/xvncbRb1y0 L L L NEW YORK #Ya...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,
9,"RT @JeopardySports: ""Who is: Alex Rodriguez?""\...",Sun Aug 07 23:59:09 +0000 2016,"Hamilton,Ontario",250,0,43.233407,-79.949637


In [30]:
#g = geocoder.arcgis(df_redsox['location'][3])
#g.latlng

In [31]:
#df_redsox['location'][:10].apply(geocoder.arcgis)

In [32]:
#print(len(lats))
#print(len(longs))

In [33]:
#df_redsox['latitude'] = pd.Series(lats)
#df_redsox['longitude'] = pd.Series(longs)

In [101]:
df_yankees['created_at'].dtypes

dtype('O')

In [102]:
df_yankees['created_at'].head()

0    Sun Aug 07 23:59:49 +0000 2016
1    Sun Aug 07 23:59:45 +0000 2016
2    Sun Aug 07 23:59:38 +0000 2016
3    Sun Aug 07 23:59:35 +0000 2016
4    Sun Aug 07 23:59:32 +0000 2016
Name: created_at, dtype: object

In [103]:
# First, parse to datetime, then get YYYY-MM-DD format.
df_yankees['time'] = pd.to_datetime(df_yankees['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')
df_yankees['time'] = pd.to_datetime(df_yankees['time'].dt.date, format='%Y-%m-%d')
#date = datetime.strptime('Thu Apr 23 13:38:19 +0000 2009','%a %b %d %H:%M:%S +0000 %Y')
#date = datetime.strftime(date, '%Y %d %m')
#type(date)
#df_redsox['time'].dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [104]:
df_yankees.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0,40.250338,-83.000177,2016-08-07
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2,,,2016-08-07
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1,14.133328,-89.816667,2016-08-07
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0,,,2016-08-07
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1,38.757069,-77.538489,2016-08-07
5,"RT @JeopardySports: ""Who is: Alex Rodriguez?""\...",Sun Aug 07 23:59:31 +0000 2016,,250,0,,,2016-08-07
6,RT: *Mature Audiences Only!*\n#WWE #IronSheik ...,Sun Aug 07 23:59:26 +0000 2016,"Boston, MA",1,0,42.358428,-71.059766,2016-08-07
7,TIX https://t.co/xvncbRb1y0 2x New York #Yanke...,Sun Aug 07 23:59:19 +0000 2016,,0,0,,,2016-08-07
8,TIX https://t.co/xvncbRb1y0 L L L NEW YORK #Ya...,Sun Aug 07 23:59:18 +0000 2016,,0,0,,,2016-08-07
9,"RT @JeopardySports: ""Who is: Alex Rodriguez?""\...",Sun Aug 07 23:59:09 +0000 2016,"Hamilton,Ontario",250,0,43.233407,-79.949637,2016-08-07


In [106]:
df_yankees['time'].value_counts()

2016-08-09    1000
2016-08-12    1000
2016-08-07    1000
2016-08-15    1000
2016-08-10    1000
2016-08-13    1000
2016-08-08    1000
2016-08-16    1000
2016-08-11    1000
2016-08-14    1000
Name: time, dtype: int64

## Sentiment Analysis ##

In [107]:
df_yankees.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0,40.250338,-83.000177,2016-08-07
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2,,,2016-08-07
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1,14.133328,-89.816667,2016-08-07
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0,,,2016-08-07
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1,38.757069,-77.538489,2016-08-07


In [108]:
from textblob import TextBlob

In [109]:
tweets = df_yankees['text']
sentiments = []
for tweet in tweets:
    wiki = TextBlob(tweet)
    sentiments.append(wiki.sentiment.polarity)

In [110]:
df_yankees['sentiment'] = pd.Series(sentiments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [111]:
df_yankees.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0,40.250338,-83.000177,2016-08-07,0.0
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2,,,2016-08-07,-0.131944
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1,14.133328,-89.816667,2016-08-07,0.234848
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0,,,2016-08-07,0.7
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1,38.757069,-77.538489,2016-08-07,0.0


In [223]:
#df_redsox.to_csv('df_redsox.csv')

In [113]:
g = geocoder.arcgis('New York, NY')
ny_lat = g.latlng[0]
ny_long = g.latlng[1]
print(ny_lat, ny_long)

40.71426940400045 -74.00596992899966


In [114]:
import gpxpy.geo
import math
haversine_distances = []
for i in range(df_yankees.shape[0]):
#dist = gpxpy.geo.haversine_distance(52.2296756, 21.0122287, 52.406374, 16.9251681)
#print(dist)
    if math.isnan(df_yankees.iloc[i]['latitude']):
        haversine_distances.append(np.nan)
    else:
        distance = gpxpy.geo.haversine_distance(ny_lat, ny_long, df_yankees.iloc[i]['latitude'], df_yankees.iloc[i]['longitude'])
        haversine_distances.append(distance)

In [115]:
len(haversine_distances)

10000

In [116]:
df_yankees['hav_distance'] = pd.Series(haversine_distances)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [117]:
df_yankees.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment,hav_distance
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0,40.250338,-83.000177,2016-08-07,0.0,762102.0
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2,,,2016-08-07,-0.131944,
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1,14.133328,-89.816667,2016-08-07,0.234848,3329860.0
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0,,,2016-08-07,0.7,
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1,38.757069,-77.538489,2016-08-07,0.0,372244.3


In [119]:
df_yankees.rename(columns={'time': 'dates'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [120]:
df_yankees.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0,40.250338,-83.000177,2016-08-07,0.0,762102.0
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2,,,2016-08-07,-0.131944,
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1,14.133328,-89.816667,2016-08-07,0.234848,3329860.0
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0,,,2016-08-07,0.7,
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1,38.757069,-77.538489,2016-08-07,0.0,372244.3


In [124]:
df_yankees['dates'].value_counts()

2016-08-09    1000
2016-08-12    1000
2016-08-07    1000
2016-08-15    1000
2016-08-10    1000
2016-08-13    1000
2016-08-08    1000
2016-08-16    1000
2016-08-11    1000
2016-08-14    1000
Name: dates, dtype: int64

In [122]:
df_yankees.to_csv('df_yankees.csv')

In [52]:
outcome_by_date = {
    '2016-08-07 00:00:00': 'Win',
    '2016-08-08 00:00:00': 'No Game',
    '2016-08-09 00:00:00': 'Loss',
    '2016-08-10 00:00:00': 'Win',
    '2016-08-11 00:00:00': 'Win',
    '2016-08-12 00:00:00': 'Win',
    '2016-08-13 00:00:00': 'Win',
    '2016-08-14 00:00:00': 'Loss',
    '2016-08-15 00:00:00': 'Win',
    '2016-08-16 00:00:00': 'Loss'
}

In [53]:
# Apply outcome values to new column.
df_yankees['outcome'] = df_yankees['dates'].apply(lambda x: outcome_by_date[str(x)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [54]:
df_yankees.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance,outcome
0,RT @RapSheet: Goodbye to one of the greatest t...,Sun Aug 07 23:59:49 +0000 2016,Ohio,235,0,40.250338,-83.000177,2016-08-07,0.0,762102.0,Win
1,Might be the dumbest thing I’ve ever read. Exp...,Sun Aug 07 23:59:45 +0000 2016,"ÜT: 41.271945,-73.737193",0,2,,,2016-08-07,-0.131944,,Win
2,Quick Take: New York #Yankees defeat Cleveland...,Sun Aug 07 23:59:38 +0000 2016,SF & LA,0,1,14.133328,-89.816667,2016-08-07,0.234848,3329860.0,Win
3,RT @MikeyAdams420: Kiss yourself goodbye @AROD...,Sun Aug 07 23:59:35 +0000 2016,,16,0,,,2016-08-07,0.7,,Win
4,Congratulations Ichiro #Ichiro3000 #Mariners #...,Sun Aug 07 23:59:32 +0000 2016,"Northern Virginia, USA",0,1,38.757069,-77.538489,2016-08-07,0.0,372244.3,Win


In [None]:
df_yankees['dates'].value_counts()

In [55]:
df_yankees.to_csv('df_yankees.csv')

In [None]:
df_yankees.rename(columns={'hav_distance': 'hav_distance_ny'}, inplace=True)