**Note:** place file one level up in directory structure so it can run properly.

# Analyzing Baseball Fan Sentiment with Twitter Data (Whitesox) #

In [1]:
import glob
import pandas as pd
import numpy as np

# For working with JSON data.
try:
    import json
except ImportError:
    import simplejson as json
    
# For maps.
from mpl_toolkits.basemap import Basemap
import geocoder
import matplotlib.pyplot as plt
%matplotlib inline

#import warnings
#warnings.simplefilter(action = "ignore", category = FutureWarning)

## Intial Exploration (10,000 Tweets Version) ##

In [2]:
for filename in glob.iglob('data/whitesox/*.txt'):
     print(filename)

data/whitesox/whitesox_search_08_07_1000.txt
data/whitesox/whitesox_search_08_08_1000.txt
data/whitesox/whitesox_search_08_09_1000.txt
data/whitesox/whitesox_search_08_10_1000.txt
data/whitesox/whitesox_search_08_11_1000.txt
data/whitesox/whitesox_search_08_12_1000.txt
data/whitesox/whitesox_search_08_13_1000.txt
data/whitesox/whitesox_search_08_14_1000.txt
data/whitesox/whitesox_search_08_15_1000.txt
data/whitesox/whitesox_search_08_16_1000.txt


In [3]:
# Read in JSON data and store in pandas DataFrame.
tweets = []
locations = []
for filename in glob.iglob('data/whitesox/*.txt'):
    tweets_filename = filename
    tweets_file = open(tweets_filename, "r")
    for line in tweets_file:
        try:
            # Read in each line of file, convert to JSON object. 
            tweet = json.loads(line.strip())
            # Make sure tweet has text content.
            if 'text' in tweet:
                tweets.append(tweet)
                locations.append(tweet['user']['location'])
        except:
            # Skip any non-JSON-formatted data that may have been captured.
            continue

In [4]:
df_whitesox_all = pd.DataFrame(tweets)

In [5]:
df_whitesox_all['location'] = pd.Series(locations)

In [6]:
df_whitesox_all.shape

(10000, 31)

In [7]:
df_whitesox_all.head(3)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,id_str,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user,location
0,,"{'coordinates': [-87.63381504, 41.83000199], '...",Sun Aug 07 23:59:05 +0000 2016,"{'urls': [{'indices': [96, 119], 'expanded_url...",,1,False,"{'coordinates': [41.83000199, -87.63381504], '...",762437941259939841,762437941259939841,...,,,0,False,,"<a href=""http://instagram.com"" rel=""nofollow"">...","Last weekend Cubs game, this weekend Sox game...",False,"{'id_str': '18255890', 'url': 'https://t.co/Ig...","Chicago, IL"
1,,,Sun Aug 07 23:56:26 +0000 2016,"{'urls': [], 'hashtags': [], 'symbols': [], 'm...","{'media': [{'source_user_id_str': '53197137', ...",0,False,,762437275988832256,762437275988832256,...,,,31,False,"{'text': 'Happy birthday, Tommy Kahnle! Show t...","<a href=""http://twitter.com/download/iphone"" r...","RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",False,"{'id_str': '792677689', 'url': None, 'statuses...",
2,,,Sun Aug 07 23:56:11 +0000 2016,"{'urls': [{'indices': [71, 94], 'expanded_url'...","{'media': [{'indices': [116, 139], 'id_str': '...",0,False,,762437211107176451,762437211107176451,...,,,0,False,,"<a href=""http://dlvr.it"" rel=""nofollow"">dlvr.i...",#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,False,"{'id_str': '559496785', 'url': 'https://t.co/w...","Chicago, IL"


In [190]:
#df_redsox_all['retweet_count'].value_counts()

In [8]:
df_whitesox = df_whitesox_all[['text', 'created_at', 'location', 'retweet_count', 'favorite_count']]

In [9]:
df_whitesox.shape

(10000, 5)

In [10]:
df_whitesox.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0
5,@DisgustedNYer Yes I've definitely been follow...,Sun Aug 07 23:53:48 +0000 2016,,0,0
6,@mlbtraderumors hey @whitesox take a chance. H...,Sun Aug 07 23:53:27 +0000 2016,Metro City USA,0,2
7,RT @iamchrislane: Great time throwing out the ...,Sun Aug 07 23:52:45 +0000 2016,"Kernersville , NC",76,0
8,Machado flashes early power as Orioles roll ov...,Sun Aug 07 23:52:05 +0000 2016,Chicago Suburbs,1,1
9,@lovepucks @whitesox @CarlosRodon16 you're dumb,Sun Aug 07 23:49:50 +0000 2016,Chiraq,0,0


In [94]:
# DONT USE
lats = []
longs = []
for location in df_redsox['location']:
    g = geocoder.arcgis(location)
    if g.latlng != []:
        lats.append(g.latlng[0])
        longs.append(g.latlng[1])
    else:
        lats.append(np.nan)
        longs.append(np.nan)

In [11]:
unique_locations = df_whitesox['location'].unique()

In [12]:
len(unique_locations)
#unique_locations = unique_locations[:10]
#unique_locations

1867

In [13]:
# USE THIS
location_coords = {}
for location in unique_locations:
    location_coords[location] = geocoder.arcgis(location)

In [14]:
location_coords[''].latlng

[]

In [15]:
def get_lat(x):
    if x.latlng != []:
        return(x.latlng[0])
    else:
        return(np.nan)

In [16]:
def get_long(x):
    if x.latlng != []:
        return(x.latlng[1])
    else:
        return(np.nan)

In [17]:
get_lat(location_coords['Chicago, IL'])

41.85002711900046

In [18]:
df_whitesox['latitude'] = df_whitesox['location'].apply(lambda x: get_lat(location_coords[x]))
df_whitesox['longitude'] = df_whitesox['location'].apply(lambda x: get_long(location_coords[x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [19]:
df_whitesox.head(20)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1,41.850027,-87.650046
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0,,
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0,41.850027,-87.650046
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0,,
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0,,
5,@DisgustedNYer Yes I've definitely been follow...,Sun Aug 07 23:53:48 +0000 2016,,0,0,,
6,@mlbtraderumors hey @whitesox take a chance. H...,Sun Aug 07 23:53:27 +0000 2016,Metro City USA,0,2,34.13939,-118.361967
7,RT @iamchrislane: Great time throwing out the ...,Sun Aug 07 23:52:45 +0000 2016,"Kernersville , NC",76,0,36.119858,-80.07365
8,Machado flashes early power as Orioles roll ov...,Sun Aug 07 23:52:05 +0000 2016,Chicago Suburbs,1,1,42.096389,-87.936797
9,@lovepucks @whitesox @CarlosRodon16 you're dumb,Sun Aug 07 23:49:50 +0000 2016,Chiraq,0,0,30.721196,75.172519


In [30]:
#g = geocoder.arcgis(df_redsox['location'][3])
#g.latlng

In [31]:
#df_redsox['location'][:10].apply(geocoder.arcgis)

In [32]:
#print(len(lats))
#print(len(longs))

In [33]:
#df_redsox['latitude'] = pd.Series(lats)
#df_redsox['longitude'] = pd.Series(longs)

In [20]:
df_whitesox['created_at'].dtypes

dtype('O')

In [21]:
df_whitesox['created_at'].head()

0    Sun Aug 07 23:59:05 +0000 2016
1    Sun Aug 07 23:56:26 +0000 2016
2    Sun Aug 07 23:56:11 +0000 2016
3    Sun Aug 07 23:55:19 +0000 2016
4    Sun Aug 07 23:54:26 +0000 2016
Name: created_at, dtype: object

In [22]:
# First, parse to datetime, then get YYYY-MM-DD format.
df_whitesox['time'] = pd.to_datetime(df_whitesox['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')
df_whitesox['time'] = pd.to_datetime(df_whitesox['time'].dt.date, format='%Y-%m-%d')
#date = datetime.strptime('Thu Apr 23 13:38:19 +0000 2009','%a %b %d %H:%M:%S +0000 %Y')
#date = datetime.strftime(date, '%Y %d %m')
#type(date)
#df_redsox['time'].dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [23]:
df_whitesox.head(10)

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1,41.850027,-87.650046,2016-08-07
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0,,,2016-08-07
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0,41.850027,-87.650046,2016-08-07
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0,,,2016-08-07
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0,,,2016-08-07
5,@DisgustedNYer Yes I've definitely been follow...,Sun Aug 07 23:53:48 +0000 2016,,0,0,,,2016-08-07
6,@mlbtraderumors hey @whitesox take a chance. H...,Sun Aug 07 23:53:27 +0000 2016,Metro City USA,0,2,34.13939,-118.361967,2016-08-07
7,RT @iamchrislane: Great time throwing out the ...,Sun Aug 07 23:52:45 +0000 2016,"Kernersville , NC",76,0,36.119858,-80.07365,2016-08-07
8,Machado flashes early power as Orioles roll ov...,Sun Aug 07 23:52:05 +0000 2016,Chicago Suburbs,1,1,42.096389,-87.936797,2016-08-07
9,@lovepucks @whitesox @CarlosRodon16 you're dumb,Sun Aug 07 23:49:50 +0000 2016,Chiraq,0,0,30.721196,75.172519,2016-08-07


## Sentiment Analysis ##

In [24]:
df_whitesox.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1,41.850027,-87.650046,2016-08-07
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0,,,2016-08-07
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0,41.850027,-87.650046,2016-08-07
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0,,,2016-08-07
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0,,,2016-08-07


In [25]:
from textblob import TextBlob

In [26]:
tweets = df_whitesox['text']
sentiments = []
for tweet in tweets:
    wiki = TextBlob(tweet)
    sentiments.append(wiki.sentiment.polarity)

In [27]:
df_whitesox['sentiment'] = pd.Series(sentiments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [28]:
df_whitesox.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1,41.850027,-87.650046,2016-08-07,-0.06
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0,,,2016-08-07,0.75
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0,41.850027,-87.650046,2016-08-07,0.375
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0,,,2016-08-07,0.7
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0,,,2016-08-07,0.1


In [223]:
#df_redsox.to_csv('df_redsox.csv')

In [29]:
g = geocoder.arcgis('Chicago, IL')
chicago_lat = g.latlng[0]
chicago_long = g.latlng[1]
print(chicago_lat, chicago_long)

41.85002711900046 -87.65004618299969


In [30]:
import gpxpy.geo
import math
haversine_distances = []
for i in range(df_whitesox.shape[0]):
#dist = gpxpy.geo.haversine_distance(52.2296756, 21.0122287, 52.406374, 16.9251681)
#print(dist)
    if math.isnan(df_whitesox.iloc[i]['latitude']):
        haversine_distances.append(np.nan)
    else:
        distance = gpxpy.geo.haversine_distance(chicago_lat, chicago_long, df_whitesox.iloc[i]['latitude'], df_whitesox.iloc[i]['longitude'])
        haversine_distances.append(distance)

In [31]:
len(haversine_distances)

10000

In [32]:
df_whitesox['hav_distance'] = pd.Series(haversine_distances)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [33]:
df_whitesox.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,time,sentiment,hav_distance
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1,41.850027,-87.650046,2016-08-07,-0.06,0.0
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0,,,2016-08-07,0.75,
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0,41.850027,-87.650046,2016-08-07,0.375,0.0
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0,,,2016-08-07,0.7,
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0,,,2016-08-07,0.1,


In [34]:
df_whitesox.rename(columns={'time': 'dates'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [35]:
df_whitesox.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1,41.850027,-87.650046,2016-08-07,-0.06,0.0
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0,,,2016-08-07,0.75,
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0,41.850027,-87.650046,2016-08-07,0.375,0.0
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0,,,2016-08-07,0.7,
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0,,,2016-08-07,0.1,


In [36]:
outcome_by_date = {
    '2016-08-07 00:00:00': 'Loss',
    '2016-08-08 00:00:00': 'No Game',
    '2016-08-09 00:00:00': 'Win',
    '2016-08-10 00:00:00': 'Loss',
    '2016-08-11 00:00:00': 'Loss',
    '2016-08-12 00:00:00': 'Win',
    '2016-08-13 00:00:00': 'Win',
    '2016-08-14 00:00:00': 'Loss',
    '2016-08-15 00:00:00': 'No Game',
    '2016-08-16 00:00:00': 'Loss'
}

In [37]:
# Apply outcome values to new column.
df_whitesox['outcome'] = df_whitesox['dates'].apply(lambda x: outcome_by_date[str(x)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [38]:
df_whitesox.head()

Unnamed: 0,text,created_at,location,retweet_count,favorite_count,latitude,longitude,dates,sentiment,hav_distance,outcome
0,"Last weekend Cubs game, this weekend Sox game...",Sun Aug 07 23:59:05 +0000 2016,"Chicago, IL",0,1,41.850027,-87.650046,2016-08-07,-0.06,0.0,Loss
1,"RT @whitesox: Happy birthday, Tommy Kahnle! Sh...",Sun Aug 07 23:56:26 +0000 2016,,31,0,,,2016-08-07,0.75,,Loss
2,#Autograph GORDON BECKHAM #SignED #Chicago #Wh...,Sun Aug 07 23:56:11 +0000 2016,"Chicago, IL",0,0,41.850027,-87.650046,2016-08-07,0.375,0.0,Loss
3,@socmediaisdumb @whitesox @FlavaFraz21 b/c he ...,Sun Aug 07 23:55:19 +0000 2016,Jersey..not the guido part,0,0,,,2016-08-07,0.7,,Loss
4,RT @dailyherald: Machado flashes early power a...,Sun Aug 07 23:54:26 +0000 2016,,1,0,,,2016-08-07,0.1,,Loss


In [39]:
df_whitesox.to_csv('df_whitesox.csv')