## Post-processing for the sports tweets

- sportsn.csv is a csv file from main analysis running in cPouta containing the geoparsed tweets
- geotaggedn.csv is a csv file from locally done analysis for geotagged tweets

## Importing data

In [18]:
#import modules
import glob
from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster
from pyproj import CRS

In [19]:
#read the geoparsed tweets to one gdf
names = []

geodf = gpd.GeoDataFrame()

for name in glob.glob(r"sports*.csv"):
    df = pd.read_csv(name)
    names.append(name)
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))
    geodf = geodf.append(gdf)

In [20]:
#delete the geocoded tweets from this df, they were handled separately
geodf = geodf[(geodf['geom'].isna())]
geodf

Unnamed: 0.1,Unnamed: 0,full_text,geom,geometry,lang,lat,lemma_text,lemmas,lon
0,67,Trench run... #suomenlinna #fb http://t.co/MRE...,,POINT (388272.705 6669477.595),en,6.669478e+06,trench run ... suomenlinna fb http://t.co/mre...,"['trench', 'run', '...', 'suomenlinna', 'fb', ...",388272.705362
1,9033,I turned 40 years this week so I stretched tod...,,POINT (366700.059 6684680.631),en,6.684681e+06,I turn 40 year this week so I stretch today '...,"['I', 'turn', '40', 'year', 'this', 'week', 's...",366700.059287
2,10613,I wish it gets cold enough and the snow stays ...,,POINT (370853.565 6681537.224),en,6.681537e+06,I wish it get cold enough and the snow stay h...,"['I', 'wish', 'it', 'get', 'cold', 'enough', '...",370853.564617
3,11209,"@aksalmi yep, it's not that far away this year...",,POINT (391713.120 6685777.801),en,6.685778e+06,"@aksalmi yep , it 's not that far away this y...","['@aksalmi', 'yep', ',', 'it', ""'s"", 'not', 't...",391713.120360
4,11554,Celebrated 10.10.10 by running my third marath...,,POINT (391713.120 6685777.801),en,6.685778e+06,celebrated 10.10.10 by run my third marathon ...,"['celebrated', '10.10.10', 'by', 'run', 'my', ...",391713.120360
...,...,...,...,...,...,...,...,...,...
271,490521,Kettu juoksi vastaan Eiranrannassa. #helsinki,,POINT (385446.347 6672081.187),fi,6.672081e+06,kettu juosta vastaan eiranranta . helsinki,"['kettu', 'juosta', 'vastaan', 'eiranranta', '...",385446.347494
272,495702,"Oli HJK:lla paikkoja aikaisemminkin, mutta kyl...",,POINT (385446.347 6672081.187),fi,6.672081e+06,"olla HJK paikka aikaisemmin , mutta kylläpä s...","['olla', 'HJK', 'paikka', 'aikaisemmin', ',', ...",385446.347494
273,496585,Eilen Helsingissä t-paita päällä. Tänään jalka...,,POINT (385446.347 6672081.187),fi,6.672081e+06,eilen Helsinki Tpaita päällä . tänään jalkapa...,"['eilen', 'Helsinki', 'Tpaita', 'päällä', '.',...",385446.347494
274,498189,Osa näistä ruotsalaiskannattajista juoksee Hel...,,POINT (385446.347 6672081.187),fi,6.672081e+06,osa tämä ruotsalaiskannattaja juosta Helsinki...,"['osa', 'tämä', 'ruotsalaiskannattaja', 'juost...",385446.347494


In [21]:
#import geotagged tweets 
for name in glob.glob(r"geotagged*.csv"):
    df = pd.read_csv(name)
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon, df.lat))
    #convert to epsg 3067 to match the crs of geoparsed tweets
    gdf.crs = CRS.from_epsg(4326)
    gdf = gdf.to_crs(3067)
    #append to geoparsed tweets
    geodf = geodf.append(gdf)
    #geodf.reset_index(inplace=True)

geodf

Unnamed: 0.1,Unnamed: 0,full_text,geom,geometry,lang,lat,lemma_text,lemmas,lon
0,67,Trench run... #suomenlinna #fb http://t.co/MRE...,,POINT (388272.705 6669477.595),en,6.669478e+06,trench run ... suomenlinna fb http://t.co/mre...,"['trench', 'run', '...', 'suomenlinna', 'fb', ...",388272.705362
1,9033,I turned 40 years this week so I stretched tod...,,POINT (366700.059 6684680.631),en,6.684681e+06,I turn 40 year this week so I stretch today '...,"['I', 'turn', '40', 'year', 'this', 'week', 's...",366700.059287
2,10613,I wish it gets cold enough and the snow stays ...,,POINT (370853.565 6681537.224),en,6.681537e+06,I wish it get cold enough and the snow stay h...,"['I', 'wish', 'it', 'get', 'cold', 'enough', '...",370853.564617
3,11209,"@aksalmi yep, it's not that far away this year...",,POINT (391713.120 6685777.801),en,6.685778e+06,"@aksalmi yep , it 's not that far away this y...","['@aksalmi', 'yep', ',', 'it', ""'s"", 'not', 't...",391713.120360
4,11554,Celebrated 10.10.10 by running my third marath...,,POINT (391713.120 6685777.801),en,6.685778e+06,celebrated 10.10.10 by run my third marathon ...,"['celebrated', '10.10.10', 'by', 'run', 'my', ...",391713.120360
...,...,...,...,...,...,...,...,...,...
4,4,To #California it is. #business is going to #g...,0101000020E6100000541B5A6CACCF38409A000D808222...,POINT (378924.860 6683445.954),en,6.026961e+01,to # California it be . business be go to glo...,"['to', '#', 'California', 'it', 'be', '.', 'bu...",24.811225
5,5,#Repost elmobistro \n ···\nNew Seagulls table ...,0101000020E6100000AC0C99E1F5EF3840B8CF85EC1716...,POINT (385562.235 6672421.377),en,6.017261e+01,# repost elmobistro \n ··· \n New seagull ta...,"['#', 'repost', 'elmobistro', '\n ', '···', '\...",24.937346
6,6,"Etätyössä parasta: sopivasti liikuntaa, tervee...",0101000020E61000003B40E6B974ED3840BFF51FA17F15...,POINT (385003.328 6671920.931),fi,6.016796e+01,"etätyö hyvä : sopivasti liikunta , terveellin...","['etätyö', 'hyvä', ':', 'sopivasti', 'liikunta...",24.927562
7,7,Urheilu ja musa #u20fi #Lauantai http://t.co/7...,0101000020E61000001E38252897083940CAB97FF29825...,POINT (391300.943 6685744.389),fi,6.029373e+01,urheilu ja musa u20fi # lauantai http://t.co/...,"['urheilu', 'ja', 'musa', 'u20fi', '#', 'lauan...",25.033556


## Removing city geotags

In [22]:
def delete_city_geotags(geodf):
    """
    The posts tagged to a city geotag create artificial hotspots and should be removed for statistical analyses.
    
    Parameters:
    geodf| String: name of the geodataframe holding geocoded tweets
    """
    
    Helsinki_point = Point(385446.3474936858, 6672081.187213039)
    Espoo_point = Point(370853.5646174778, 6681537.223923998)
    Vantaa_point = Point(391713.1203601367, 6685777.800663358)
    
    geodf = geodf.drop(geodf[(geodf["geometry"]== Helsinki_point) | (geodf["geometry"]== Espoo_point) | (geodf["geometry"]== Vantaa_point)].index)
    
    geodf.reset_index(inplace= True)
    return geodf


In [23]:
citynames_del = delete_city_geotags(geodf)
citynames_del

Unnamed: 0.1,index,Unnamed: 0,full_text,geom,geometry,lang,lat,lemma_text,lemmas,lon
0,324,398987,"Lehmät käyttäytyy oudosti, tuijottavat ja sit ...",,POINT (380887.070 6696292.263),fi,6.696292e+06,"lehmä käyttäytyy oudosti , tuijottaa ja sitte...","['lehmä', 'käyttäytyy', 'oudosti', ',', 'tuijo...",380887.070009
1,324,138819,Kuntosalille Hakunilan uimahalliin http://t.co...,,POINT (395112.392 6683904.373),fi,6.683904e+06,kuntosali hakunila uimahalli http://t.co/LhiM...,"['kuntosali', 'hakunila', 'uimahalli', 'http:/...",395112.391609
2,379,451865,JES! Lauantaina Otaniemeen kattoo ku @WestendI...,,POINT (379530.879 6673879.195),fi,6.673879e+06,JES ! lauantai Otaniemi katto kun feestendind...,"['JES', '!', 'lauantai', 'Otaniemi', 'katto', ...",379530.879264
3,383,452905,Edarin treenipelit jatkuvat. Tänään vastassa I...,,POINT (392938.143 6677710.445),fi,6.677710e+06,edari treenipeli jatkua . tänään vasta IHS . ...,"['edari', 'treenipeli', 'jatkua', '.', 'tänään...",392938.143094
4,384,452922,Vantaan Energia Areenasta EräViikinkien kotiha...,,POINT (391713.120 6685777.801),fi,6.685778e+06,Vantaa energia areena Eräviikinki kotihalli ....,"['Vantaa', 'energia', 'areena', 'Eräviikinki',...",391713.120360
...,...,...,...,...,...,...,...,...,...,...
14519,1341,1341,@rinsku82 tosi sitkeetä tauti. Itellä joulukuu...,0101000020E6100000963F4241E7CF3840974362EDB520...,POINT (378922.596 6681879.486),fi,6.025555e+01,@rinsku82 tosi sitkee tauti . ittellä jouluku...,"['@rinsku82', 'tosi', 'sitkee', 'tauti', '.', ...",24.812122
14520,1342,1342,@rinsku82 Kiitos! Liikunnasta saa ainakin lisä...,0101000020E6100000BF62CBFAE6CF38401A47510CB520...,POINT (378922.264 6681876.507),fi,6.025553e+01,@rinsku82 kiitos ! liikunta saada ainakin lis...,"['@rinsku82', 'kiitos', '!', 'liikunta', 'saad...",24.812118
14521,1343,1343,Fillari kaivettu kellarista ja kevään ensimmäi...,0101000020E6100000E5BEEED711C6384043C0A58C4D1D...,POINT (376697.344 6678986.700),fi,6.022893e+01,fillari kaivaa kellari ja kevät ensimmäinen l...,"['fillari', 'kaivaa', 'kellari', 'ja', 'kevät'...",24.773710
14522,1344,1344,Tämä on jalkapalloa - ei futista! Lajin kunink...,0101000020E6100000B5ADA0487BC63840C994EE32EC1F...,POINT (376863.270 6681262.225),fi,6.024940e+01,tämä olla jalkapallo - ei futti ! laji kunink...,"['tämä', 'olla', 'jalkapallo', '-', 'ei', 'fut...",24.775319


## Saving sportslist to a column

In [None]:
sportslist_all = ["running", "run", "walk", "walking", "jog" ,"jogging", "hike", "hiking", "trek", "trekking", 
                  "bicycle", "bike", "biking","cycling", "exercise", "exercising", "ski", "skiing", "skate", "skating", 
                  "workout", "training", "sport", "sporting", "canoe", "canoeing", "ice-hockey", "basketball",
                 "hockey", "football", "tennis", "dance", "dancing", "rowing", "sweat", "sweating", "badminton",
                 "floorball", "volley", "volleyball",  "beach volley", "yoga","swimming", "swim", "sail", "sailing", "kayak", "kayaking", "squash", "tabletennis",
                "kävely", "kävellä", "käveleminen" "juoksu", "juosta", "juokseminen", "hiihto", "hiihtää", "hiihtäminen",
                 "lenkki", "lenkkeily", "lenkkeillä", "treenata", "treenaaminen", "urheilla", "meloa", "melonta", "soutaa", 
                 "soutaminen", "patikointi", "patikoida", "patikoiminen",  
                  "treeni", "urheilu", "liikunta", "pyörä", "pyöräily", "pyöräillä", "pyöräileminen", "jääkiekko", "hockey",
                  "jalkapallo", "tennis", "tanssi", "tanssia", "tanssiminen", "hiki", "hikoilla", "sulkapallo",
                     "sähly", "salibandy", "lentopallo",  "lentis", "luistella", "luisteleminen", "luistelu", "kuntosali",          
                 "koris","futis", "koripallo", "uinti", "uida", "uiminen", "kajakki", "pujehtia", 
                 "purjehdus", "lätkä", "jooga", "squash", "kössi", "pingis", "pöytätennis","jooksmine", "jooksma", 
                 "jooks", "kõndimine", "kõnd", "kõndima", "jalutama", "jalutus", 
                 "jalutamine", "sörkimine", "sörkima", "sörk", "sörksjooks", "matk", "matkamine", "matkama",
                   "jalgratas", "jalgrattasõit", "rattasõit", "treening", "treenima", "võimlema", "võimlemine", 
                 "uisutamine", "uisutama", "suusatama", "suusatamine", "sportima", "sportimine", "trenn", "sport", 
                 "jõusaal", "võimla", "spordihall", "spordisaal", "korvpall", "koss", "kanuu",  "kanuutama", 
                 "kanuutamine", "kanuusõit", "jäähoki", "hoki", "jalgpall", "jalka", "tennis", "tants", 
                 "tantsimine", "tantsima", "sõudmine", "sõudma", "aerutama", "aerutamine", "higi", "higistama", 
                 "higistamine", "sulgpall", "bädminton", "saalihoki", "volle", "rannavolle", "võrkpall", 
                 "rannavõrkpall", "joogatama", "jooga", "ujuma", "ujumine", "meresüst",  "kajakisõit", "purjetama", 
                 "purjetamine", "squash", "seinatennis", "lauatennis"]
   

    
for i, row in geodf.iterrows():
    try:
        sports_list= [lemma for lemma in geodf["lemmas"][i] if(lemma in sportslist_all)]
        geodf["sports"][i] = sports_list
        
    except KeyError:
        print("avainerrori")
        
geodf

## OLS testing

- this will be done with a separate R script

In [1]:
import pandas.plotting
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn


data = gpd.read_file(r"Geotag_pip.shp")

ImportError: dlopen(/Users/kosokoso/opt/anaconda3/envs/nlp/lib/python3.7/site-packages/scipy/special/_ufuncs.cpython-37m-darwin.so, 2): Library not loaded: @rpath/libgfortran.3.dylib
  Referenced from: /Users/kosokoso/opt/anaconda3/envs/nlp/lib/python3.7/site-packages/scipy/special/_ufuncs.cpython-37m-darwin.so
  Reason: image not found

In [None]:
data.drop([0], axis=0, inplace=True)
pandas.plotting.scatter_matrix(data[['NUMPOINTS', 'NUMfacilit', 'mediaanitu', "hi_edu"]])


In [None]:
model = ols("NUMPOINTS ~ NUMfacilit", data).fit()
print(model.summary())

In [None]:
seaborn.pairplot(data, vars=['NUMPOINTS', 'NUMfacilit'],kind='reg')
seaborn.lmplot(y='NUMPOINTS', x='NUMfacilit', data=data)

In [None]:
stdres = pd.DataFrame(model.resid_pearson)
plt.plot(stdres, 'o', ls = 'None')
l = plt.axhline(y=0, color = 'r')