# Process MX data to count number of tweets per municipality

Processes and cleans the data and then computes tweet statistics (nr of tweets, users etc.) for each Mexican municipality

CREATES:
- ID file for all included tweets/users with link to muncipality of tweet and user
- Tweet level data file
- Muncipality level summary statistics for nr of tweets, users etc.

In [1]:
################################################################################
# Import modules
################################################################################
import os
import dask
import dask.bag as db
from dask.diagnostics import ProgressBar
import dask.dataframe as dd
import swifter

import pandas as pd
import numpy as np
import statistics as st

import json
import pprint
import geopandas
import glob

import time
import datetime
import pytz



In [2]:
start = time.time()

In [3]:
# Set utilities
pp = pprint.PrettyPrinter(indent=4)
p_print = pp.pprint
pd.options.display.max_colwidth = 100

In [4]:
################################################################################
# Specify paths
################################################################################
os.chdir("../..")
mode = "full" # sample
if mode == "full":
    input_path = "PATH TO FOLDER"
    output_path = "PATH TO FOLDER"
if mode == "sample":
    input_path = "PATH TO FOLDER"
    output_path = "PATH TO FOLDER"
geo_path = "PATH TO FOLDER"

In [5]:
hours = len(glob.glob(input_path))
print(f"Tweets for {hours} hours = {round(hours/24, 1)} days = {round(hours/24/30,1)} months!")

Tweets for 1489 hours = 62.0 days = 2.1 months!


## 1. Create dask dataframe

In [6]:
tweets = db.read_text(input_path) \
            .filter(lambda x: x != '\n') \
            .filter(lambda x: "created_at" in x) \
            .map(lambda x: x.replace("\n", "")) \
            .map(json.loads)

tweet_info = tweets.map(lambda x: [x["id_str"],
                                   x["created_at"],
                                   x["extended_tweet"]["full_text"] if "extended_tweet" in x else x["text"],
                                   x["user"]["id_str"],
                                   x["user"]["followers_count"],
                                   x["user"]["friends_count"],
                                   x["user"]["statuses_count"],
                                   x["user"]["verified"],
                                   x["user"]["description"],
                                   x["user"]["created_at"],
                                   x["place"]["country_code"] if x["place"]!= None else None,
                                   x["coordinates"]["coordinates"] if x["coordinates"] != None else None,
                                   x["place"]["place_type"] if x["place"] != None else None,
                                   x["place"]["name"] if x["place"] != None else None,
                                   x["place"]["bounding_box"]["coordinates"][0] if x["place"] != None else None,
                                   x["is_quote_status"],
                                   x["user"]["is_translator"],
                                   x["source"],
                                   x["lang"]
                                  ])


tweet_dd = tweet_info.to_dataframe(columns = ["id",
                                              "created_at",
                                              "tweet",
                                              "user_id",
                                              "user_followers",
                                              "user_friends",
                                              "user_statuses",
                                              "user_verified",
                                              "user_description",
                                              "user_created_at",
                                              "country_code",
                                              "coordinates",
                                              "place_type",
                                              "place_name",
                                              "bounding_box",
                                              "is_quote_status",
                                              "is_translator",
                                              "source",
                                              "language"
                                             ])



# Keep only English tweets in the US (and drop translators)
tweet_dd = tweet_dd[(tweet_dd["country_code"] == "MX") & (tweet_dd["language"] == "es")]
tweet_dd = tweet_dd[tweet_dd["is_translator"]==False]

tweet_dd = tweet_dd.drop(["is_translator", "country_code", "language"], axis=1)


## 2. Inspect, clean and prepare georeferenced data

### 2.1 Clean and inspect geodata of tweets

In [7]:
# Inspect place types
place_types = tweet_dd.place_type.value_counts().compute(scheduler='processes')
print(place_types)

city            2700041
poi               43710
admin             24727
neighborhood       8264
country            7909
Name: place_type, dtype: int64


In [8]:
# Check number of tweets with exact coordinates
tweet_dd["coordinates"].notnull().sum().compute(scheduler='processes')

135399

In [9]:
# Check if admin is state or municipality
tweet_dd.loc[tweet_dd.place_type == "admin", "place_name"].head(20) # = state

5739               México
7122              Yucatán
9857               México
11495        Quintana Roo
12082        Quintana Roo
13925              México
21713    Distrito Federal
22517    Distrito Federal
25866        Quintana Roo
26076        Quintana Roo
27716        Quintana Roo
29360        Quintana Roo
29995        Quintana Roo
30609    Distrito Federal
31996    Distrito Federal
34973              México
35190        Quintana Roo
36821              México
37152    Distrito Federal
37456        Quintana Roo
Name: place_name, dtype: object

In [10]:
# Drop country and state level data
tweet_dd = tweet_dd[(tweet_dd["place_type"].isin(["country", "admin"]) == False) |
                    tweet_dd["coordinates"].notnull()]
tweet_dd.loc[tweet_dd["coordinates"].isnull(), "place_type"].value_counts().compute(scheduler='processes')

city            2565057
poi               43710
neighborhood       8242
Name: place_type, dtype: int64

In [11]:
# Compute center of coordinate bounding box
def get_center_coords(x):

    if x != None:
        long1 = max(row[0] for row in x)
        long2 = min(row[0] for row in x)
        lat1 = max(row[1] for row in x)
        lat2 = min(row[1] for row in x)

        long_cent = (long1 + long2)/2
        lat_cent = (lat1 + lat2)/2
        return [long_cent, lat_cent]

    else:
        return [None, None] # Return None when no coordinate box available

In [12]:
tweet_dd["approx_coords"] = tweet_dd["bounding_box"].apply(get_center_coords, meta=list)

In [13]:
# Get best coordinates: exact coordinates if available, else center of bounding box
tweet_dd["best_coords"] = tweet_dd.apply(lambda x: x["coordinates"] if x["coordinates"] != None
                                        else x["approx_coords"], axis=1)
tweet_dd.head()

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



Unnamed: 0,id,created_at,tweet,user_id,user_followers,user_friends,user_statuses,user_verified,user_description,user_created_at,coordinates,place_type,place_name,bounding_box,is_quote_status,source,approx_coords,best_coords
2,1421591477897945088,Sat Jul 31 21:59:54 +0000 2021,"Una gran experiencia en estas sesiones de fotos en la mágica Santorini, Grecia 🇬🇷 https://t.co/p...",1394205255076823042,361,852,2955,False,"Reflexiones, Poemas, Paisajes, ciudades, lo cotidiano de la vida, naturaleza #CDMX #SéFeliz sígu...",Mon May 17 08:17:13 +0000 2021,,city,José Azueta,"[[-101.714639, 17.540426], [-101.714639, 18.060683], [-101.199391, 18.060683], [-101.199391, 17....",False,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","[-101.45701500000001, 17.8005545]","[-101.45701500000001, 17.8005545]"
28,1421591486269960196,Sat Jul 31 21:59:56 +0000 2021,@iCamilxx @armandofo90 Con gusto siempre que necesites hablar … aquí me encuentras! #abrazote,1086511899062226944,256,199,5369,False,"Actor, Médico (Urgencias-Critica-Hiperbárica), loco, patético, cantante, soñador y medio fácil! ...",Sat Jan 19 06:33:17 +0000 2019,,city,Ecatepec de Morelos,"[[-99.118665, 19.484399], [-99.118665, 19.652161], [-98.970224, 19.652161], [-98.970224, 19.4843...",False,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","[-99.0444445, 19.56828]","[-99.0444445, 19.56828]"
30,1421591486118903810,Sat Jul 31 21:59:56 +0000 2021,Buen provecho https://t.co/XO9bN3CZEt,1457811133,93,101,6012,False,Live fast die young be wild and have fun.,Sat May 25 18:58:41 +0000 2013,,poi,Contramar,"[[-99.167145, 19.419785], [-99.167145, 19.419785], [-99.167145, 19.419785], [-99.167145, 19.4197...",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-99.167145, 19.419785]","[-99.167145, 19.419785]"
37,1421591489608617986,Sat Jul 31 21:59:57 +0000 2021,Gordas con su Apple Watch que sólo les sirve de relog porque en su puta vida han corrido ni 100 ...,222512654,594,1227,14572,False,Aspirante a sibarita!\r\n ...,Fri Dec 03 16:52:34 +0000 2010,,city,Mérida,"[[-89.798197, 20.695104], [-89.798197, 21.186966], [-89.449025, 21.186966], [-89.449025, 20.6951...",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-89.62361100000001, 20.941035]","[-89.62361100000001, 20.941035]"
39,1421591490619494402,Sat Jul 31 21:59:57 +0000 2021,Jamás dejaría que mi lealtad y respeto me fallarán de algún modo a mi.,132549316,61,202,575,False,¡life is a roaller coaster!,Tue Apr 13 14:59:41 +0000 2010,,city,Xochimilco,"[[-99.160023, 19.152878], [-99.160023, 19.319611], [-99.003557, 19.319611], [-99.003557, 19.1528...",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-99.08179, 19.236244499999998]","[-99.08179, 19.236244499999998]"


In [14]:
# Inspect nr of non-missing observations per column
tweet_dd.count().compute(scheduler='processes')

id                  2752408
created_at          2752408
tweet               2752408
user_id             2752408
user_followers      2752408
user_friends        2752408
user_statuses       2752408
user_verified       2752408
user_description    2397962
user_created_at     2752408
coordinates          135399
place_type          2752408
place_name          2752408
bounding_box        2752408
is_quote_status     2752408
source              2752408
approx_coords       2752408
best_coords         2752408
dtype: int64

### 2.2 Convert tweet data to geodataframe

In [15]:
# Keep only data with coordinates
tweet_dd = tweet_dd[~tweet_dd['best_coords'].isna()]
tweet_dd['long'] = tweet_dd['best_coords'].apply(lambda x: x[0], meta=float)
tweet_dd['lat'] = tweet_dd['best_coords'].apply(lambda x: x[1], meta=float)

In [16]:
# Convert to geo dataframe
df = tweet_dd.compute(scheduler='processes')
df = geopandas.GeoDataFrame(
    tweet_dd.compute(scheduler='processes'), geometry=geopandas.points_from_xy(df.long, df.lat))

In [17]:
# Import shp-file on MX municipalities
muns = geopandas.read_file(geo_path + "/mex_admbnda_adm2_govmex_20210618.shp")
muns.rename({"ADM2_ES" : "NAME",
           "ADM2_PCODE" : "GEOID"}, axis=1, inplace=True)

In [18]:
# Change projection (to make map look better)
df.crs = {'init': 'epsg:4326', 'no_defs': True}
df = df.to_crs("EPSG:5071")

print(muns.crs)
muns = muns.to_crs("EPSG:5071")

epsg:4326


In [19]:
# Mark municipalities where centroid lies in other municipality
muns["centroid_bb"] = muns["geometry"].envelope.centroid
muns["centroid_in_mun"] = muns["geometry"].contains(muns["centroid_bb"] )
muns["centroid_in_mun"].value_counts()
muns.loc[muns["centroid_in_mun"] == False, ["NAME", "ADM1_ES"]].head()

Unnamed: 0,NAME,ADM1_ES
12,Acanceh,Yucatán
20,Acatlán,Puebla
63,Aldama,Chiapas
83,Alvarado,Veracruz de Ignacio de la Llave
84,Álvaro Obregón,Distrito Federal


In [20]:
# Mark municipalities with non-unique names (same names in different states)
muns["name_count"] = muns.groupby("NAME")["NAME"].transform("count")
print(muns["name_count"].value_counts())

1    2232
2     116
3      39
4      32
5      25
7       7
6       6
Name: name_count, dtype: int64


In [21]:
muns.groupby(["ADM1_ES", "NAME"])["NAME"].transform("count").value_counts() # Unique within states

1    2457
Name: NAME, dtype: int64

### 2.3 Join with municipality dataset

In [22]:
# 1. Join with municipality that contains centroid of bounding box (problem if mun level precision and centroid not in mun)
print(len(df))
muns.geometry.name = "mun_polygon"
df = geopandas.sjoin(df, muns[["GEOID", "NAME", "ADM1_ES", "centroid_in_mun", "geometry"]],
                     how='left', op='within')
df.reset_index(drop=True, inplace=True)
df.drop("index_right", inplace=True, axis=1)
print(df.head(4))
print(df.columns)
len(df)

2752408
                    id                      created_at  \
0  1421591477897945088  Sat Jul 31 21:59:54 +0000 2021   
1  1421591486269960196  Sat Jul 31 21:59:56 +0000 2021   
2  1421591486118903810  Sat Jul 31 21:59:56 +0000 2021   
3  1421591489608617986  Sat Jul 31 21:59:57 +0000 2021   

                                                                                                 tweet  \
0  Una gran experiencia en estas sesiones de fotos en la mágica Santorini, Grecia 🇬🇷 https://t.co/p...   
1        @iCamilxx @armandofo90 Con gusto siempre que necesites hablar … aquí me encuentras! #abrazote   
2                                                                Buen provecho https://t.co/XO9bN3CZEt   
3  Gordas con su Apple Watch que sólo les sirve de relog porque en su puta vida han corrido ni 100 ...   

               user_id  user_followers  user_friends  user_statuses  \
0  1394205255076823042             361           852           2955   
1  1086511899062226944      

2752408

In [23]:
# Check tweets that were not merged
print(len(df[df["GEOID"].isna()]))
df.loc[df["GEOID"].isna(), ["place_type", "place_name"]].value_counts().head(10) # Mostly coastal places


35531


place_type  place_name  
city        Boca del Río    18364
            Guaymas          6049
            Carmen           5323
            Alvarado         2027
            Huatabampo        705
            Champotón         609
            Escuinapa         415
            San Blas          275
            Tampico Alto      220
            Cozumel           202
dtype: int64

In [24]:
# Rename columns
df.rename({"GEOID": "mun_id_within",
           "NAME": "mun_name_within",
           "ADM1_ES": "state_id_within",
           'centroid_in_mun': "centroid_in_mun_within"}, axis=1, inplace=True)
df.geometry.name = "best_coords"
df.columns

Index(['id', 'created_at', 'tweet', 'user_id', 'user_followers',
       'user_friends', 'user_statuses', 'user_verified', 'user_description',
       'user_created_at', 'coordinates', 'place_type', 'place_name',
       'bounding_box', 'is_quote_status', 'source', 'approx_coords',
       'best_coords', 'long', 'lat', 'geometry', 'mun_id_within',
       'mun_name_within', 'state_id_within', 'centroid_in_mun_within'],
      dtype='object')

In [25]:
# 2. Join with municipality with closest bounding box centroid (only correct if municipality level precision)
muns = muns.set_geometry('centroid_bb')


In [26]:
print(len(df))
df = geopandas.sjoin_nearest(df, muns[["GEOID",
                                       "NAME",
                                       "ADM1_ES",
                                       "centroid_in_mun",
                                       "centroid_bb"]].set_geometry('centroid_bb'))
df.reset_index(drop=True, inplace=True)
print(df.head(4))
len(df)

2752408
                    id                      created_at  \
0  1421591477897945088  Sat Jul 31 21:59:54 +0000 2021   
1  1421592908705058818  Sat Jul 31 22:05:36 +0000 2021   
2  1421599302762176515  Sat Jul 31 22:31:00 +0000 2021   
3  1421601815422902272  Sat Jul 31 22:40:59 +0000 2021   

                                                                                                 tweet  \
0  Una gran experiencia en estas sesiones de fotos en la mágica Santorini, Grecia 🇬🇷 https://t.co/p...   
1             Un pequeño recorrido por la majestuosa Ciudad de México #CDMX 🇲🇽 https://t.co/YERY5SPYFq   
2  @fab_vazquez @CamMttz Que se vuelva a hacer la prueba, que se atienda con un especialista y guar...   
3                                        Nunca te quedes con las ganas de nada https://t.co/wNWheL6Mo0   

               user_id  user_followers  user_friends  user_statuses  \
0  1394205255076823042             361           852           2955   
1  1394205255076823042      

2752408

In [27]:
df.columns

Index(['id', 'created_at', 'tweet', 'user_id', 'user_followers',
       'user_friends', 'user_statuses', 'user_verified', 'user_description',
       'user_created_at', 'coordinates', 'place_type', 'place_name',
       'bounding_box', 'is_quote_status', 'source', 'approx_coords',
       'best_coords', 'long', 'lat', 'geometry', 'mun_id_within',
       'mun_name_within', 'state_id_within', 'centroid_in_mun_within',
       'index_right', 'GEOID', 'NAME', 'ADM1_ES', 'centroid_in_mun'],
      dtype='object')

In [28]:
# Rename columns
df.rename({"GEOID": "mun_id_nearest",
           "NAME": "mun_name_nearest",
           "ADM1_ES": "state_id_nearest",
           'centroid_in_mun': "centroid_in_mun_nearest"}, axis=1, inplace=True)
df.columns

Index(['id', 'created_at', 'tweet', 'user_id', 'user_followers',
       'user_friends', 'user_statuses', 'user_verified', 'user_description',
       'user_created_at', 'coordinates', 'place_type', 'place_name',
       'bounding_box', 'is_quote_status', 'source', 'approx_coords',
       'best_coords', 'long', 'lat', 'geometry', 'mun_id_within',
       'mun_name_within', 'state_id_within', 'centroid_in_mun_within',
       'index_right', 'mun_id_nearest', 'mun_name_nearest', 'state_id_nearest',
       'centroid_in_mun_nearest'],
      dtype='object')

In [29]:
# Check and clean place types
df.loc[df["coordinates"].notnull(), "place_type"] = "exact"
df.loc[df["coordinates"].notnull(), "place_name"] = np.nan
print(df.place_type.value_counts())
df.place_type.isna().sum() # Should be 0

city            2565057
exact            135399
poi               43710
neighborhood       8242
Name: place_type, dtype: int64


0

In [30]:
# Check consistency of within matches
df["within_success"] = (df["mun_name_within"] == df["place_name"]) | (df["place_type"] != "city") & (df["mun_name_within"].notna())
df.loc[df["within_success"] == False, ["place_name", "mun_name_within"]].value_counts(dropna=False)


place_name                           mun_name_within       
Gustavo A. Madero                    Tlalnepantla de Baz       44519
Álvaro Obregón                       La Magdalena Contreras    40362
Tijuana                              Playas de Rosarito        29530
Torreón                              Viesca                    25044
Chihuahua                            Aldama                    20677
                                                               ...  
Playa Bonfil                         NaN                           1
Playa De Las Gaviotas                NaN                           1
Playa De Tuxpan                      NaN                           1
Playa Del Carmen Eats & Drinks       NaN                           1
Parque Nacional Lagunas De Chacahua  NaN                           1
Length: 382, dtype: int64

In [31]:
# Check consistency of nearest matches
df["nearest_success"] = (((df["mun_name_nearest"] == df["place_name"]) & (df["place_type"] == "city")) |
                         (df["mun_name_within"].isna()) & (df["place_type"] != "city"))
df["success"] = df["within_success"] | df["nearest_success"]
df.loc[df["success"] == False,
       ["place_name", "mun_name_nearest", "mun_name_within"]].value_counts(dropna=False)


place_name                              mun_name_nearest                          mun_name_within                         
Tlaquepaque                             San Pedro Tlaquepaque                     San Pedro Tlaquepaque                       9915
Guaymas                                 Empalme                                   NaN                                         6048
Silao                                   Silao de la Victoria                      Silao de la Victoria                        1598
José Azueta                             Zihuatanejo de Azueta                     Zihuatanejo de Azueta                       1409
San Pedro Mixtepec-Distrito 22          San Pedro Mixtepec -Dto. 22 -             San Pedro Mixtepec -Dto. 22 -                722
Teoloyucán                              Teoloyucan                                Teoloyucan                                   651
Champotón                               Tenabo                                    NaN      

In [32]:
# Check if inconsistent but within and nearest select same municipality
df.loc[(df["success"] == False) & (df["mun_name_within"]==df["mun_name_nearest"]),
       ["place_name",  "mun_name_nearest", "mun_name_within",
        "state_id_nearest", "state_id_within"]].value_counts(dropna=False)


place_name                              mun_name_nearest                          mun_name_within                           state_id_nearest                 state_id_within                
Tlaquepaque                             San Pedro Tlaquepaque                     San Pedro Tlaquepaque                     Jalisco                          Jalisco                            9915
Silao                                   Silao de la Victoria                      Silao de la Victoria                      Guanajuato                       Guanajuato                         1598
José Azueta                             Zihuatanejo de Azueta                     Zihuatanejo de Azueta                     Guerrero                         Guerrero                           1409
San Pedro Mixtepec-Distrito 22          San Pedro Mixtepec -Dto. 22 -             San Pedro Mixtepec -Dto. 22 -             Oaxaca                           Oaxaca                              722
Teoloyucán             

In [33]:
# Check if inconsistent and within and nearest select different municipality
df.loc[(df["success"] == False) & (df["mun_name_within"]!=df["mun_name_nearest"]),
       ["place_name", "mun_name_nearest", "mun_name_within"]].value_counts(dropna=False)


place_name  mun_name_nearest  mun_name_within
Guaymas     Empalme           NaN                6048
Champotón   Tenabo            NaN                 609
dtype: int64

In [34]:
# Generate final merge ID and name
df["GEOID"] = df.apply(lambda x:
                       x["mun_id_within"] if x["within_success"] == True
                       else (x["mun_id_nearest"] if x["nearest_success"] == True
                       else (x["mun_id_nearest"] if x["mun_id_nearest"] == x["mun_id_within"]
                             else np.nan)),
                       axis=1)

df["NAME"] = df.apply(lambda x:
                       x["mun_name_within"] if x["within_success"] == True
                       else (x["mun_name_nearest"] if x["nearest_success"] == True
                       else (x["mun_name_nearest"] if (x["mun_id_nearest"] == x["mun_id_within"])
                             else np.nan)),
                       axis=1)



In [35]:
# Inspect cases that were not merged
df.loc[df["GEOID"].isna(), ["GEOID", "mun_name_within", "mun_name_nearest"]].value_counts(dropna=False)

GEOID  mun_name_within  mun_name_nearest
NaN    NaN              Empalme             6048
                        Tenabo               609
dtype: int64

In [36]:
# Merge over name (and state) if not merged
df = df.merge(muns[["NAME", "ADM1_ES", "GEOID"]].rename({"NAME": "mun_name_merge",
                                                   "GEOID": "mun_id_merge"}, axis=1),
                                  how="left", left_on = ["place_name", "state_id_nearest"],
                                  right_on= ["mun_name_merge", "ADM1_ES"]).drop("ADM1_ES", axis=1)


In [37]:
# Fill missings with merged IDs
df.loc[df["GEOID"].isna(), "GEOID"] = df.loc[df["GEOID"].isna(), "mun_id_merge"]
df.loc[df["GEOID"].isna(), "NAME"] = df.loc[df["GEOID"].isna(), "mun_name_merge"]

In [38]:
# Manually correct wrongly assigned cases:
print(muns.loc[muns["NAME"] == "Axapusco", "GEOID"])

df.loc[(df["place_name"] =="Axapusco") & (df["success"] == False), "NAME"] = "Axapusco"
df.loc[(df["place_name"] =="Axapusco") &
       (df["success"] == False), "GEOID"] = "MX15016"

187    MX15016
Name: GEOID, dtype: object


In [39]:
df["GEOID"].isna().sum() # Should be 0


0

In [40]:
df.drop(df.loc[:,"mun_id_within":"success"], axis=1, inplace=True)
df.drop(df.loc[:,["mun_name_merge", "mun_id_merge", "coordinates", "best_coords"]], axis=1, inplace=True)
df.columns

Index(['id', 'created_at', 'tweet', 'user_id', 'user_followers',
       'user_friends', 'user_statuses', 'user_verified', 'user_description',
       'user_created_at', 'place_type', 'place_name', 'bounding_box',
       'is_quote_status', 'source', 'approx_coords', 'long', 'lat', 'geometry',
       'GEOID', 'NAME'],
      dtype='object')

In [41]:
# Inspect duplicates
df["dupl_id"] = df.duplicated(subset='id', keep=False)
print(df.dupl_id.sum())
dupls = df[df["dupl_id"]].sort_values(by=["user_id", "id"])
dupls.head(6)

1258


Unnamed: 0,id,created_at,tweet,user_id,user_followers,user_friends,user_statuses,user_verified,user_description,user_created_at,...,bounding_box,is_quote_status,source,approx_coords,long,lat,geometry,GEOID,NAME,dupl_id
497280,1431274053462016000,Fri Aug 27 15:15:00 +0000 2021,M A M E S! 😶\nOjalá todas las pilotos estén bien.\n🥺,100034845,1316,1906,53640,False,Un iluso más que sueña que ya no le debe al SAT.,Mon Dec 28 20:04:28 +0000 2009,...,"[[-99.259495, 19.386371], [-99.259495, 19.473498], [-99.163526, 19.473498], [-99.163526, 19.3863...",True,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","[-99.2115105, 19.4299345]",-99.211511,19.429935,POINT (-348486.851 -379362.716),MX09016,Miguel Hidalgo,True
497282,1431274053462016000,Fri Aug 27 15:15:00 +0000 2021,M A M E S! 😶\nOjalá todas las pilotos estén bien.\n🥺,100034845,1316,1906,53640,False,Un iluso más que sueña que ya no le debe al SAT.,Mon Dec 28 20:04:28 +0000 2009,...,"[[-99.259495, 19.386371], [-99.259495, 19.473498], [-99.163526, 19.473498], [-99.163526, 19.3863...",True,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","[-99.2115105, 19.4299345]",-99.211511,19.429935,POINT (-348486.851 -379362.716),MX09016,Miguel Hidalgo,True
692104,1428374119859593221,Thu Aug 19 15:11:42 +0000 2021,"¡Malos días! Amaneci enojado por culpa de las pendejas chivititas, quw además de frustrarme por ...",1002350288538349568,265,422,5162,False,"Lo bueno no es tener poder, sino tener el teléfono de quién lo tiene..😼\n\n🇲🇽🇺🇸🇨🇱🇩🇴🇨🇦🇫🇷🇦🇪🇨🇴🔜",Fri Jun 01 00:45:06 +0000 2018,...,"[[-103.408633, 20.600585], [-103.408633, 20.752719], [-103.263131, 20.752719], [-103.263131, 20....",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-103.335882, 20.676651999999997]",-103.335882,20.676652,POINT (-785065.225 -221037.261),MX14039,Guadalajara,True
692105,1428374119859593221,Thu Aug 19 15:11:42 +0000 2021,"¡Malos días! Amaneci enojado por culpa de las pendejas chivititas, quw además de frustrarme por ...",1002350288538349568,265,422,5162,False,"Lo bueno no es tener poder, sino tener el teléfono de quién lo tiene..😼\n\n🇲🇽🇺🇸🇨🇱🇩🇴🇨🇦🇫🇷🇦🇪🇨🇴🔜",Fri Jun 01 00:45:06 +0000 2018,...,"[[-103.408633, 20.600585], [-103.408633, 20.752719], [-103.263131, 20.752719], [-103.263131, 20....",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-103.335882, 20.676651999999997]",-103.335882,20.676652,POINT (-785065.225 -221037.261),MX14039,Guadalajara,True
692106,1428374119859593221,Thu Aug 19 15:11:42 +0000 2021,"¡Malos días! Amaneci enojado por culpa de las pendejas chivititas, quw además de frustrarme por ...",1002350288538349568,265,422,5162,False,"Lo bueno no es tener poder, sino tener el teléfono de quién lo tiene..😼\n\n🇲🇽🇺🇸🇨🇱🇩🇴🇨🇦🇫🇷🇦🇪🇨🇴🔜",Fri Jun 01 00:45:06 +0000 2018,...,"[[-103.408633, 20.600585], [-103.408633, 20.752719], [-103.263131, 20.752719], [-103.263131, 20....",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-103.335882, 20.676651999999997]",-103.335882,20.676652,POINT (-785065.225 -221037.261),MX14039,Guadalajara,True
1487821,1427291502682640388,Mon Aug 16 15:29:46 +0000 2021,@KarlaGzz98 @dianamrls99 Se me acaba de caer un ídolo u.u,1006751629205389312,52,245,3424,False,"22🌚// ""Decidir es renunciar""",Wed Jun 13 04:14:27 +0000 2018,...,"[[-100.421037, 25.480538], [-100.421037, 25.802899], [-100.166146, 25.802899], [-100.166146, 25....",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-100.29359149999999, 25.6417185]",-100.293591,25.641718,POINT (-435408.715 298257.716),MX19039,Monterrey,True


In [42]:
# Drop duplicates
print(len(df))
df.drop_duplicates(subset='id', keep='first', inplace=True)
df.drop(["dupl_id"], axis=1, inplace=True )
print(len(df))


2752408
2751705


## 3. Exlude bots


### 3.1 Inspect source of tweets

In [43]:
df.source.value_counts().head(10)

<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>                     1720860
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>                        887757
<a href="http://instagram.com" rel="nofollow">Instagram</a>                                               116650
<a href="http://foursquare.com" rel="nofollow">Foursquare</a>                                              11234
<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>                           6544
<a href="http://www.elsiglodetorreon.com.mx/" rel="nofollow">El Siglo</a>                                   3249
<a href="http://tapbots.com/tweetbot" rel="nofollow">Tweetbot for iΟS</a>                                   1152
<a href="https://www.swarmapp.com" rel="nofollow">Foursquare Swarm</a>                                      1126
<a href="http://www.siglo.mx" rel="nofollow">Siglo Coahuila</a>                                 

In [44]:
# Inspect tweets from instagram
df[df.source == """<a href="http://instagram.com" rel="nofollow">Instagram</a>"""].head() # Does not look like bots

Unnamed: 0,id,created_at,tweet,user_id,user_followers,user_friends,user_statuses,user_verified,user_description,user_created_at,...,place_name,bounding_box,is_quote_status,source,approx_coords,long,lat,geometry,GEOID,NAME
7,1421636804302852097,Sun Aug 01 01:00:01 +0000 2021,Acaba de publicar una foto en Ixtapa Zihuatanejo https://t.co/48HLKFBBD1,2605305287,327,285,6739,False,"If the hurt comes, so will the happiness... be patient.",Fri Jun 13 03:19:02 +0000 2014,...,,"[[-101.714639, 17.540426], [-101.714639, 18.060683], [-101.199391, 18.060683], [-101.199391, 17....",False,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>","[-101.45701500000001, 17.8005545]",-101.60113,17.66184,POINT (-618664.257 -555652.693),MX12038,Zihuatanejo de Azueta
11,1421817009269100556,Sun Aug 01 12:56:05 +0000 2021,"Amaneciendo en Agosto. Que hermoso día 🙂 en Zihuatanejo, Guerrero https://t.co/LZcCHhUNPs",474255835,47,47,458,False,freeowner,Wed Jan 25 20:20:38 +0000 2012,...,,"[[-101.714639, 17.540426], [-101.714639, 18.060683], [-101.199391, 18.060683], [-101.199391, 17....",False,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>","[-101.45701500000001, 17.8005545]",-101.553603,17.642359,POINT (-613541.631 -558027.161),MX12038,Zihuatanejo de Azueta
16,1421851929437691909,Sun Aug 01 15:14:51 +0000 2021,El paisaje celebrante.\n🤩🌴🍸🍹🌞🎉🎊 en Las Brisas Ixtapa https://t.co/RFsxN6Q64l,588764504,272,111,16323,False,"ZURDO,TURISTÒLOGO, AMANTE DE LA HISTORIA Y LA MÚSICA, ALEGRE, GOURMAND, VIAJERO, DICHARACHERO, R...",Thu May 24 02:39:05 +0000 2012,...,,"[[-101.714639, 17.540426], [-101.714639, 18.060683], [-101.199391, 18.060683], [-101.199391, 17....",False,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>","[-101.45701500000001, 17.8005545]",-101.596853,17.648313,POINT (-618276.970 -557115.373),MX12038,Zihuatanejo de Azueta
22,1421880598507970567,Sun Aug 01 17:08:46 +0000 2021,"Nutrición, claro que sí, arrancando el mes con salud. Y tomando vitaminas para conservarla.\n\n#...",474255835,47,47,459,False,freeowner,Wed Jan 25 20:20:38 +0000 2012,...,,"[[-101.714639, 17.540426], [-101.714639, 18.060683], [-101.199391, 18.060683], [-101.199391, 17....",False,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>","[-101.45701500000001, 17.8005545]",-101.553603,17.642359,POINT (-613541.631 -558027.161),MX12038,Zihuatanejo de Azueta
31,1410672275762946061,Thu Jul 01 18:50:54 +0000 2021,"Desde lo alto. en Zihuatanejo, Guerrero https://t.co/RJXwH8gCxl",170180207,137,195,494,False,Por todos lados.,Sat Jul 24 03:53:38 +0000 2010,...,,"[[-101.714639, 17.540426], [-101.714639, 18.060683], [-101.199391, 18.060683], [-101.199391, 17....",False,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>","[-101.45701500000001, 17.8005545]",-101.553603,17.642359,POINT (-613541.631 -558027.161),MX12038,Zihuatanejo de Azueta


In [45]:
# Add column to identify tweets that are not posted through a third party API (???)
keep_sources = ["""<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>""",
                """<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>""",
                """<a href="http://instagram.com" rel="nofollow">Instagram</a>""",
                """<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>"""]

df["nobot"] = False
df.loc[df["source"].isin(keep_sources), "nobot"] = True
df[df["nobot"]].sample(10)

Unnamed: 0,id,created_at,tweet,user_id,user_followers,user_friends,user_statuses,user_verified,user_description,user_created_at,...,bounding_box,is_quote_status,source,approx_coords,long,lat,geometry,GEOID,NAME,nobot
1928481,1411451908737413120,Sat Jul 03 22:28:53 +0000 2021,@JLozanoA @LaCronicaDeHoy Para volverse locos,562661211,162,660,43064,False,,Wed Apr 25 04:08:47 +0000 2012,...,"[[-99.324375, 19.232228], [-99.324375, 19.403856], [-99.171644, 19.403856], [-99.171644, 19.2322...",False,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","[-99.2480095, 19.318042]",-99.248009,19.318042,POINT (-352855.335 -391203.357),MX09010,Álvaro Obregón,True
505341,1432536903790776322,Tue Aug 31 02:53:07 +0000 2021,Jajajajajajaja,1193898767180808193,752,1577,22809,False,Chileno en México 🇲🇽 • Tengo una banda de Rock 👉🏽 @amenazacl • #PorLaChucha #tupananchiskama,Mon Nov 11 14:30:35 +0000 2019,...,"[[-99.259495, 19.386371], [-99.259495, 19.473498], [-99.163526, 19.473498], [-99.163526, 19.3863...",True,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","[-99.2115105, 19.4299345]",-99.211511,19.429935,POINT (-348486.851 -379362.716),MX09016,Miguel Hidalgo,True
2744860,1416776192322064388,Sun Jul 18 15:05:41 +0000 2021,Confirmo queda prohibido decir vamos a dar la vuela.\nFechas Disponibles \nContacto: djnoesantos...,62951222,290,1068,1044,False,Tour Beat House ® 2k21 🎧🇲🇽🇱🇧\nContacto: djnoesantos@gmail.com Press Kit: http://goo.gl/T8WAUV,Tue Aug 04 22:38:27 +0000 2009,...,"[[-98.631598, 20.100414], [-98.631598, 20.349953], [-98.459922, 20.349953], [-98.459922, 20.1004...",False,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>","[-98.54576, 20.2251835]",-98.576389,20.203611,POINT (-277337.457 -298482.007),MX13024,Huasca de Ocampo,True
1261023,1427639426293829632,Tue Aug 17 14:32:18 +0000 2021,@cocina_facil_mx SI me gusta esta pasta es muy sabrosa muy fácil de preparar y sobretodo económica,1264784610682175488,133,112,4978,False,,Mon May 25 05:09:02 +0000 2020,...,"[[-99.127265, 19.589845], [-99.127265, 19.662165], [-99.053342, 19.662165], [-99.053342, 19.5898...",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-99.0903035, 19.626005]",-99.090304,19.626005,POINT (-334656.076 -358799.155),MX15020,Coacalco de Berriozábal,True
1219831,1412549336710451205,Tue Jul 06 23:09:40 +0000 2021,"@angiocito @PediatraConTOC Ayyyy no, Soñadoras es juvenil... infantil creo que podria ser “Carus...",83514124,5886,768,24143,False,"Mexicano, Jarocho, Infectólogo Pediatra, Barbón, Despistado, Cosecha 1986, Coatza-CDMX 😷🏳️‍🌈#Cro...",Mon Oct 19 03:35:15 +0000 2009,...,"[[-99.221136, 19.456367], [-99.221136, 19.5152], [-99.143116, 19.5152], [-99.143116, 19.456367]]",False,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>","[-99.18212600000001, 19.4857835]",-99.182126,19.485784,POINT (-345099.186 -373490.427),MX09002,Azcapotzalco,True
764667,1414086754458501122,Sun Jul 11 04:58:49 +0000 2021,"Este es el porcentaje reportado al día de hoy, en este país, así que aguas porque la tercera ola...",72190226,4816,1972,16989,False,"Esposo, padre, médico urgenciologo, alta especialidad en medicina de reanimación. Opiniones expr...",Mon Sep 07 03:06:23 +0000 2009,...,"[[-99.191996, 19.357102], [-99.191996, 19.404124], [-99.130965, 19.404124], [-99.130965, 19.3571...",False,"<a href=""http://twitter.com/#!/download/ipad"" rel=""nofollow"">Twitter for iPad</a>","[-99.16148050000001, 19.380613]",-99.161481,19.380613,POINT (-343235.722 -384824.245),MX09014,Benito Juárez,True
75637,1427040590856916994,Sun Aug 15 22:52:44 +0000 2021,@manunisima Estoy llorando tanto que con esa agua voy a ponerme a lavar trastes mañana,79315537,191,1175,5957,False,🇲🇽ÉL 🇫🇷IL 🇺🇲HE \nGastrónomo y Actor \n🏳️‍🌈🏳️‍🌈🏳️‍🌈\ncocino para los demás y\ncocino ideas para ...,Fri Oct 02 23:23:56 +0000 2009,...,"[[-99.18435, 19.399835], [-99.18435, 19.465837], [-99.122382, 19.465837], [-99.122382, 19.399835]]",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-99.153366, 19.432836]",-99.153366,19.432836,POINT (-342169.512 -379263.383),MX09015,Cuauhtémoc,True
839536,1422020166930550787,Mon Aug 02 02:23:22 +0000 2021,@TUDNMEX #YaSeArmó cuando faltan 30 días para los preparativos de mis padres porque cumplirán 30...,1414024001677107202,63,2273,2492,False,"Soy Estudiante de Comunicación. Aficionado del @clubamerica. Mi otra cuenta de Twitter, @AndiCis...",Sun Jul 11 00:49:41 +0000 2021,...,"[[-92.465201, 14.618977], [-92.465201, 15.222013], [-92.153698, 15.222013], [-92.153698, 14.6189...",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-92.3094495, 14.920494999999999]",-92.309449,14.920495,POINT (419001.924 -855111.219),MX07089,Tapachula,True
559632,1413171609234464768,Thu Jul 08 16:22:21 +0000 2021,@locatel_mx hola ¿Qué pasa si no puedo presentarme a mi vacuna el día que me toca? ¿Qué debo hac...,2185675572,671,4527,4585,False,Nada es incuestionable,Sun Nov 10 05:31:38 +0000 2013,...,"[[-99.364536, 19.232313], [-99.364536, 19.405081], [-99.246625, 19.405081], [-99.246625, 19.2323...",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-99.30558049999999, 19.318697]",-99.30558,19.318697,POINT (-359104.760 -390917.622),MX09004,Cuajimalpa de Morelos,True
513029,1432885694117171201,Wed Sep 01 01:59:05 +0000 2021,"@Edyamrdz Te faltaron los pulques que están del lado de mi Tomatlán tan querido pero sí, es una ...",1458070657,425,362,24392,False,Una existencia constantemente transfigurada por el fracaso.,Sat May 25 21:06:40 +0000 2013,...,"[[-99.140238, 19.284692], [-99.140238, 19.400885], [-98.960684, 19.400885], [-98.960684, 19.2846...",False,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>","[-99.050461, 19.342788499999997]",-99.050461,19.342788,POINT (-331316.756 -389266.392),MX09007,Iztapalapa,True


In [46]:
df["nobot"].value_counts()

True     2731811
False      19894
Name: nobot, dtype: int64

### 3.2. Inspect users with high number of statuses

In [47]:
df.groupby("nobot")["user_statuses"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
nobot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,19894.0,183680.869458,298348.003316,1.0,12457.25,35349.5,159480.0,847680.0
True,2731811.0,24458.644895,66997.915329,1.0,2154.0,8560.0,25547.0,1863973.0


In [48]:
df.loc[(df["user_statuses"] > 200000) & (df["nobot"]),
             ["NAME", "tweet", "user_id", "user_description", "source"]].drop_duplicates(subset="user_id").head(10) # Does not look like bots

Unnamed: 0,NAME,tweet,user_id,user_description,source
12,Zihuatanejo de Azueta,Aló Twitterland,49227540,"Gato, costeña, periodista, activista, obradorista, artista, guerrerense. Amo el café. Enamorada ...","<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"
1701,Ecatepec de Morelos,Dios los hace y los\nHijos de PUTLA\nSe juntaron para Robar https://t.co/ZfQSSyDukN,2281641013,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"
1875,Ecatepec de Morelos,@espnsutcliffe Medalla para Alexa Moreno?,95327125,We do what we want. We don't care what anyone else thinks!!!\r\n\r\nCliff Burton,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"
1879,Ecatepec de Morelos,@neria_ramos Amén 😇,163552083,"Creo en un México de bienestar para todos, es posible ¡Sí...todos participamos ! 🙏😇😊🇲🇽❤️","<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"
2785,Ecatepec de Morelos,Just posted a photo @ Los Bisquets Obregón Las Américas https://t.co/cv73S8xq11,74934662,"•Si no sabes cómo seguir; Improvisa• || Da igual cómo te vean. Tú, ¿te ves? || ℒ𝑜𝓈 𝒶𝒸𝒸𝑒𝓈𝑜𝓇𝒾𝑜𝓈 𝓈𝑒...","<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>"
4613,Ecatepec de Morelos,"@elchairoAMLO Buen día, provecho ☕",93085761,"Economista experto en planeación, operación y evaluación de proyectos.\nPendejeo gratis a quien ...","<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"
20180,Ecatepec de Morelos,@GinaHdezMD 😱😱😱😱 ay si!!! Veñ te enseño 🙈🙈,64991514,Lord Sith | Investigador de tiempo completo del fenómeno OVNI 🛸 | Próximo maestro taquero 🌮 | ma...,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>"
23175,Cuauhtémoc,"Ganar y romper un récord es increíble, pero la manera en que #TatjanaSchoemaker celebra lo es to...",95294154,"Stivi De TiVi (Enrique Solorzano). Mi vida es la televisión y el cine. Me encuentras en radio, T...","<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>"
23323,Miguel Hidalgo,La de hoy #SomosÁguilas https://t.co/pPICuWV0AS,4041161,"¿? y aprendiz de todo, oficial de nada. Un humano más y creo que estaré estudiando toda la vida....","<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"
23552,Cuauhtémoc,"Cualquier lugar que ponga a Zoé, tiene mi aprobación 😾 ™",2427984810,Transfer serigráfico sobre textiles * Transfer Orgánico sobre vidrio. Mezcal El Rogón 🌿 #HCongre...,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"


### 3.3. Exclude bots using source of tweets

In [49]:
# Keep only: iPhone, Android, Instagram, iPad
df = df[df["nobot"]]
df.drop("nobot", axis=1, inplace=True)

## 4. Compute local time for each tweet

In [50]:
# Inspect format (UCT time)
df["created_at"].head()

0    Sat Jul 31 21:59:54 +0000 2021
1    Sat Jul 31 22:05:36 +0000 2021
2    Sat Jul 31 22:31:00 +0000 2021
3    Sat Jul 31 22:40:59 +0000 2021
4    Sat Jul 31 22:53:44 +0000 2021
Name: created_at, dtype: object

In [51]:
# Compute time zone based on coordinates
from tzwhere import tzwhere # Not sure why, but needs to be imported here
tzwhere = tzwhere.tzwhere(forceTZ=True)
df["timezone"] = df.apply(lambda x: tzwhere.tzNameAt(x["lat"], x["long"], forceTZ=True), axis=1)
print(sum(df["timezone"].isna())) # Should be none
df["timezone"].value_counts().head()

  self.timezoneNamesToPolygons[tzname] = WRAP(polys)
  self.unprepTimezoneNamesToPolygons[tzname] = WRAP(polys)


0


America/Mexico_City    1813732
America/Monterrey       425209
America/Mazatlan        103044
America/Merida          102214
America/Hermosillo       90120
Name: timezone, dtype: int64

In [52]:
# Compute local time for each tweet
df["created_at"] = pd.to_datetime(df["created_at"], format = '%a %b %d %H:%M:%S +0000 %Y', utc=True)
def tz_func(x):
    return x.dt.tz_convert(x.name).dt.tz_localize(tz=None)
df["local_time"] = df.groupby("timezone")["created_at"].transform(tz_func)
df[["created_at", "timezone", "local_time"]].head()

Unnamed: 0,created_at,timezone,local_time
0,2021-07-31 21:59:54+00:00,America/Mexico_City,2021-07-31 16:59:54
1,2021-07-31 22:05:36+00:00,America/Mexico_City,2021-07-31 17:05:36
2,2021-07-31 22:31:00+00:00,America/Mexico_City,2021-07-31 17:31:00
3,2021-07-31 22:40:59+00:00,America/Mexico_City,2021-07-31 17:40:59
4,2021-07-31 22:53:44+00:00,America/Mexico_City,2021-07-31 17:53:44


In [53]:
# Create separate variables for day and time
df["hour"] = df["local_time"].dt.hour
df["day"] = df["local_time"].dt.weekday # 0 = Monday, 6 = Sunday
print(df["hour"].value_counts())
print(df["day"].value_counts())

22    173431
21    169916
20    158345
11    154222
12    151466
10    151452
13    149286
19    147597
23    145336
14    143146
15    141314
9     140027
18    138888
16    138252
17    137784
8     120132
0      96539
7      81446
1      54837
6      43370
2      31249
5      25149
3      20252
4      18375
Name: hour, dtype: int64
1    402437
4    402292
3    397697
6    387602
0    385619
2    380157
5    376007
Name: day, dtype: int64


In [54]:
# Define weekdays
df["weekday"] = df["day"] < 5
df["weekday"].value_counts()

True     1968202
False     763609
Name: weekday, dtype: int64

In [55]:
# Distinguish between free time and worktime
df["workhours"] = (df["weekday"]) & (df["hour"] >= 9) & (df["hour"] < 17) # 9-5 (Does this apply for MX?)
df["workhours"].value_counts()


False    1886357
True      845454
Name: workhours, dtype: int64

## 5. Handle users that tweet from different muns

In [56]:
# Inspect nr of muns users tweet from
muns_per_user = df.groupby(["user_id", "GEOID"]).count()
muns_per_user = muns_per_user.groupby("user_id")["long"].count().sort_values(ascending=False)
print(muns_per_user.describe())
muns_per_user.value_counts(normalize=True)

count    129562.000000
mean          1.738341
std           1.567180
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          91.000000
Name: long, dtype: float64


1     0.646964
2     0.192302
3     0.076373
4     0.035944
5     0.019605
6     0.010258
7     0.006128
8     0.004376
9     0.002547
10    0.001567
11    0.001165
12    0.000741
13    0.000502
14    0.000347
15    0.000216
16    0.000162
17    0.000123
21    0.000093
18    0.000077
19    0.000077
20    0.000062
22    0.000054
24    0.000046
23    0.000039
26    0.000031
27    0.000023
28    0.000023
30    0.000023
38    0.000023
33    0.000023
29    0.000015
25    0.000015
31    0.000015
37    0.000008
43    0.000008
34    0.000008
32    0.000008
91    0.000008
Name: long, dtype: float64

In [57]:
# Inspect users who tweet from many municipalities
df["nr_muns"] = df.groupby("user_id")["GEOID"].transform("nunique") # add variable for nr of muns each user tweets from
if mode == "full":
    df.loc[df["nr_muns"] > 5, ["NAME", "tweet", "user_description"]].sample(20)  # Looks like normal users (e.g. truck drivers)

In [58]:
# Get main municipality/ies for each user
df["main_mun_all"] = df.groupby("user_id")["GEOID"].transform(lambda x: ",".join(st.multimode(x)))
df["nr_main_muns"] = df.groupby("user_id")["GEOID"].transform(lambda x: len(st.multimode(x)))
print(df["main_mun_all"].value_counts().head())
print(df["nr_main_muns"].value_counts())
print(df["nr_main_muns"].value_counts(normalize=True))

MX19039    120079
MX09015    103171
MX09014     86753
MX14120     79585
MX14039     78735
Name: main_mun_all, dtype: int64
1    2703487
2      24079
3       3129
4        798
5        228
7         35
6         30
8         16
9          9
Name: nr_main_muns, dtype: int64
1    0.989632
2    0.008814
3    0.001145
4    0.000292
5    0.000083
7    0.000013
6    0.000011
8    0.000006
9    0.000003
Name: nr_main_muns, dtype: float64


In [59]:
# Get main municipality/ies for each user excluding tweets during workhours or weekends (when people might not be at home)
df["mun_home"] = df["GEOID"]*1
df["mun_home"].loc[(df["weekday"] == False) | (df["workhours"] == True)] = np.nan
df["main_mun_home"] = df.groupby("user_id")["mun_home"].transform(lambda x: ",".join(
    st.multimode(x[x.notna()])))

df["nr_home_muns"] = df.groupby("user_id")["mun_home"].transform(lambda x: len(st.multimode(x[x.notna()])))
print(df["main_mun_home"].value_counts().head())
print(df["nr_home_muns"].value_counts())
print(df["nr_home_muns"].value_counts(normalize=True))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mun_home"].loc[(df["weekday"] == False) | (df["workhours"] == True)] = np.nan


MX19039    115371
MX09015     93421
MX09014     85340
MX14120     77648
MX14039     74870
Name: main_mun_home, dtype: int64
1     2622631
0       73048
2       31297
3        3635
4         877
5         136
6          81
12         44
8          36
7          26
Name: nr_home_muns, dtype: int64
1     0.960034
0     0.026740
2     0.011457
3     0.001331
4     0.000321
5     0.000050
6     0.000030
12    0.000016
8     0.000013
7     0.000010
Name: nr_home_muns, dtype: float64


In [60]:
df[["main_mun_all", "main_mun_home", "nr_muns", "nr_home_muns"]].head(20)

Unnamed: 0,main_mun_all,main_mun_home,nr_muns,nr_home_muns
0,MX12038,MX12038,1,1
1,MX12038,MX12038,1,1
2,MX12038,MX12038,1,1
3,MX12038,MX12038,1,1
4,MX12038,MX12038,1,1
5,MX12038,MX12038,1,1
6,MX12038,MX12038,1,1
7,MX12038,MX11020,2,1
8,MX12038,MX12038,3,1
9,MX12038,MX12038,1,1


In [61]:
# Define var "main_mun": main mun if unambiguous, else main home mun (based on time) if unambiguous, else NaN
df.reset_index(drop=True, inplace=True)
df["main_mun"] = df.swifter.apply(lambda x:
                                     x["main_mun_all"] if x["nr_main_muns"] == 1
                                     else (x["main_mun_home"] if x["nr_home_muns"] == 1
                                           else np.nan),
                                     axis=1)
df["main_mun"].head()

Pandas Apply:   0%|          | 0/2731811 [00:00<?, ?it/s]

0    MX12038
1    MX12038
2    MX12038
3    MX12038
4    MX12038
Name: main_mun, dtype: object

In [62]:
# Inspect number of ambiguous cases
print(df["main_mun"].isna().sum())
print(df["main_mun"].isna().mean())

11305
0.0041382804300883185


In [69]:
df[["GEOID", "user_id", "main_mun_all", "main_mun"]].head()

Unnamed: 0,GEOID,user_id,main_mun_all,main_mun
0,MX12038,1394205255076823042,MX12038,MX12038
1,MX12038,1394205255076823042,MX12038,MX12038
2,MX12038,1394205255076823042,MX12038,MX12038
3,MX12038,1394205255076823042,MX12038,MX12038
4,MX12038,1394205255076823042,MX12038,MX12038


In [70]:
# Get mun-user for main mun
print("Total nr of muns: ", len(muns))
print("Nr of muns with at least one tweet: ", len(df["GEOID"].unique()))
print("Nr of muns with at least one tweet as main mun: ", len(df["main_mun"].unique()))

Total nr of muns:  2457
Nr of muns with at least one tweet:  1996
Nr of muns with at least one tweet as main mun:  1727


In [71]:
df.groupby("main_mun")["user_id"].count().describe()

count      1726.000000
mean       1576.191194
std        7665.212300
min           1.000000
25%           6.000000
50%          42.000000
75%         273.000000
max      120427.000000
Name: user_id, dtype: float64

## 6. Finalize and export dataset and ID file



### 6.1. Clean and add columns with additional information

In [72]:
df.columns

Index(['id', 'created_at', 'tweet', 'user_id', 'user_followers',
       'user_friends', 'user_statuses', 'user_verified', 'user_description',
       'user_created_at', 'place_type', 'place_name', 'bounding_box',
       'is_quote_status', 'source', 'approx_coords', 'long', 'lat', 'geometry',
       'GEOID', 'NAME', 'timezone', 'local_time', 'hour', 'day', 'weekday',
       'workhours', 'nr_muns', 'main_mun_all', 'nr_main_muns', 'mun_home',
       'main_mun_home', 'nr_home_muns', 'main_mun'],
      dtype='object')

In [73]:
# Drop redundant columns
df.drop(['main_mun_all',
         'nr_main_muns',
         'mun_home',
         'main_mun_home',
         'nr_home_muns',
         'geometry'],
        inplace=True,
        axis=1)


In [74]:
# mun name and ID
df.rename({"GEOID":"mun_id_tweet",
           "NAME" :"mun_name_tweet",
           "main_mun": "mun_id_user"},
          axis=1, inplace=True)
df = df.merge(muns[["GEOID", "NAME"]],
              left_on="mun_id_user",
              right_on="GEOID",
              how="inner")
df.drop("GEOID", inplace=True, axis=1)
df.rename({"NAME": "mun_name_user"}, axis=1, inplace=True)

In [75]:
# iPhone users
df["iphone"] = df["source"].isin(["""<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>""",
                                  """<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>"""])
df["iphone"].value_counts()

False    1829528
True      890978
Name: iphone, dtype: int64

In [76]:
# Instagram retweets
df["insta"] = df["source"] == """<a href="http://instagram.com" rel="nofollow">Instagram</a>"""
print(df["insta"].value_counts(normalize=True))

# Check Instagram tweets with default retweet text
insta = df[df["source"] == """<a href="http://instagram.com" rel="nofollow">Instagram</a>"""]
print(insta["tweet"].str.startswith("Acaba de publicar una foto").value_counts(normalize=True))

# Exclude Instagram tweets with default retweet text
df = df.loc[~((df["source"] == """<a href="http://instagram.com" rel="nofollow">Instagram</a>""") &
              (df["tweet"].str.startswith("Acaba de publicar una foto"))), :]

False    0.958469
True     0.041531
Name: insta, dtype: float64
False    0.701491
True     0.298509
Name: tweet, dtype: float64


In [77]:
# Length of tweet and user description
df["tweet_length"] = df["tweet"].str.len()
df["user_length"] = df["user_description"].str.len()
print(df["tweet_length"].describe())
print(df["user_length"].describe())

count    2.686779e+06
mean     9.173218e+01
std      7.286075e+01
min      4.000000e+00
25%      4.000000e+01
50%      6.700000e+01
75%      1.180000e+02
max      9.990000e+02
Name: tweet_length, dtype: float64
count    2.340248e+06
mean     8.488444e+01
std      4.926868e+01
min      1.000000e+00
25%      4.100000e+01
50%      8.300000e+01
75%      1.320000e+02
max      1.980000e+02
Name: user_length, dtype: float64


In [78]:
# Mobility of users
df.rename({"nr_muns":"user_nr_muns"}, inplace=True, axis=1)
df["user_nr_muns"].describe()

count    2.686779e+06
mean     3.756926e+00
std      4.044559e+00
min      1.000000e+00
25%      1.000000e+00
50%      3.000000e+00
75%      5.000000e+00
max      9.100000e+01
Name: user_nr_muns, dtype: float64

In [79]:
# Order columns
move_cols = ['mun_id_tweet', 'mun_name_tweet', 'mun_id_user', 'mun_name_user']
cols  = move_cols + [col for col in df.columns if col not in move_cols]
df = df[cols]
df.columns

Index(['mun_id_tweet', 'mun_name_tweet', 'mun_id_user', 'mun_name_user', 'id',
       'created_at', 'tweet', 'user_id', 'user_followers', 'user_friends',
       'user_statuses', 'user_verified', 'user_description', 'user_created_at',
       'place_type', 'place_name', 'bounding_box', 'is_quote_status', 'source',
       'approx_coords', 'long', 'lat', 'timezone', 'local_time', 'hour', 'day',
       'weekday', 'workhours', 'user_nr_muns', 'iphone', 'insta',
       'tweet_length', 'user_length'],
      dtype='object')

### 6.2. Export files


In [82]:
# Export ID file
df[["id", "user_id", "mun_id_tweet", "mun_id_user"]].to_csv(output_path + r"\ids.csv",
                                                                  encoding="utf-8",
                                                                  sep=";",
                                                                  index=False)

In [83]:
# Export full dataset (takes long)
df.to_csv(output_path + r"\tweet_data.csv",
          encoding="utf-8",
          sep=";",
          index=False)

In [84]:
# Nr of tweets in final sample
len(df)

2686779

In [85]:
# Nr of users in final sample
df["user_id"].nunique()

123309

In [None]:
# Delete LATER
# df = pd.read_csv(output_path + r"\tweet_data.csv",
#           encoding="utf-8",
#           sep=";")

In [86]:
df["place_type"].value_counts(normalize=True)

city            0.951762
exact           0.029535
poi             0.015859
neighborhood    0.002845
Name: place_type, dtype: float64