In [9]:
# Import libraries
import pandas as pd
import unidecode
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from langdetect import detect
from collections import Counter

# Shared NLP helpers used by the cells below.
vectorizer = TfidfVectorizer()        # TF-IDF vectorizer with default settings
tokenizer = RegexpTokenizer(r'\w+')   # keeps runs of word characters, drops punctuation
stemmer = SnowballStemmer('spanish')  # Spanish Snowball stemmer

In [3]:
# Stopword corpus file used for each language code returned by langdetect.
_STOPWORD_FILES = {"es": "spanish", "ca": "catalan.txt", "eu": "basque.txt"}
# language code -> frozenset of accent-stripped stopwords (filled lazily).
_stopword_cache = {}


def _stopwords_for(language):
    """Return the accent-stripped stopword set for a language code (empty if unsupported)."""
    if language not in _STOPWORD_FILES:
        return frozenset()
    if language not in _stopword_cache:
        # Tokens are compared after unidecode, so the stopwords must be
        # normalized the same way or accented entries could never match.
        _stopword_cache[language] = frozenset(
            unidecode.unidecode(w) for w in stopwords.words(_STOPWORD_FILES[language])
        )
    return _stopword_cache[language]


def process_tweet(sentence, language=None):
    """Normalize a tweet into a space-separated string of stemmed tokens.

    Steps: lowercase, strip accents, drop the handle of a leading
    "rt @user" prefix, tokenize on word characters, remove stopwords for
    the detected language (Spanish, Catalan or Basque), remove "rt" tokens
    and tokens containing "http", then stem every remaining token.

    Parameters
    ----------
    sentence : str
        Raw tweet text.
    language : str, optional
        Language code of the tweet ("es", "ca", "eu", ...). When omitted it
        is detected from ``sentence`` itself with ``detect``.

    Returns
    -------
    str
        The processed tweet; empty string if no token survives.

    Raises
    ------
    Whatever ``detect`` raises when ``language`` is omitted and the language
    of the text cannot be detected.
    """
    # Detect once, on this tweet's own raw text. The previous version read the
    # notebook-global `row`, so the function only worked inside the processing
    # loop, and repeated detect() calls could disagree with each other.
    if language is None:
        language = detect(sentence)

    text = unidecode.unidecode(sentence.lower())

    words = text.split()
    # For a retweet prefix "rt @user ..." drop the user handle.
    if len(words) > 1 and words[0].lower() == "rt" and "@" in words[1]:
        del words[1]
    words = tokenizer.tokenize(" ".join(words))

    stop_set = _stopwords_for(language)
    words = [
        w for w in words
        if w not in stop_set          # stopwords of the detected language
        and w.lower() != "rt"         # retweet marker
        and "http" not in w           # url fragments
    ]

    # NOTE: the shared stemmer is Spanish; it is applied to every tweet
    # regardless of the detected language, as before.
    return " ".join(stemmer.stem(w) for w in words)

In [4]:
# Read the raw tweet dataset from CSV into a DataFrame.
df_tweets = pd.read_csv("tweet_dataset.csv")

In [5]:
# One two-item list per Autonomous Community: [name, "coordinates,radius"].
# The second item is a single comma-separated string: two coordinates and a radius in km.
Andalusia = [
    "Andalusia",
    "37.3399964,-4.5811614,250km",
]
Madrid = [
    "Madrid",
    "40.5248319,-3.7715628,60km",
]
Catalonia = [
    "Catalonia",
    "41.8523094,1.5745043,150km",
]
Canary_Islands = [
    "Canary_Islands",
    "28.5306525,-15.7464439,400km",
]
Basque_Country = [
    "Basque_Country",
    "42.9911816,-2.5543023,100km",
]

# Communities in the order they are processed.
CAS = [
    Andalusia,
    Madrid,
    Catalonia,
    Basque_Country,
    Canary_Islands,
]

In [6]:
# Process tweets from dataset.
# Keeps only tweets detected as Spanish ("es"), Catalan ("ca") or Basque ("eu"),
# grouped by Autonomous Community in the order given by CAS.
accepted_languages = {"es", "ca", "eu"}
processed_dataset = []
skipped = Counter()  # exception type name -> number of tweets dropped because of it
n_rows = len(df_tweets["location"])
actual_row = 1  # counts accepted tweets only, so it will not reach n_rows
for CA in CAS:
    for index, row in df_tweets[df_tweets["location"] == CA[0]].iterrows():
        try:
            # Detect once per tweet: langdetect is not deterministic across
            # calls unless seeded, so repeated calls may disagree.
            if detect(row["tweet"]) not in accepted_languages:
                continue
            # .iloc keeps the original positional column access (0, 1, 2)
            # without relying on the deprecated integer-key fallback of
            # label-indexed Series.
            sentence = process_tweet(row.iloc[1])
            processed_dataset.append([row.iloc[0], sentence, row.iloc[2]])
        except Exception as error:
            # Best effort: a tweet that cannot be processed is dropped, but
            # the reason is tallied instead of vanishing silently.
            # KeyboardInterrupt is no longer swallowed.
            skipped[type(error).__name__] += 1
            continue
        if actual_row % 100 == 0:
            print("Row {} of {}".format(actual_row, n_rows))
        actual_row += 1

if skipped:
    print("Skipped tweets by error type: {}".format(dict(skipped)))

Row 100 of 61903
Row 200 of 61903
Row 300 of 61903
Row 400 of 61903
Row 500 of 61903
Row 600 of 61903
Row 700 of 61903
Row 800 of 61903
Row 900 of 61903
Row 1000 of 61903
Row 1100 of 61903
Row 1200 of 61903
Row 1300 of 61903
Row 1400 of 61903
Row 1500 of 61903
Row 1600 of 61903
Row 1700 of 61903
Row 1800 of 61903
Row 1900 of 61903
Row 2000 of 61903
Row 2100 of 61903
Row 2200 of 61903
Row 2300 of 61903
Row 2400 of 61903
Row 2500 of 61903
Row 2600 of 61903
Row 2700 of 61903
Row 2800 of 61903
Row 2900 of 61903
Row 3000 of 61903
Row 3100 of 61903
Row 3200 of 61903
Row 3300 of 61903
Row 3400 of 61903
Row 3500 of 61903
Row 3600 of 61903
Row 3700 of 61903
Row 3800 of 61903
Row 3900 of 61903
Row 4000 of 61903
Row 4100 of 61903
Row 4200 of 61903
Row 4300 of 61903
Row 4400 of 61903
Row 4500 of 61903
Row 4600 of 61903
Row 4700 of 61903
Row 4800 of 61903
Row 4900 of 61903
Row 5000 of 61903
Row 5100 of 61903
Row 5200 of 61903
Row 5300 of 61903
Row 5400 of 61903
Row 5500 of 61903
Row 5600 of 61903
R

Row 43800 of 61903
Row 43900 of 61903
Row 44000 of 61903
Row 44100 of 61903
Row 44200 of 61903
Row 44300 of 61903
Row 44400 of 61903
Row 44500 of 61903
Row 44600 of 61903
Row 44700 of 61903
Row 44800 of 61903
Row 44900 of 61903
Row 45000 of 61903
Row 45100 of 61903
Row 45200 of 61903


In [7]:
# Turn the accumulated [id, tweet, location] records into a DataFrame.
processed_columns = ["id", "tweet", "location"]
df_processed = pd.DataFrame(data=processed_dataset, columns=processed_columns)

In [10]:
# Tally processed tweets per location and show the five most frequent.
location_tally = Counter(df_processed["location"].tolist())
count_CA = location_tally.most_common(5)
print(count_CA)

[('Andalusia', 10291), ('Madrid', 9670), ('Basque_Country', 9130), ('Catalonia', 8858), ('Canary_Islands', 7268)]


In [11]:
# Write the processed tweets to CSV (UTF-8 with BOM, no index column).
df_processed.to_csv(
    "tweet_dataset_processed.csv",
    index=False,
    encoding='utf-8-sig',
)

In [12]:
# Load the processed tweets from the CSV file, replacing the in-memory frame.
df_processed = pd.read_csv(
    "tweet_dataset_processed.csv",
)

In [13]:
# Preview the first five processed tweets.
df_processed.head(n=5)

Unnamed: 0,id,tweet,location
0,625707906,dia ayer amig cit emple call av irigoy 1334 av...,Andalusia
1,625707906,shaval shaval voy bedford ano vien t co aoxmpm...,Andalusia
2,625707906,hac falt pens sab llev segund fot pulser lumin...,Andalusia
3,625707906,imagin ser asi ridicul pens tod luc movil cans...,Andalusia
4,625707906,k recuerd d bibi limit t co ozagbzlngw,Andalusia
