## Preprocessing

This notebook keeps only the tweets written in Spanish, removes accents, and applies the other data-engineering steps needed to build the dataset for our model.

In [1]:
# Language detection 
import langid  
from langdetect import detect  
import textblob

# Data manipulation and other
import pandas as pd
import numpy as np
import datetime as dt
import os
import unidecode

In [2]:
# Current working directory and data directory.
# os.path.join picks the right separator for the host OS (the previous
# hard-coded "\\" only worked on Windows). The trailing "" keeps a trailing
# separator on dwd, which the later `dwd + "<file>.csv"` concatenations need.
cwd = os.getcwd()
dwd = os.path.join(cwd, "data_csv", "")

In [3]:
# Load the six raw query exports from the data directory.
# The file names are not uniform (e.g. "querychil.csv", "querycol_nonp.csv"),
# so they are listed explicitly, in the same order as the target variables.
query_files = (
    "querybol_p.csv",
    "querybol_np.csv",
    "querychil.csv",
    "querychil_np.csv",
    "querycol_p.csv",
    "querycol_nonp.csv",
)
bolP, bolNP, chiP, chiNP, colP, colNP = (
    pd.read_csv(dwd + filename, low_memory=False) for filename in query_files
)

### Language detection

In [4]:
# Helper functions that detect the language of a tweet (one per detection library)
# Source: http://blog.manugarri.com/sentiment-analysis-in-spanish/

def langid_safe(tweet):
    """Return the language code langid predicts for ``tweet``, or None on failure.

    Parameters
    ----------
    tweet : value passed straight to ``langid.classify``.

    Returns
    -------
    The first element of the ``langid.classify`` result, or ``None`` when
    classification raises for any reason (best-effort: one bad row must not
    abort a whole ``Series.apply``).
    """
    try:
        language = langid.classify(tweet)[0]
    except Exception:
        # Deliberate best-effort: an unclassifiable tweet is reported as None.
        language = None
    return language

def langdetect_safe(tweet):
    """Return the language code langdetect predicts for ``tweet``, or None on failure.

    Parameters
    ----------
    tweet : value passed straight to ``langdetect.detect``.

    Returns
    -------
    The value returned by ``detect``, or ``None`` when detection raises for
    any reason (best-effort: one bad row must not abort a whole
    ``Series.apply``).

    NOTE(review): langdetect's README describes detection as non-deterministic
    unless ``DetectorFactory.seed`` is set; confirm whether a seed is wanted
    before relying on this detector for reproducible filtering.
    """
    try:
        language = detect(tweet)
    except Exception:
        # Deliberate best-effort: an undetectable tweet is reported as None.
        language = None
    return language

def textblob_safe(tweet):
    """Return the language code TextBlob detects for ``tweet``, or None on failure.

    Parameters
    ----------
    tweet : value wrapped in ``textblob.TextBlob``.

    Returns
    -------
    The value returned by ``TextBlob(tweet).detect_language()``, or ``None``
    when construction or detection raises for any reason (best-effort: one
    bad row must not abort a whole ``Series.apply``).

    NOTE(review): ``detect_language`` appears to be deprecated/removed in newer
    TextBlob releases; if so, this function silently returns None for every
    tweet. Confirm against the installed version.
    """
    try:
        language = textblob.TextBlob(tweet).detect_language()
    except Exception:
        # Deliberate best-effort: an undetectable tweet is reported as None.
        language = None
    return language

In [5]:
# Five-row sample of colP used to try out the language-detection helpers;
# the last line displays the tweets in that sample.
temp = colP.head(5).copy()
colP["text"].head(5)

0    Diss 30nov passejarem per les ribes del Brugen...
1    @IvanDuque el tocino sigue asesinando su puebl...
2    A ver si lo entienden quienes pretenden vender...
3    Luisa Amanda ha padecido las penurias de mante...
4    #29NParoNacional una manera de protestar es no...
Name: text, dtype: object

In [6]:
# Time each language-detection helper on the five-row sample to compare their cost.
# %timeit runs the statement repeatedly, so each column on `temp` is simply
# overwritten on every repetition; the columns remain on `temp` afterwards.
%timeit temp['lang_langid']     = temp.text.apply(langid_safe)  
%timeit temp['lang_langdetect'] = temp.text.apply(langdetect_safe)  
%timeit temp['lang_textblob']   = temp.text.apply(textblob_safe)

17.5 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
27.7 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
319 ms ± 20.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# All six raw frames, so the same step can be applied to each in a loop
pdlist = [
    bolP,
    bolNP,
    chiP,
    chiNP,
    colP,
    colNP,
]

In [8]:
# Add a 'lang_langid' column to every frame (modified in place) holding the
# language code langid_safe returns for each tweet's text
for frame in pdlist:
    detected_language = frame['text'].apply(langid_safe)
    frame['lang_langid'] = detected_language

In [None]:
# Keep only the tweets that langid classified as Spanish
def _keep_spanish(frame):
    """Return an independent copy of the rows of ``frame`` whose ``lang_langid`` is "es".

    The explicit ``.copy()`` detaches the result from the unfiltered frame, so
    adding columns to it later does not raise pandas' SettingWithCopyWarning.
    """
    return frame.loc[frame["lang_langid"] == "es"].copy()

bolP  = _keep_spanish(bolP)
bolNP = _keep_spanish(bolNP)
chiP  = _keep_spanish(chiP)
chiNP = _keep_spanish(chiNP)
colP  = _keep_spanish(colP)
colNP = _keep_spanish(colNP)

In [None]:
# Check how balanced the datasets are: one (rows, columns) pair per frame
tuple(frame.shape for frame in (bolP, bolNP, chiP, chiNP, colP, colNP))

### Data engineering (continuation)

1. Create protest and country id
2. Append datasets
3. Keep variables of interest
4. Create clean text with hashtags and mentioned users
5. Remove accents

In [None]:
# Tag every frame with its country code and its protest flag
# (the *P frames get protest = 1, the *NP frames get protest = 0)
frame_labels = [
    (bolP,  "bol", 1),
    (bolNP, "bol", 0),
    (chiP,  "chi", 1),
    (chiNP, "chi", 0),
    (colP,  "col", 1),
    (colNP, "col", 0),
]
for frame, country_code, protest_flag in frame_labels:
    frame["country"] = country_code
    frame["protest"] = protest_flag

In [None]:
# Stack the six frames into a single dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a single
# pd.concat call is the supported equivalent and avoids re-copying the growing
# frame on every step. Row labels are kept as they are (ignore_index defaults
# to False), matching what the chained appends produced.
tweetslat = pd.concat([bolP, bolNP, chiP, chiNP, colP, colNP])

In [None]:
# Display the size of the combined dataset as (rows, columns)
tweetslat.shape

In [None]:
# Keep only the variables of interest.
# .copy() detaches the selection from the wider frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
columns_of_interest = [
    'user_id', 'country', 'protest',
    'timestamp', 'text', 'hashtags', 'mentioned_users',
    'likes', 'retweets',
    'cleaned_text',
]
tweetslat = tweetslat[columns_of_interest].copy()

In [None]:
def _str_list_to_tokens(column):
    """Turn a Series of stringified lists into a Series of token lists.

    The quote/bracket removal implies values shaped like "['a', 'b']"
    (presumably Python list reprs -- confirm against the csv files). Quotes and
    the surrounding brackets are dropped and the remainder is split on commas.
    Each token is then stripped, because splitting "a, b" on "," would
    otherwise leave a leading space on every token after the first.
    """
    return (
        column.str.replace("'", "", regex=False)
        .str.strip("][")
        .str.split(",")
        .apply(lambda tokens: [token.strip() for token in tokens])
    )

# String variables to lowercase
# (str.lower is applied per element, so every value must already be a string)
tweetslat['hashtags']        = tweetslat['hashtags'].apply(str.lower)
tweetslat['mentioned_users'] = tweetslat['mentioned_users'].apply(str.lower)

# Remove @ on mentioned users (literal replacement, not a regex)
tweetslat['mentioned_users'] = tweetslat['mentioned_users'].str.replace('@', '', regex=False)

# Hashtags, mentioned_users and cleaned_text from str list to list
for column_name in ('hashtags', 'mentioned_users', 'cleaned_text'):
    tweetslat[column_name] = _str_list_to_tokens(tweetslat[column_name])

# Merge hashtags, mentioned_users and cleaned_text (list concatenation per row)
tweetslat['clean_text_2']    = tweetslat['hashtags'] + tweetslat['mentioned_users'] + tweetslat['cleaned_text']

# Drop empty tokens, then remove accents by transliterating each token with unidecode
tweetslat['clean_text_2']    = tweetslat['clean_text_2'].apply(
    lambda tokens: [unidecode.unidecode(token) for token in tokens if token]
)

In [None]:
# Write the final dataset next to the raw csv files, without the row index
output_path = dwd + "tweetslat.csv"
tweetslat.to_csv(output_path, index=False)