# Data cleaning

We clean the datasets collected in Step 2 (Data Collection) and re-save them in the folder "_dataframes" as CSV files.

## Importing packages

In [1]:
import pandas as pd

## Importing datasets

We import the previously generated datasets and create a new one that concatenates all of them (excluding ML_Ayuda).

In [2]:
# Reading the datasets (one CSV per account, read with a default 0..n-1 index)

df_ml = pd.read_csv("_dataset/Mercadolibre_tweets.csv")
df_argentina = pd.read_csv("_dataset/ML_Argentina_tweets.csv")
df_mexico = pd.read_csv("_dataset/ML_Mexico_tweets.csv")
df_venezuela = pd.read_csv("_dataset/ML_Venezuela_tweets.csv")
df_ayuda = pd.read_csv("_dataset/ML_Ayuda_tweets.csv")

# Adding column to identify account
# `insert` mutates each frame in place and puts 'Account' in the first position.

df_ml.insert(loc=0, column='Account', value="@Mercadolibre")
df_argentina.insert(loc=0, column='Account', value="@ML_Argentina")
df_mexico.insert(loc=0, column='Account', value="@ML_Mexico")
df_venezuela.insert(loc=0, column='Account', value="@ML_Venezuela")
df_ayuda.insert(loc=0, column='Account', value="@ML_Ayuda")

# Join datasets
# df_ayuda is intentionally left out of the combined frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source frame's row labels are repeated, so a label lookup such as
# df.loc[0] would return one row per source frame.

df = pd.concat([df_ml, df_argentina, df_mexico, df_venezuela], ignore_index=True)

# Frames processed together by later cells. The list holds references to the
# same objects, so in-place changes to any frame are visible through `parts`.
parts = [df, df_ml, df_argentina, df_mexico, df_venezuela, df_ayuda]

### Cleaning 

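First, we rename the `date` column of the ML_Ayuda dataset to `created_at` so that it matches the other datasets; then we split the date column into four: year, month, day, hour.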

In [3]:
# Align df_ayuda with the other frames, which call this column 'created_at'.
# The rename must happen in place: `parts` already holds a reference to this
# exact object, and the date-splitting loop reads frame['created_at'] from it.
df_ayuda.rename({'date': 'created_at'}, axis='columns', inplace=True)

In [4]:
# Preview the first three rows of the combined frame
df.head(n=3)

Unnamed: 0,Account,id,created_at,favorite_count,retweet_count,text
0,@Mercadolibre,1357071365279371265,2021-02-03 21:00:01,1838,143,En 2020 incorporamos un promedio de 40 emplead...
1,@Mercadolibre,1356708979716538373,2021-02-02 21:00:01,886,55,Salvaje usa nuestra caja para dormir la siesta...
2,@Mercadolibre,1355259421975187456,2021-01-29 21:00:00,936,78,"Día a día, más de 80 mil pymes argentinas vend..."


Let's split the date into four columns: year, month, day, hour.

In [5]:
# Add 'year', 'month', 'day' and 'hour' columns, derived from 'created_at',
# to every frame in `parts` (the frames are modified in place).
for frame in parts:
    # Parse the timestamps once per frame instead of once per derived column.
    # A DatetimeIndex (rather than a Series) is kept on purpose: its fields
    # are assigned positionally, independent of the frame's row labels.
    stamps = pd.DatetimeIndex(frame['created_at'])
    frame['year'] = stamps.year
    frame['month'] = stamps.month
    frame['day'] = stamps.day
    frame['hour'] = stamps.hour

In [6]:
# Check the new year/month/day/hour columns on the combined frame
df.head(n=3)

Unnamed: 0,Account,id,created_at,favorite_count,retweet_count,text,year,month,day,hour
0,@Mercadolibre,1357071365279371265,2021-02-03 21:00:01,1838,143,En 2020 incorporamos un promedio de 40 emplead...,2021,2,3,21
1,@Mercadolibre,1356708979716538373,2021-02-02 21:00:01,886,55,Salvaje usa nuestra caja para dormir la siesta...,2021,2,2,21
2,@Mercadolibre,1355259421975187456,2021-01-29 21:00:00,936,78,"Día a día, más de 80 mil pymes argentinas vend...",2021,1,29,21


In [7]:
# First three rows of the @Mercadolibre frame
df_ml.head(n=3)

Unnamed: 0,Account,id,created_at,favorite_count,retweet_count,text,year,month,day,hour
0,@Mercadolibre,1357071365279371265,2021-02-03 21:00:01,1838,143,En 2020 incorporamos un promedio de 40 emplead...,2021,2,3,21
1,@Mercadolibre,1356708979716538373,2021-02-02 21:00:01,886,55,Salvaje usa nuestra caja para dormir la siesta...,2021,2,2,21
2,@Mercadolibre,1355259421975187456,2021-01-29 21:00:00,936,78,"Día a día, más de 80 mil pymes argentinas vend...",2021,1,29,21


In [8]:
# First three rows of the @ML_Argentina frame
df_argentina.head(n=3)

Unnamed: 0,Account,id,created_at,favorite_count,retweet_count,text,year,month,day,hour
0,@ML_Argentina,1344003160160022532,2020-12-29 19:31:38,143,8,🐶😍 https://t.co/Q7y7Vkgm13,2020,12,29,19
1,@ML_Argentina,1341390231082311685,2020-12-22 14:28:47,209,14,🎅💘 https://t.co/03QlciRlw4,2020,12,22,14
2,@ML_Argentina,1331702117791379457,2020-11-25 20:51:41,11818,592,Gracias Diego. https://t.co/PrWRwhf5Oi,2020,11,25,20


In [9]:
# First three rows of the @ML_Mexico frame
df_mexico.head(n=3)

Unnamed: 0,Account,id,created_at,favorite_count,retweet_count,text,year,month,day,hour
0,@ML_Mexico,1354564847510425607,2021-01-27 23:00:00,4,1,Nosotros te ayudamos a vender tu inmueble fáci...,2021,1,27,23
1,@ML_Mexico,1352360319402708994,2021-01-21 21:00:00,8,0,Corre y consigue tu moto ideal 🏍️ con hasta 25...,2021,1,21,21
2,@ML_Mexico,1351952673617162245,2021-01-20 18:00:10,9,1,¿Cuál beneficio 🤯 de las Súper Rebajas te emoc...,2021,1,20,18


In [10]:
# First three rows of the @ML_Venezuela frame
df_venezuela.head(n=3)

Unnamed: 0,Account,id,created_at,favorite_count,retweet_count,text,year,month,day,hour
0,@ML_Venezuela,1357713281432117248,2021-02-05 15:30:46,1,0,"Y tú, ¿cómo vas a pasar Carnaval? Sin importar...",2021,2,5,15
1,@ML_Venezuela,1357088385660313601,2021-02-03 22:07:39,0,0,"Si realmente conoces a tu crush, regalarle alg...",2021,2,3,22
2,@ML_Venezuela,1356753312792670211,2021-02-02 23:56:11,0,0,"El amor está en el aire, solo tienes que encon...",2021,2,2,23


In [11]:
# First three rows of the @ML_Ayuda frame
df_ayuda.head(n=3)

Unnamed: 0,Account,created_at,location,text,year,month,day,hour
0,@ML_Ayuda,2021-02-07 16:32:02,,@ML_Ayuda Explícame si lo puedo devolver y si ...,2021,2,7,16
1,@ML_Ayuda,2021-02-07 16:31:23,,@ML_Ayuda buen día compro normalmente con dos ...,2021,2,7,16
2,@ML_Ayuda,2021-02-07 16:31:22,,@ML_Ayuda La cuestión como saben cual es mi cu...,2021,2,7,16


In [12]:
# Write every cleaned frame to its CSV file; index=False keeps the row
# labels out of the saved files.
exports = [
    (df, '_dataframes/df_clean.csv'),
    (df_ml, '_dataframes/df_ml_clean.csv'),
    (df_argentina, '_dataframes/df_argentina_clean.csv'),
    (df_mexico, '_dataframes/df_mexico_clean.csv'),
    (df_venezuela, '_dataframes/df_venezuela_clean.csv'),
    (df_ayuda, '_dataframes/df_ayuda_clean.csv'),
]
for frame, destination in exports:
    frame.to_csv(destination, index=False)