# Data Cleaning

## Importamos los datasets

In [1]:
import numpy as np
import pandas as pd
import string


In [2]:
data_main = pd.read_csv('../../raw_data/aggression_parsed_dataset.csv',usecols=['Text','oh_label'])
data_main.head()

Unnamed: 0,Text,oh_label
0,`- This is not ``creative``. Those are the di...,0
1,` :: the term ``standard model`` is itself le...,0
2,"True or false, the situation as of March 200...",0
3,"Next, maybe you could work on being less cond...",0
4,This page will need disambiguation.,0


In [3]:
data_main.shape

(115864, 2)

In [4]:
data_twitter = pd.read_csv('../../raw_data/twitter_parsed_dataset.csv',usecols=['Text','oh_label'])
data_twitter.head()

Unnamed: 0,Text,oh_label
0,@halalflaws @biebervalue @greenlinerzjm I read...,0.0
1,@ShreyaBafna3 Now you idiots claim that people...,0.0
2,"RT @Mooseoftorment Call me sexist, but when I ...",1.0
3,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",1.0
4,#mkr No No No No No No,0.0


In [5]:
data_twitter.shape

(16851, 2)

In [6]:
data_youtube = pd.read_csv('../../raw_data/youtube_parsed_dataset.csv',usecols=['Text','oh_label'])
data_youtube.head()

Unnamed: 0,Text,oh_label
0,Does N.e.bodyelse Hear her Crazy ass Screamin ...,0
1,There are so many things that are incorrect wi...,0
2,3:26 hahah my boyfriend showed this song to me...,1
3,dick beyonce fuck y a ass hole you are truely ...,1
4,DongHaeTaemin and Kai ;A; luhansehun and bacon...,0


In [7]:
data_kaggle = pd.read_csv('../../raw_data/kaggle_parsed_dataset.csv',usecols=['Text','oh_label'])
data_kaggle.head()

Unnamed: 0,oh_label,Text
0,1,"""You fuck your dad."""
1,0,"""i really don't understand your point.\xa0 It ..."
2,0,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,"""listen if you dont wanna get married to a man..."
4,0,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [8]:
data_toxi = pd.read_csv('../../raw_data/toxicity_parsed_dataset.csv',usecols=['Text','oh_label'])
data_toxi.head()

Unnamed: 0,Text,oh_label
0,This: :One can make an analogy in mathematical...,0
1,` :Clarification for you (and Zundark's righ...,0
2,Elected or Electoral? JHK,0
3,`This is such a fun entry. Devotchka I once...,0
4,Please relate the ozone hole to increases in c...,0


In [9]:
data_toxi.shape

(159686, 2)

## Concatenamos todos los datasets

In [10]:
data = pd.concat([data_main,data_twitter,data_youtube,data_kaggle,data_toxi])
data.shape

(304664, 2)

In [11]:
data.head()

Unnamed: 0,Text,oh_label
0,`- This is not ``creative``. Those are the di...,0.0
1,` :: the term ``standard model`` is itself le...,0.0
2,"True or false, the situation as of March 200...",0.0
3,"Next, maybe you could work on being less cond...",0.0
4,This page will need disambiguation.,0.0


## Limpieza de datos

### Funcion "Clean data" (Punctuation+lowercase+numbers)

In [15]:
import demoji
def clean_data(text):
    
    text = str(text)
    
    #emoji translation
    demoji.replace_with_desc(text," ")
    
    # remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    # lower text
    text = text.lower()
    
    # remove numbers
    text = ''.join([w for w in text if not w.isdigit()]) 
    
    return text

In [16]:
data['Text'] = data['Text'].apply(clean_data)

In [17]:
data.rename(columns={'Text':'text','oh_label':'target'},inplace=True)

In [18]:
data.head()

Unnamed: 0,text,target
0,this is not creative those are the dictionar...,0.0
1,the term standard model is itself less npov...,0.0
2,true or false the situation as of march was...,0.0
3,next maybe you could work on being less conde...,0.0
4,this page will need disambiguation,0.0


In [19]:
data.describe()

Unnamed: 0,target
count,304661.0
mean,0.127072
std,0.333055
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


### Drop duplicates/Nan

In [20]:
data = data.drop_duplicates()

In [21]:
data.replace(to_replace=[""," "],value=np.nan,inplace=True)

In [22]:
data.dropna(inplace=True)

In [23]:
data.shape

(230155, 2)

### Rename columns

In [24]:
data.head()

Unnamed: 0,text,target
0,this is not creative those are the dictionar...,0.0
1,the term standard model is itself less npov...,0.0
2,true or false the situation as of march was...,0.0
3,next maybe you could work on being less conde...,0.0
4,this page will need disambiguation,0.0


In [25]:
data['text']=data['text'].astype(str)

In [26]:
data.shape

(230155, 2)

### Limitamos el largo de los textos a 150 palabras

In [27]:
data['text'] = data['text'].apply(lambda x: x[:150])

## Export data

In [28]:
data.to_csv('../../cyberbullying/data/data.csv',index=False)