In [2]:
import pandas as pd
import numpy as np
from string import punctuation
import regex as re
from collections import Counter

In [3]:
# Load the raw tweet corpus from CSV and preview the first rows.
raw_texts_path = "../data/texts.csv"
data = pd.read_csv(raw_texts_path)
data.head()

Unnamed: 0,id,dialect,text
0,1175358310087892992,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,IQ,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺


## Preprocessing text
- Removing accents, diacritics, emojis, Latin characters, and numbers
- Collapsing hamza and other characters into more common characters to avoid misspellings
- Removing duplicate characters
- Splitting hashtags into their semantic equivalents
- Trimming words and removing extra whitespace

In [4]:
# Arabic combining marks U+064B-U+065F (tanwin, short vowels, shadda, sukun, ...),
# the superscript alef U+0670 and the tatweel/kashida U+0640. These are deleted
# outright (not replaced by a space) so they never split a word.
_DIACRITICS_AND_TATWEEL = re.compile(r"[\u064B-\u065F\u0670\u0640]")

# One-to-one spelling normalisations. Code points are written as escapes so the
# mapping stays unambiguous in editors that reorder right-to-left text.
_LETTER_NORMALIZATION = str.maketrans({
    "\u0625": "\u0627",  # alef with hamza below -> bare alef
    "\u0623": "\u0627",  # alef with hamza above -> bare alef
    "\u0671": "\u0627",  # alef wasla            -> bare alef
    "\u0622": "\u0627",  # alef with madda       -> bare alef
    "\u0649": "\u064A",  # alef maksura          -> yeh
    "\u0624": "\u0621",  # waw with hamza        -> hamza
    "\u0626": "\u0621",  # yeh with hamza        -> hamza
    "\u0629": "\u0647",  # teh marbuta           -> heh
})

# Anything that is neither a character in U+0621-U+064A nor whitespace:
# emojis, Latin letters, digits, punctuation, '#', '_', '^', ...
_NON_ARABIC = re.compile(r"[^\u0621-\u064A\s]")

# A character from U+0621-U+064A immediately repeated one or more times.
_REPEATED_LETTER = re.compile(r"([\u0621-\u064A])\1+")

_WHITESPACE_RUN = re.compile(r"\s+")


def preprocess_text(text):
    """Normalise one Arabic tweet into a clean, space-separated string.

    Steps, in order:

    1. Delete diacritics and tatweel. This must happen before step 3;
       otherwise each diacritic is turned into a space and a vocalised word
       is broken into single-letter fragments.
    2. Collapse alef/hamza variants, alef maksura and teh marbuta into their
       more common spellings.
    3. Replace every remaining character outside U+0621-U+064A and whitespace
       with a space. This drops emojis, Latin letters, digits and
       punctuation, and splits hashtags on '#' and '_'.
    4. Collapse runs of the same letter into a single letter.
    5. Collapse every whitespace run (including a lone tab or newline) into
       one space and trim both ends. This runs last so that no earlier
       deletion can leave a double space behind.

    Parameters
    ----------
    text : str or None or float
        The raw text. ``None`` and float ``NaN`` (as produced by pandas for
        empty CSV cells) are treated as empty text.

    Returns
    -------
    str
        The normalised text; ``""`` when nothing usable remains.

    Raises
    ------
    TypeError
        If ``text`` is any other non-string value.
    """
    # Missing values: None, or a float NaN (the only float not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = _DIACRITICS_AND_TATWEEL.sub("", text)
    text = text.translate(_LETTER_NORMALIZATION)
    text = _NON_ARABIC.sub(" ", text)
    text = _REPEATED_LETTER.sub(r"\1", text)
    text = _WHITESPACE_RUN.sub(" ", text)

    return text.strip()

In [5]:
# Overwrite the `text` column with its normalised form, one tweet at a time.
data["text"] = data["text"].apply(preprocess_text)

### Removing short sentences from the dataset 

In [6]:
def _count_words(sentence):
    """Return the number of whitespace-separated tokens in `sentence`."""
    return len(sentence.split())


# Store each row's word count in a new `length` column, then preview the frame.
data["length"] = data["text"].apply(_count_words)
data.head()

Unnamed: 0,id,dialect,text,length
0,1175358310087892992,IQ,لكن بالنهايه ينتفض يغير,4
1,1175416117793349632,IQ,يعني هذا محسوب علي البشر حيونه وحشيه وتطلبون م...,15
2,1175450108898565888,IQ,مبين من كلامه خليجي,4
3,1175471073770573824,IQ,يسلملي مرورك وروحك الحلوه,4
4,1175496913145217024,IQ,وين هل الغيبه اخ محمد,5


In [7]:
# Sentences with this many words or fewer are considered too short to keep.
SHORT_SENTENCE_MAX_WORDS = 3

# Keep only rows whose word count exceeds the threshold.
data = data[data["length"] > SHORT_SENTENCE_MAX_WORDS]

# Write just the text, word-count and dialect columns; the row index is not written.
data[["text", "length", "dialect"]].to_csv("../data/train_data.csv", index=False)

# Preview goes last: only a cell's final expression is rendered, so a
# `data.head()` placed before `to_csv` is evaluated and silently discarded.
data.head()