In [154]:
import re
import string

import emoji
import nltk
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm.notebook import tqdm

nltk.download('stopwords')


class TextTokenizer():
    """
        Preprocess an input sentence, output new sentence (tokens joined
        back together with single spaces).

        Emoji characters are always pulled out of the text. Unless
        "remove_emojis" is requested, their names (for example
        "smiling_face_with_smiling_eyes") are appended after the word tokens.

        *preprocess_steps: list or None
            None : Normal split sentence by space
            "base" : to lower + remove punctuations and digits
                     (stopwords are NOT removed by "base"; use
                     "remove_stopwords" for that)
            "lower" : Lowercase only
            "remove_punctuations" : Remove punctuation and digit characters
                     (the historical misspelling "remove_punctuaions" is
                     still accepted)
            "remove_stopwords" : Drop tokens found in the NLTK English stopword list
            "stem" : Snowball Stem words in sentence
            "lemmatize" : Lemmatize words in sentence
            "remove_emojis" : Remove emojis from sentence (names are not appended)
            "ngrams" : Add bigrams, trigrams to output tokens
            "2grams" / "3grams" : Add only bigrams / only trigrams
            "replace_consecutive" : Trim runs of a repeated character down to two

        *max_length: int or None
            Keep at most this many output tokens. The cut is applied after
            n-grams and emoji names have been appended.
    """
    def __init__(self, preprocess_steps = None, max_length = None):

        self.max_length = max_length
        if preprocess_steps is not None:
            assert isinstance(preprocess_steps, list) , "preprocess_steps must be a list contains name of methods"
        self.preprocess_steps = preprocess_steps

        # Always set: clean() can strip punctuation without the "base" step,
        # which used to fail with AttributeError.
        self.punctuations = string.punctuation

        if preprocess_steps is not None:
            # "base" keeps loading the list so the attribute stays available
            # to existing callers, even though "base" does not filter with it.
            if "base" in preprocess_steps or "remove_stopwords" in preprocess_steps:
                self.stopwords_list = stopwords.words("english")
            if "stem" in preprocess_steps:
                self.stemmer = SnowballStemmer('english')
            if "lemmatize" in preprocess_steps:
                self.lemmatizer = WordNetLemmatizer()

    def tokenize(self, sentence):
        """Split `sentence` on whitespace, clean it, truncate, and re-join with spaces."""
        tokens = sentence.split()
        if self.preprocess_steps is not None:
            tokens = self.clean(tokens, self.preprocess_steps)
        if self.max_length is not None:
            tokens = tokens[:self.max_length]
        tokens = " ".join(tokens)
        return tokens

    def remove_stopwords(self, tokens):
        """Return '' when the single token `tokens` is a stopword, else return it unchanged."""
        if tokens in self.stopwords_list:
            return ''
        else:
            return tokens

    def add_n_grams(self, tokens, types = None):
        """
            Build space-joined bigrams and/or trigrams from a list of tokens.

            `types` selects what to build ("2grams", "3grams", or "ngrams"
            for both); it defaults to the steps given to the constructor.
        """
        if types is None:
            types = self.preprocess_steps or []
        grams = []
        if "2grams" in types or "ngrams" in types:
            grams.extend(" ".join(gram) for gram in ngrams(tokens, 2))
        if "3grams" in types or "ngrams" in types:
            grams.extend(" ".join(gram) for gram in ngrams(tokens, 3))
        return grams

    def replace_consecutive(self, sentence):
        """Shorten any run of the same character to exactly two (e.g. 'sooooo' -> 'soo')."""
        sentence = re.sub(r"(.)\1+", r"\1\1", sentence)
        return sentence

    def _is_emoji(self, char):
        """Check one character against the emoji table, across `emoji` package versions."""
        checker = getattr(emoji, "is_emoji", None)
        if checker is not None:
            return checker(char)
        # Older releases only expose UNICODE_EMOJI: either a flat
        # {emoji: name} dict or one keyed by language code.
        table = emoji.UNICODE_EMOJI
        return char in table.get("en", table)

    def extract_emojis(self, sentence):
        """
            Split `sentence` into plain text and emoji names.

            Returns (plain_text, names): the text with emoji characters
            removed, and one demojized name per emoji with the surrounding
            colons stripped.
        """
        plain_text = []
        emo = []
        for c in sentence:
            if not self._is_emoji(c):
                plain_text.append(c)
            else:
                tmp = emoji.demojize(c)
                emo.append(tmp[1:-1])  # drop the leading/trailing ':'
        plain_text = "".join(plain_text)
        return plain_text, emo

    def remove_punctuations(self, sentence):
        """Drop every punctuation character and every digit from `sentence`."""
        result = "".join([w for w in sentence if w not in self.punctuations and not w.isdigit()])
        return result

    def word_lowercase(self, sentence):
        """Lowercase `sentence`."""
        return sentence.lower()

    def word_stemmer(self, sentence):
        """Stem a single token with the configured stemmer."""
        sentence = self.stemmer.stem(sentence)
        return sentence

    def clean(self, tokens, types):
        """
            Apply the steps named in `types` to each token.

            Returns the surviving tokens, followed by any n-grams, followed
            by the emoji names (each once, in first-seen order). A list
            holding the single string 'NaN' is returned when nothing is left.
            Stemmer / lemmatizer / stopword list are created in __init__, so
            `types` should be the list the tokenizer was constructed with.
        """
        do_lower = "base" in types or "lower" in types
        # Accept both the correct spelling and the typo older callers may use.
        do_punct = (
            "base" in types
            or "remove_punctuations" in types
            or "remove_punctuaions" in types
        )
        want_grams = any(step in types for step in ("ngrams", "2grams", "3grams"))

        results = []
        # dict instead of set: de-duplicates while keeping a deterministic order
        emo_names = {}
        for tok in tokens:
            tok, emos = self.extract_emojis(tok)
            for emo in emos:
                emo_names.setdefault(emo, None)

            if do_lower:
                tok = self.word_lowercase(tok)

            if do_punct:
                tok = self.remove_punctuations(tok)

            if "remove_stopwords" in types:
                tok = self.remove_stopwords(tok)

            if "lemmatize" in types and tok:
                tok = self.lemmatizer.lemmatize(tok)

            if "stem" in types:
                tok = self.word_stemmer(tok)

            if "replace_consecutive" in types:
                tok = self.replace_consecutive(tok)

            if tok and not tok.isspace():
                results.append(tok)

        if want_grams:
            results = results + self.add_n_grams(results, types)
        if "remove_emojis" not in types:
            results = results + list(emo_names)
        if len(results) == 0:
            results.append('NaN')
        return results
    
# Tokenizer shared by the cells below: "base" cleaning plus trimming of
# repeated characters.
simple_tokenizer = TextTokenizer(preprocess_steps=["base", "replace_consecutive"])
# Smoke test on a sentence containing punctuation and an emoji.
print(simple_tokenizer.tokenize(sentence='Very nice. Thank you 😊'))

very nice thank you smiling_face_with_smiling_eyes
[nltk_data] Downloading package stopwords to /home/ken/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [151]:
# Read test_raw.csv and run every review through the shared tokenizer.
test = pd.read_csv('../extra_data/test_raw.csv')
test["review"] = test["review"].apply(simple_tokenizer.tokenize)

In [153]:
# Save the tokenized test frame; the row index is not written out.
test.to_csv(path_or_buf='../extra_data/test.csv', index=False)

In [168]:
# Extra reviews: rename text/label to review/rating, then keep only rows
# whose rating is not the literal string 'label'.
df = pd.read_csv('../extra_data/raw.csv').rename(columns={"text": "review", "label": "rating"})
df = df[df['rating'] != 'label']  # del noise row

# Train file without its review_id column.
train = pd.read_csv('../extra_data/train.csv').drop(columns=['review_id'])
#1502574
# Stack both sources into one frame with a fresh 0..n-1 index.
df = pd.concat([df, train], ignore_index=True)

In [169]:
# Cast review text to str and ratings to int in one pass.
df = df.astype({'review': str, 'rating': int})

In [52]:
def missing_value_of_data(data):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns: 'Total'
    (number of null cells) and 'Percentage' (that count relative to the
    number of rows, times 100, rounded to 2 decimals). Rows are ordered
    from most to fewest nulls.
    """
    null_counts = data.isna().sum().sort_values(ascending=False)
    row_count = len(data)
    null_share = (null_counts / row_count * 100).round(2)
    return pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Total', 'Percentage'],
    )
# Display the per-column null summary for the combined frame.
missing_value_of_data(data=df)

Unnamed: 0,Total,Percentage
review,6,0.01
rating,0,0.0


In [64]:
def count_values_in_column(data,feature):
    """Frequency table for one column.

    Returns a DataFrame indexed by the distinct values of `data[feature]`
    (nulls counted as their own value) with 'Total' (occurrences) and
    'Percentage' (share of all rows times 100, rounded to 2 decimals).
    """
    column = data.loc[:, feature]
    counts = column.value_counts(dropna=False)
    shares = column.value_counts(dropna=False, normalize=True).mul(100).round(2)
    return pd.concat([counts, shares], axis=1, keys=['Total', 'Percentage'])
# Display the rating distribution of the combined frame.
count_values_in_column(data=df, feature='rating')

Unnamed: 0,Total,Percentage
0,149378,35.97
1,133351,32.11
2,132524,31.91


In [28]:
def duplicated_values_data(data):
    """Count repeated values in each column.

    For every column, counts the entries flagged by `Series.duplicated()`
    (every occurrence after the first). Returns a DataFrame with one row per
    column: 'Columns' holds the column name, 'Duplicate count' the count.
    """
    column_names = data.columns
    repeat_counts = [data[name].duplicated().sum() for name in column_names]
    return pd.concat(
        [pd.Series(column_names), pd.Series(repeat_counts)],
        axis=1,
        keys=['Columns', 'Duplicate count'],
    )
# Display how many repeated values each column of the combined frame holds.
duplicated_values_data(data=df)

Unnamed: 0,Columns,Duplicate count
0,rating,1616701
1,review,85396


In [173]:
# Keep only the first row for each distinct review text, then re-run the
# duplicate report to confirm the review column is clean.
df = df.drop_duplicates(subset='review')
duplicated_values_data(data=df)

Unnamed: 0,Columns,Duplicate count
0,rating,1616701
1,review,0


In [174]:
# Replace each review with its tokenized form and preview the first ten rows.
df["review"] = df["review"].apply(simple_tokenizer.tokenize)
df.head(n=10)

Unnamed: 0,rating,review
0,5,looks ok not like so durable will hv to use a ...
1,5,tried the current can be very powerful dependi...
2,5,item received after a week looks smaller than ...
3,5,thanks works as describe no complaints not rea...
4,5,fast delivery considering it’s from overseas a...
5,5,fast delivery good service
6,5,got my order and it came well packaged have ye...
7,5,items received in a nice box have not used it ...
8,5,received in good condition tried so far so goo...
9,1,item doesn’t work asked me to send a refund sh...


In [175]:
# Three independent copies of the cleaned frame, so the variants built from
# df_merge and df_trunc never modify df itself.
df_small, df_merge, df_trunc = df.copy(), df.copy(), df.copy()

In [176]:
# Remove the first (1345447 - 143994) rows rated 5 from df_trunc, in index order.
# NOTE(review): the two constants look like row counts from an earlier run — confirm.
df_trunc.drop(
    index=df_trunc.loc[df_trunc['rating'] == 5].index[:1345447 - 143994],
    inplace=True,
)

In [177]:
# Display the rating distribution after truncating the 5-rated rows.
count_values_in_column(data=df_trunc, feature='rating')

Unnamed: 0,Total,Percentage
4,133351,32.11
5,132524,31.91
3,77159,18.58
1,42050,10.13
2,30169,7.27


In [178]:
# Collapse ratings into three classes: below 4 -> 0, 4 -> 1, 5 -> 2.
# Any other value is left as it is.
df_merge['rating'] = np.select(
    [df_merge['rating'] < 4, df_merge['rating'] == 4, df_merge['rating'] == 5],
    [0, 1, 2],
    default=df_merge['rating'].to_numpy(),
)
# Remove the first (1345447 - 143994) rows of class 2, in index order.
df_merge.drop(
    index=df_merge.loc[df_merge['rating'] == 2].index[:1345447 - 143994],
    inplace=True,
)

In [179]:
# Display the class distribution of the merged-rating frame.
count_values_in_column(data=df_merge, feature='rating')

Unnamed: 0,Total,Percentage
0,149378,35.97
1,133351,32.11
2,132524,31.91


In [180]:
# Write the three dataset variants without the row index.
# Note that df_trunc (not df_small) is the frame saved as extra_small.csv.
df.to_csv(path_or_buf='../extra_data/extra_full.csv', index=False)
df_merge.to_csv(path_or_buf='../extra_data/extra_merge.csv', index=False)
df_trunc.to_csv(path_or_buf='../extra_data/extra_small.csv', index=False)