In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras import losses

from collections import Counter

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pydot

In [2]:
# Read 'DIGI call.csv' into a DataFrame; the trailing expression displays
# its (rows, columns) shape.
df_digi_c = pd.read_csv('DIGI call.csv')

df_digi_c.shape

(240, 36)

In [3]:
# Read 'CELCOM call.csv' into a DataFrame; the trailing expression displays
# its (rows, columns) shape.
df_celcom_c = pd.read_csv('CELCOM call.csv')

df_celcom_c.shape

(527, 36)

In [4]:
# Read 'MAXIS call.csv' into a DataFrame; the trailing expression displays
# its (rows, columns) shape.
df_maxis_c = pd.read_csv('MAXIS call.csv')

df_maxis_c.shape

(357, 36)

In [5]:
# Read 'UMOBILE call.csv' into a DataFrame; the trailing expression displays
# its (rows, columns) shape.
df_umobile_c = pd.read_csv('UMOBILE call.csv')

df_umobile_c.shape

(235, 36)

In [6]:
# Read 'UNIFI call.csv' into a DataFrame; the trailing expression displays
# its (rows, columns) shape.
df_unifi_c = pd.read_csv('UNIFI call.csv')

df_unifi_c.shape

(338, 36)

In [7]:
# Narrow each source frame from its 36 columns to the three used from here on:
# the tweet id, the tweet text and the date.
data_digi_c = df_digi_c.loc[:, ['id', 'tweet', 'date']]
data_celcom_c = df_celcom_c.loc[:, ['id', 'tweet', 'date']]
data_maxis_c = df_maxis_c.loc[:, ['id', 'tweet', 'date']]
data_umobile_c = df_umobile_c.loc[:, ['id', 'tweet', 'date']]
data_unifi_c = df_unifi_c.loc[:, ['id', 'tweet', 'date']]

In [8]:
# Add a constant Label column ('Digi Call') to every row, then display the
# resulting (rows, columns) shape.
data_digi_c = data_digi_c.assign(Label = 'Digi Call')
data_digi_c.shape

(240, 4)

In [9]:
# Add a constant Label column ('Celcom Call') to every row, then display the
# resulting (rows, columns) shape.
data_celcom_c = data_celcom_c.assign(Label = 'Celcom Call')
data_celcom_c.shape

(527, 4)

In [10]:
# Add a constant Label column ('Maxis Call') to every row, then display the
# resulting (rows, columns) shape.
data_maxis_c = data_maxis_c.assign(Label = 'Maxis Call')
data_maxis_c.shape

(357, 4)

In [11]:
# Add a constant Label column ('Umobile Call') to every row, then display the
# resulting (rows, columns) shape.
data_umobile_c = data_umobile_c.assign(Label = 'Umobile Call')
data_umobile_c.shape

(235, 4)

In [12]:
# Add a constant Label column ('Unifi Call') to every row, then display the
# resulting (rows, columns) shape.
data_unifi_c = data_unifi_c.assign(Label = 'Unifi Call')
data_unifi_c.shape

(338, 4)

In [13]:
# Stack the five labelled frames row-wise into a single table.
# ignore_index=True discards the per-file row numbers and renumbers 0..n-1.
frames = [
    data_digi_c,
    data_celcom_c,
    data_maxis_c,
    data_umobile_c,
    data_unifi_c,
]

data = pd.concat(objs=frames, axis=0, ignore_index=True)
data.shape

(1697, 4)

In [14]:
# Number of rows per tweet id; any count above 1 means the same id occurs
# more than once in the combined frame.
data['id'].value_counts()

1273982497534013442    6
1306120353979281409    6
1257921197196509189    5
1261644944403386370    5
1268945290301542401    5
                      ..
1261173911078596609    1
1261174115018264577    1
1262278681830948865    1
1262278983724326913    1
1216798969478569984    1
Name: id, Length: 1513, dtype: int64

In [15]:
# Pull out every row carrying one of the most-repeated ids (6 occurrences in
# the counts above) so the duplication can be inspected.
test_data = data.loc[data['id'].eq(1273982497534013442)]

In [16]:
# Display the rows selected for the repeated id.
test_data

Unnamed: 0,id,tweet,date,Label
26,1273982497534013442,@BefriendersKL @digitelco @myTMgroup @umobile ...,2020-06-19,Digi Call
179,1273982497534013442,@BefriendersKL @digitelco @myTMgroup @umobile ...,2020-06-19,Digi Call
601,1273982497534013442,@BefriendersKL @digitelco @myTMgroup @umobile ...,2020-06-19,Celcom Call
1026,1273982497534013442,@BefriendersKL @digitelco @myTMgroup @umobile ...,2020-06-19,Maxis Call
1247,1273982497534013442,@BefriendersKL @digitelco @myTMgroup @umobile ...,2020-06-19,Umobile Call
1591,1273982497534013442,@BefriendersKL @digitelco @myTMgroup @umobile ...,2020-06-19,Unifi Call


In [17]:
# Drop every row whose tweet id occurs more than once. keep=False removes ALL
# copies (not just the extras), so an id that appears under several labels
# is excluded from the data entirely.
# NOTE(review): this also discards ids repeated under a single label (rows 26
# and 179 in the preview above are both 'Digi Call') -- confirm that losing
# those rows is intended rather than keeping one copy.
# Reassigning instead of inplace=True leaves the previously displayed frame
# untouched and keeps the cell safe to re-run.
data = data.drop_duplicates(subset="id", keep=False)

In [18]:
# Re-check the per-id counts after de-duplication; every id should now
# appear exactly once.
data['id'].value_counts()

1269546883262775299    1
1386311163152007172    1
1348601230638006275    1
1355429459256434691    1
1358273041508167680    1
                      ..
1262640656469520389    1
1262640764061806593    1
1262640939840897024    1
1262641161484636160    1
1216798969478569984    1
Name: id, Length: 1362, dtype: int64

In [19]:
# (rows, columns) remaining after the duplicate ids were removed.
data.shape

(1362, 4)

In [21]:
# Rows per Label after de-duplication, to see how balanced the classes are.
data['Label'].value_counts()

Celcom Call     462
Unifi Call      303
Maxis Call      299
Umobile Call    193
Digi Call       105
Name: Label, dtype: int64

In [23]:
# Write the combined, de-duplicated frame to disk with a header row.
# The DataFrame index is written too (to_csv's default), as an unnamed first
# column.
# NOTE(review): pass index=False if downstream readers should not see that
# extra column -- confirm against whatever consumes this file.
data.to_csv('COMBINE DATA CALL.csv', header=True)

Preprocess Data

In [None]:
# Compiled once at definition time instead of on every call.
# Each range is an explicit symbol/pictograph block. The previous single span
# U+24C2..U+1F251 also covered CJK ideographs, Hangul, kana and many other
# scripts, so ordinary non-emoji text was being deleted along with the emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators; already inside the range above)
    "\u24C2"                 # circled M
    "\u25A0-\u25FF"          # geometric shapes
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2702-\u27B0"          # dingbats
    "\u2934\u2935"           # curved arrows
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    "\u3030\u303D\u3297\u3299"  # emoji that live in the CJK symbol blocks
    "\uFE00-\uFE0F"          # variation selectors left behind by stripped emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only characters in the explicit symbol blocks listed in
    ``_EMOJI_PATTERN`` are stripped; letters of any script (Latin, CJK,
    Hangul, Arabic, ...) are left untouched.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every run of matching characters deleted. Nothing is
        inserted in their place, so surrounding whitespace is preserved as-is.
    """
    return _EMOJI_PATTERN.sub(r'', text)

def remove_url(text):
    """Return ``text`` with every http:// or https:// URL removed.

    A URL is matched from its scheme up to the first character outside the
    allowed set: ASCII letters, digits, the range ``$``..``_`` (which includes
    ``/ : ? = -``), ``@ . & +``, ``! * ( ) ,`` and ``%XX`` escapes. Characters
    such as ``#`` and ``~`` are outside that set, so anything from them
    onwards is left in the text.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with each matched URL deleted; nothing is inserted in its
        place.
    """
    # Raw string: the previous plain literal contained the invalid escapes
    # ``\(`` and ``\)``, which trigger a SyntaxWarning on newer Python versions.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub(r'', text)



def clean_text(text):
    """Normalise a tweet for tokenisation.

    Steps, in order:

    1. Delete every character in ``string.punctuation``. Nothing replaces
       them, so ``"don't"`` becomes ``"dont"``.
    2. Split on whitespace.
    3. Drop tokens that are all digits or have two or fewer characters.
    4. Re-join with single spaces and lower-case the result.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text; an empty string if no token survives.
    """
    # Three-argument maketrans builds a pure "delete these characters" table.
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    kept_words = [
        word
        for word in without_punctuation.split()
        if len(word) > 2 and not word.isdigit()
    ]
    return ' '.join(kept_words).lower()

In [None]:
# Load the combined dataset and drop rows with a missing value in any column.
# NOTE(review): this path differs from 'COMBINE DATA CALL.csv' written earlier
# in the notebook -- confirm which file is meant to be loaded here.
dataset = pd.read_csv('dataset/combine_data.csv').dropna(axis=0, how='any')

# Whitespace-delimited token count of each raw (not yet cleaned) tweet.
dataset['Num_words_text'] = dataset['tweet'].apply(lambda x: len(str(x).split()))

# Keep only tweets with more than two tokens. The explicit .copy() makes the
# filtered frame independent of its parent, so the later column assignments
# act on a real frame rather than a possible view.
mask = dataset['Num_words_text'] > 2
dataset = dataset[mask].copy()

print('-------Train data--------')
print(dataset['Label'].value_counts())
print(len(dataset))
print('-------------------------')

# Longest tweet, in tokens, measured on the raw text.
# NOTE(review): this is computed before the cleaning step, so it can exceed
# the longest cleaned tweet -- confirm that is the length wanted downstream.
max_dataset_sentence_length = dataset['Num_words_text'].max()

In [None]:
# Preview the first few tweets before any cleaning is applied.
dataset['tweet'].head()

In [None]:
# Clean every tweet in a fixed order: strip emoji, then URLs, then
# punctuation / digit-only tokens / short tokens (with lower-casing).
# URLs must go before clean_text, which would otherwise delete the "://"
# and "." that the URL pattern relies on.
dataset['tweet'] = (
    dataset['tweet']
    .apply(remove_emoji)
    .apply(remove_url)
    .apply(clean_text)
)

In [None]:
# Preview the same first few tweets after cleaning, for comparison.
dataset['tweet'].head()

In [None]:
# Report the longest tweet length (in whitespace-delimited tokens) computed earlier.
print(f'Dataset Max Sentence Length :{max_dataset_sentence_length}')