## Exercise 3 - Language Identification -Reloaded and Convoluted

In [20]:

#Importing the tools
import csv
import re
import numpy as np
import pandas as pd



Loading the data

In [21]:
url_train_dev = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

Splitting the data into tweet and labels

In [22]:

from io import StringIO
import requests

def load_dataset(url):
    r = requests.get(url)
    data = r.content.decode('utf8')
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['tweet', 'label']
    return df
    

In [23]:
df_train  = load_dataset(url_train_dev)
df_test = load_dataset(url_test)
print(df_train[0:10])


                                               tweet label
0  يا من أناديها ويخنقني البكاء  ويكاد صمت الدمع ...    ar
1  فيه فرق بين اهل غزة اللى مطحونين من ناحيتين وب...    ar
2  ﻋﻦ ﺍﻟﻠﺤﻈﺔ اﻟﺤﻠﻮﺓﺓ ﺍﻟﻠﻲ ﺑﺘﻐﻤﺾ ﻓﻴﻬﺎ ﻋﻴﻨﻴﻚ ﺑﺘﻔﻜﺮ ...    ar
3                                  يا ابو سلو عرفتني    ar
4  ب50 ريال أكفل معتمر في رمضان ، ولك بإذن الله م...    ar
5  توجيه كيفية تثبيت البرامج الثابتة ROM التحميل ...    ar
6  {وأنه هو أغنى وأقنى} [النجم:48] http://t.co/is...    ar
7  اللهم قدر لنا الفرح بكل اشكاله ، انت الكريم ال...    ar
8  #غزه_تحت_القصف  داعش أخواني حيل عندكم بالمدنيي...    ar
9  {يعلمون ظاهرا من الحياة الدنيا وهم عن الآخرة ه...    ar


Data Preprocessing

In [24]:
#Converting all the tweets to lower string
df_train["tweet"] = df_train["tweet"].str.lower()
filter_dt=df_train.groupby('label').count()['tweet']

#Removing all tweets with less than 15 samples to train from 
filter_dt=filter_dt.loc[filter_dt.values>15].index
df_train=df_train[df_train['label'].isin(filter_dt)]
df_train.reset_index(inplace=True)
filter_dt=df_train.groupby('label').count()['tweet']

#Removing all labels from test which are not there in training
df_test=df_test[df_test['label'].isin(df_train.label.unique())]
df_test.reset_index(inplace=True)

In [25]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52460 entries, 0 to 52459
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   52460 non-null  int64 
 1   tweet   52460 non-null  object
 2   label   52460 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [26]:
df_train.head()

Unnamed: 0,index,tweet,label
0,0,يا من أناديها ويخنقني البكاء ويكاد صمت الدمع ...,ar
1,1,فيه فرق بين اهل غزة اللى مطحونين من ناحيتين وب...,ar
2,2,ﻋﻦ ﺍﻟﻠﺤﻈﺔ اﻟﺤﻠﻮﺓﺓ ﺍﻟﻠﻲ ﺑﺘﻐﻤﺾ ﻓﻴﻬﺎ ﻋﻴﻨﻴﻚ ﺑﺘﻔﻜﺮ ...,ar
3,3,يا ابو سلو عرفتني,ar
4,4,ب50 ريال أكفل معتمر في رمضان ، ولك بإذن الله م...,ar


In [27]:
df_train.label.unique()

array(['ar', 'ca', 'de', 'el', 'en', 'es', 'fa', 'fr', 'he', 'hi', 'id',
       'it', 'ja', 'ko', 'ms', 'nl', 'pl', 'pt', 'ru', 'sr', 'sv', 'th',
       'tl', 'tr', 'uk', 'und', 'vi', 'zh-CN'], dtype=object)

Retrieving the tweets and labels from the training and the test set.

In [28]:
X_train = df_train.tweet
y_train = df_train.label
X_test = df_test.tweet
y_test = df_test.label

print('Training tweet shape: ', X_train.shape)
print('Training labels shape: ', y_train.shape)
print('Test tweet shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)

Training tweet shape:  (52460,)
Training labels shape:  (52460,)
Test tweet shape:  (13217,)
Test labels shape:  (13217,)


Data Cleaning - removing all the unnecessary data from the text

In [29]:
def preprocessor(text):
  
  text = re.sub(r"http\S+", "", text)       ##Removing URL
  text = re.sub('@[^\s]+','',text)          ##Removing @Username
  text = re.sub('[\W]+', ' ', text.lower()) ##Removing NonCharacter eg.emojis
  text = re.sub('<[^>]*>', '', text)        ##Removing entire HTML
  text = re.sub(r'[0-9]+', '', text)        ## Removing numbers
  
  return text

#Applying the text cleaning to the train data
X_train = X_train.apply(preprocessor)
X_train[0:30] #Checking the data 

0     يا من أناديها ويخنقني البكاء ويكاد صمت الدمع أ...
1     فيه فرق بين اهل غزة اللى مطحونين من ناحيتين وب...
2     ﻋﻦ ﺍﻟﻠﺤﻈﺔ اﻟﺤﻠﻮﺓﺓ ﺍﻟﻠﻲ ﺑﺘﻐﻤﺾ ﻓﻴﻬﺎ ﻋﻴﻨﻴﻚ ﺑﺘﻔﻜﺮ ...
3                                     يا ابو سلو عرفتني
4     ب ريال أكفل معتمر في رمضان ولك بإذن الله مثل أ...
5     توجيه كيفية تثبيت البرامج الثابتة rom التحميل ...
6                            وأنه هو أغنى وأقنى النجم  
7     اللهم قدر لنا الفرح بكل اشكاله انت الكريم الذي...
8      غزه_تحت_القصف داعش أخواني حيل عندكم بالمدنيين...
9      يعلمون ظاهرا من الحياة الدنيا وهم عن الآخرة ه...
10               افضل كتاب قرأته هو أمي ابراهام لنكولن 
11    ولأن ه م م لائ ك ة ص غار ن ع ش ق ات كاءة رؤوس ...
12        خ لاصة الح ب هي ت فكر بقلبهآ وهو ي فكر بعقله 
13    جميل آن يفهمك من تحبب ويخآف عليك و يغآر عليكك ...
14    حتى الندم على المعصيه تؤجر عليه سبحانك يالله م...
15     اها يا بيبي والله اتهرست علي تويتر و ع الفيس ...
16     لا يقاتلونكم جميعا إلا في قرى محصنة أو من ورا...
17          طبت منك نهائيا بس كلي رجاء ما ترجع م

Checking the shape and assigning new variables to train and test data to be used for further processing of data

In [30]:
print(X_train.shape)
print(y_train.shape)

X_train_n = X_train
y_train_n = y_train
X_test_n = X_test
y_test_n = y_test


(52460,)
(52460,)


Preparing the final data for the model


1.   Tokenizing text data
2.   Encoding labels
3.   Finding the maximum length of the data and vocab size  for the input shape of the model input layer
4.   Printing the details




In [31]:
from keras.preprocessing.text import Tokenizer                    
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout,MaxPooling1D, Conv1D, Flatten , Embedding
from keras.preprocessing import text, sequence
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras import utils

#Calulating the length of the doc to be fed into the input layer of the model
def maxilen(lines):
	return max([len(s.split()) for s in lines])

#Fitting the dataTokenizer
def Generatetokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

#Creating the tokenizer
tokenizer = Generatetokenizer(X_train_n[0:])

#Encoding Text
def text_encoder(tokenizer, lines, length):
	
	encoded = tokenizer.texts_to_sequences(lines)
	padded = pad_sequences(encoded, maxlen=length, padding='post')
	return padded

#Calculating the maximum document length for the input shape for the model input
length = maxilen(X_train_n[0:])

#Calculating the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

print('Maximum document length: %d' % length)
print('Vocabulary size of the data: %d' % vocab_size)

#Encoding the data
trainX = text_encoder(tokenizer, X_train_n[0:], length)
testX = text_encoder(tokenizer, X_test_n[0:], length)
print(trainX.shape, testX.shape)

encoder = LabelEncoder()
encoder.fit(y_train_n)
y_trainning_labeled = encoder.transform(y_train_n)
y_testing_labeled = encoder.transform(y_test_n)

num_classes = np.max(y_trainning_labeled) + 1
y_trainning_labeled = utils.to_categorical(y_trainning_labeled, num_classes)
y_testing_labeled = utils.to_categorical(y_testing_labeled, num_classes)


Maximum document length: 1224
Vocabulary size of the data: 102072
(52460, 1224) (13217, 1224)


Splitting the data to 90/10 - test/validation set

In [32]:
from sklearn.model_selection import train_test_split
X_train_n , X_val , y_train_n , y_val = train_test_split (trainX, y_trainning_labeled, test_size= 0.1,random_state=42)

In [36]:
X_train_n.shape

(47214, 1224)

Creating model using Keras and implementing different layers 


In [33]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=length))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(num_classes))
model.add(Activation('softmax'))


print(model.summary())



Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1224, 100)         10207200  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1222, 128)         38528     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 611, 128)          0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 611, 128)          0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 78208)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 28)                2189852   
_________________________________________________________________
activation_4 (Activation)    (None, 28)               

Fitting the model to split train data 

In [41]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
history = model.fit(X_train_n, y_train_n,
                    epochs=5,batch_size=32)  

# save the model
model.save('model.h5')

Epoch 1/5
Epoch 2/5

KeyboardInterrupt: ignored