# Exercise 01 - Part 02

## Importing all the libraries

In [None]:
import csv
import re
from io import StringIO
import requests
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

#### Downloading and loading the data files

In [None]:
# Published Google Sheets exports (TSV output) of the training/dev set and the test set.
url_train_dev = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

In [None]:
def load_dataset(url, timeout=None):
    """Download a tab-separated file and return it as a two-column DataFrame.

    Args:
        url: Address of a TSV resource. The first line is consumed as the
            header row and the data must have exactly two columns.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            The default ``None`` keeps the original behaviour of waiting
            without a limit.

    Returns:
        A ``pandas.DataFrame`` whose columns are renamed to ``tweet`` and
        ``label``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as TSV.
    r.raise_for_status()
    data = r.content.decode('utf8')
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['tweet', 'label']
    return df

In [None]:
# Fetch both corpora; load_dataset names the two columns 'tweet' and 'label'.
df_train_dev = load_dataset(url_train_dev)
df_test = load_dataset(url_test)
#X_test = df_test['tweet']  


In [None]:
# Let pandas show up to 100 rows before truncating displayed output.
pd.options.display.max_rows = 100

### Inspecting Corpus

In [None]:
# Summarise row count, column dtypes and non-null counts of the training/dev frame.
df_train_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52675 entries, 0 to 52674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   52675 non-null  object
 1   label   52675 non-null  object
dtypes: object(2)
memory usage: 823.2+ KB


In [None]:
# Preview the first five rows.
df_train_dev.head()

Unnamed: 0,tweet,label
0,يا من أناديها ويخنقني البكاء ويكاد صمت الدمع ...,ar
1,فيه فرق بين اهل غزة اللى مطحونين من ناحيتين وب...,ar
2,ﻋﻦ ﺍﻟﻠﺤﻈﺔ اﻟﺤﻠﻮﺓﺓ ﺍﻟﻠﻲ ﺑﺘﻐﻤﺾ ﻓﻴﻬﺎ ﻋﻴﻨﻴﻚ ﺑﺘﻔﻜﺮ ...,ar
3,يا ابو سلو عرفتني,ar
4,ب50 ريال أكفل معتمر في رمضان ، ولك بإذن الله م...,ar


In [None]:
# List the distinct labels present in the training/dev data.
df_train_dev.label.unique()

array(['ar', 'ar_LATN', 'az', 'bg', 'bn', 'bs', 'ca', 'cs', 'cy', 'da',
       'de', 'dv', 'el', 'en', 'es', 'et', 'fa', 'fi', 'fr', 'gl', 'ha',
       'he', 'hi', 'hi-Latn', 'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it',
       'ja', 'ja_LATN', 'jv', 'km', 'ko', 'ko_LATN', 'ms', 'ne', 'nl',
       'no', 'pl', 'ps', 'ps_LATN', 'pt', 'ro', 'ru', 'si', 'sl', 'sq',
       'sr', 'su', 'sv', 'sw', 'ta', 'ta_LATN', 'th', 'tl', 'tn', 'tr',
       'uk', 'und', 'ur', 'ur_LATN', 'vi', 'wo', 'xh', 'zh-CN', 'zh-TW'],
      dtype=object)

In [None]:
# Count tweets per label, most frequent label first.
df_train_dev.label.value_counts()

en         18508
ja         10421
es          5930
und         4537
id          3006
pt          2878
ar          2199
ru           978
fr           946
tr           669
th           462
ko           458
it           339
tl           320
nl           182
de           171
ms           119
pl            93
sv            54
el            28
he            27
zh-CN         25
sr            22
ca            22
fa            18
vi            16
uk            16
hi            16
hu            15
hi-Latn       15
fi            15
ur_LATN       12
ro            12
ar_LATN       12
no            11
su            10
zh-TW         10
jv            10
sq             9
ta             9
bn             8
da             7
ur             7
sw             6
hr             5
ne             5
cs             4
bs             4
gl             3
et             2
ht             2
bg             2
hy             2
km             2
sl             2
ta_LATN        1
si             1
ps_LATN        1
is            

# Pre-processing the training and test data

## Removing tweets of languages whose frequency is less than 5

### Since there is very little data for these languages, it doesn't help much with training and also degrades the performance of the model.

In [None]:
# Check whether any cell in the training/dev frame is missing.
df_train_dev.isnull().values.any()

False

In [None]:
# NOTE(review): this rare-label filter is disabled (every line is commented out),
# so no tweets are removed here. As written it would also recompute
# value_counts once per row of the label column, although the result does not
# depend on the loop variable.
# threshold = 5 # Anything that occurs less than this will be removed.
# for col in df_train_dev.label:
#     value_counts = df_train_dev['label'].value_counts() # Specific column 
#     to_remove = value_counts[value_counts < threshold].index
#     df_train_dev['label'].replace(to_remove, np.nan, inplace=True)

## Removing emojis, links, hashtags and mentions from the training and testing datasets

In [None]:
# Per-label tweet counts as a two-column frame: 'index' holds the label and
# 'label' holds its count. The column names are set explicitly because a bare
# reset_index() yields ('index', 'label') on pandas 1.x but ('label', 'count')
# on pandas >= 2.0, which would break the numeric comparison on
# value_counts['label'] further down.
value_counts = (
    df_train_dev['label']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='label')
)

In [None]:
# Show the last five rows, i.e. the least frequent labels.
value_counts.tail()

Unnamed: 0,index,label
64,ps,1
65,dv,1
66,az,1
67,ja_LATN,1
68,wo,1


In [None]:
# Confirm the count column holds integers, so it can be compared with a numeric threshold.
type(value_counts['label'][0])

numpy.int64

In [None]:
# Keep only the labels that occur strictly more than 5 times.
# NOTE(review): the section heading speaks of "less than 5", which would also
# keep labels with exactly 5 tweets; confirm which threshold is intended.
value_counts_new = value_counts.loc[value_counts['label'].gt(5)]

In [None]:
# Compare the number of labels before and after applying the frequency threshold.
value_counts.shape, value_counts_new.shape

((69, 2), (44, 2))

In [None]:
# One character class built from four Unicode ranges; a run of one or more
# consecutive characters from these ranges is matched as a single hit.
emoji_pattern = re.compile(
    "[{}]+".format("".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # miscellaneous symbols and pictographs
        "\U0001F680-\U0001F6FF",  # transport and map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicator symbols (flags)
    ))),
    flags=re.UNICODE,
)

def _strip_tweet_noise(text):
    """Return *text* without link, hashtag and mention tokens and without emoji.

    The text is split on whitespace; every token containing ``http``, ``#``
    or ``@`` is dropped, the remaining tokens are re-joined with single
    spaces, and characters matched by ``emoji_pattern`` are removed.
    """
    kept = [word for word in text.split()
            if 'http' not in word and '#' not in word and '@' not in word]
    return emoji_pattern.sub(r'', " ".join(kept))


# NOTE(review): constant column kept from the original cell; it is not read
# anywhere in the visible notebook and looks like a debugging leftover.
df_train_dev['testFeature'] = 12

# Assign the cleaned column in one step. The previous row-by-row
# ``df['tweet'][i] = ...`` form is chained assignment: pandas warns about it
# and, with copy-on-write enabled, it leaves the frame unchanged.
df_train_dev['tweet'] = df_train_dev['tweet'].apply(_strip_tweet_noise)
df_test['tweet'] = df_test['tweet'].apply(_strip_tweet_noise)

In [None]:
# Re-check for missing values after the token and emoji cleanup.
df_train_dev.isnull().values.any()

False

## Removing other special characters and numeric values

In [None]:
# Characters stripped from every tweet: a fixed set of punctuation, brackets,
# digits and the literal '+' (inside a character class '+' is not a quantifier).
# Raw string: the same pattern as before without invalid string escapes.
_STRIP_CHARS = re.compile(r'[!@#$:).;,?&(\{\}\[\]\d+]')
# Runs of two or more spaces. Replacing the literal two-space string once only
# halves longer runs (three spaces became two), so the whole run is matched.
_SPACE_RUN = re.compile(r' {2,}')


def _normalise_tweet(text):
    """Lower-case *text*, drop the ``_STRIP_CHARS`` characters and squeeze spaces."""
    return _SPACE_RUN.sub(' ', _STRIP_CHARS.sub('', text.lower()))


# The same normalisation is applied to both corpora.
df_train_dev['tweet'] = df_train_dev['tweet'].apply(_normalise_tweet)
df_test['tweet'] = df_test['tweet'].apply(_normalise_tweet)

In [None]:
# Print the first 50 cleaned tweets for a visual sanity check.
for tweet in df_train_dev['tweet'].head(50):
    print(tweet)

يا من أناديها ويخنقني البكاء ويكاد صمت الدمع أن يتكلما يا قلبي الدامي وآه وأين ومن فاضت على عواطفاً وترحما
فيه فرق بين اهل غزة اللى مطحونين من ناحيتين وبين حماس ؟ هنفهم وﻻ نبدا من ا ب ت
ﻋﻦ ﺍﻟﻠﺤﻈﺔ اﻟﺤﻠﻮﺓﺓ ﺍﻟﻠﻲ ﺑﺘﻐﻤﺾ ﻓﻴﻬﺎ ﻋﻴﻨﻴﻚ ﺑﺘﻔﻜﺮ ف ﺣﺎﺟﺎﺕ ﺣﻠﻮﺓ ﺑﺘﺘﻤﻨﺎﻫﺎ ﻭﺗﻔﺘﺢ ﻋﻴﻨﻴﻚ ﺑﻀﺤﻜﺔ ﺟﻤﻴﻠﺔ ﻣﻊ ﻛﻠﻤﺔ
يا ابو سلو عرفتني
ب ريال أكفل معتمر في رمضان ، ولك بإذن الله مثل أجر عمرته وتفطيره وصلواته
توجيه كيفية تثبيت البرامج الثابتة rom التحميل لسامسونج
وأنه هو أغنى وأقنى النجم
اللهم قدر لنا الفرح بكل اشكاله ، انت الكريم الذي لا حدود لعطائه "
داعش أخواني حيل عندكم بالمدنيين نحر وجز رؤوس 
يعلمون ظاهرا من الحياة الدنيا وهم عن الآخرة هم غافلون الروم
• افضل كتاب قرأته هو أمي ابراهام لنكولن 
ولأنّهُم مَلائِكَةٌ صِغارنَعْشَقُ اتِكاءة رؤوسِهِم على أكتافِنا ورائحَة أطرافِهِم ومُدُنُ الطمأنينة بـوجوههم
خُلاصة الحُب هي تُفكر بقلبهآ وهو يُفكر بعقلهِ 
جميل آن يفهمك منَ تحبب ويخآفَ عليك و يغآر عليككَ بشدهه و ان لآ يتركك لك وقت آن تحآدث شخص غيرهه فيعيش كل دقيقه معكك
حتى الندم على المعصيه تؤجر عليه - سبحانك يالله ما أرحمك 
اه

## Splitting the dataset into training & validation datasets

In [None]:
# Features are the cleaned tweet texts; targets are the labels.
X, y = df_train_dev['tweet'], df_train_dev['label']

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_validate, y_train, y_validate = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [None]:
# Test-set texts and their true labels, kept apart from the train/validation split.
X_Test = df_test['tweet']
Y_Test = df_test['label']

In [None]:
# Sizes of the training and validation splits (the trailing comma makes this a tuple).
X_train.shape,X_validate.shape,y_train.shape,y_validate.shape,

((47407,), (5268,), (47407,), (5268,))

In [None]:
# Verify the splits contain no missing values.
# NOTE(review): y_validate is not included in this check.
X_train.isnull().values.any(),X_validate.isnull().values.any(),y_train.isnull().values.any(),

(False, False, False)

### Creating Pipeline for MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# TF-IDF features feeding a multi-layer perceptron with a single hidden layer
# of 600 units; early_stopping=True is passed through to MLPClassifier.
text_clf_MLP = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # (600,) is a real one-element tuple; the previous (600) was just the int 600.
    ('clf', MLPClassifier(hidden_layer_sizes=(600,), early_stopping=True)),
])

# Fit on the training split only. Fitting on the full X, y (as before) also
# trains on the validation rows, so the validation accuracy computed afterwards
# would be measured on data the model has already seen.
text_clf_MLP.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=True, epsilon=1e-08,
                               hidden_layer_sizes=600, learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
      

### Evaluating the accuracy on the validation data

In [None]:
# Predict labels for the validation tweets.
pred_Validation_MLP = text_clf_MLP.predict(X_validate)

In [None]:
# Fraction of validation tweets whose predicted label equals the true label.
accuracy_score(y_validate,pred_Validation_MLP)

0.9265375854214123

### Evaluating the accuracy on the test data

In [None]:
# Predict labels for the test tweets.
pred_Test_MLP = text_clf_MLP.predict(X_Test)

In [None]:
# Fraction of test tweets whose predicted label equals the true label
# (df_test['label'] is the same column that Y_Test refers to).
accuracy_score(df_test['label'],pred_Test_MLP)

0.8496121695910837

In [None]:
# Collect the test predictions in a one-column frame.
pred_Value = pd.DataFrame(pred_Test_MLP, columns = ['predictedValue'])

In [None]:
# Add the true labels next to the predictions; the assignment aligns on the row index.
pred_Value['actualValue'] = df_test['label']

In [None]:
# Preview the first ten predicted/actual label pairs.
pred_Value.head(10)

Unnamed: 0,predictedValue,actualValue
0,en,en
1,ja,und
2,en,en
3,es,es
4,ja,ja
5,en,en
6,en,en
7,en,en
8,ja,ja
9,id,id


In [None]:
import sys
import numpy
# Lift numpy's print truncation threshold so large arrays are printed in full.
numpy.set_printoptions(threshold=sys.maxsize)
# Confusion matrix: rows are the true labels, columns the predicted labels and
# each cell the number of test tweets with that combination (0 when absent).
# crosstab counts the label pairs directly; pivot_table with aggfunc=len has
# no value column left to aggregate in this two-column frame, which makes its
# result depend on the pandas version.
pd.crosstab(pred_Value['actualValue'], pred_Value['predictedValue'])

predictedValue,ar,de,en,es,fr,id,it,ja,ko,nl,pl,pt,ru,sv,th,tl,tr,und
actualValue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ar,472,0,0,0,0,0,0,53,0,0,0,0,0,0,0,0,0,4
ar_LATN,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0
az,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
bg,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
bs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
ca,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1
cs,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
da,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
de,0,39,1,1,0,0,0,4,0,0,0,0,0,0,0,0,0,5
el,0,0,1,1,0,0,0,4,0,0,0,0,0,0,0,0,0,5
