In [1]:
import numpy as np
import timeit
import docx
import csv
import re
import string
from pattern.text.en import lemma
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
labels = []

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [3]:
def TextPreprocessing():
    # labels = []
    Data = []
    with open('olid-training-v1.tsv', 'r', encoding='utf-8') as text:
        reader = csv.reader(text, delimiter='\t')
        global labels
        labels = reader.__next__()
        for value in reader:
            Data.append(value)
    return Data

In [4]:
def Stop(Data):
    # removing stop words
    NoStopData = []
    stop = stopwords.words('english')
    for i in range(0, len(Data)):
        temp = []
        for word in Data[i][1].split(" "):
            if word not in stop:
                temp.append(word.lower())

        NoStopData.append(temp)
    return NoStopData

In [5]:
def Emojis(NoStopData):
    # removing emojis
    NoEmoStopData = []
    pattern = re.compile("["
                         u"\U0001F600-\U0001F64F"
                         u"\U0001F300-\U0001F5FF"
                         u"\U0001F680-\U0001F6FF"
                         u"\U0001F1E0-\U0001F1FF"
                         u"\U00002702-\U000027B0"
                         u"\U000024C2-\U0001F251"
                         u"\U0001f926-\U0001f937"
                         u"\U00010000-\U0010ffff"
                         u"\u2640-\u2642"
                         u"\u2600-\u2B55"
                         u"\u200d"
                         u"\u23cf"
                         u"\u23e9"
                         u"\u231a"
                         u"\ufe0f"
                         u"\u3030"
                         "]+", flags=re.UNICODE)

    for i in range(0, len(NoStopData)):
        temp = []
        for word in NoStopData[i]:
            temp.append(pattern.sub(r'', word))
        NoEmoStopData.append(temp)
    return NoEmoStopData

In [6]:
def ETC(NoEmoStopData):
    # removing punctuations, hashtags and @
    NoPuncEmoStopData = []
    prefix = ['@', '#']
    unnecessary = ['amp', 'url']
    for i in range(0, len(NoEmoStopData)):
        temp = []
        for word in NoEmoStopData[i]:
            for sep in string.punctuation:
                if sep not in prefix:
                    word = word.replace(sep, ' ')  # removing all punctuations other than @ or #

            if word != '':
                words = word.split()
                for w in words:
                    if w[0] not in prefix:  
                        if w not in unnecessary:  
                            temp.append(w.strip())

        NoPuncEmoStopData.append(temp)
    return NoPuncEmoStopData

In [7]:
def Digits(NoPuncEmoStopData):
    # removing digits
    NoDigitPuncEmoStopData = []
    for i in range(len(NoPuncEmoStopData)):
        temp = []
        for word in NoPuncEmoStopData[i]:
            if not word.isdigit():
                temp.append(word)
        NoDigitPuncEmoStopData.append(temp)
    return NoDigitPuncEmoStopData

In [14]:
def Lemmatization(Data, NoDigitPuncEmoStopData):
    # lemmatization
    FinalData = []
    i = 0

    for sentence in NoDigitPuncEmoStopData:
        newSentence = " ".join(sentence)
        lem = " "
        try:
            lem = " ".join([lemma(word) for word in newSentence.split()])
        except StopIteration:
            print("Error Happened")
        FinalData.append([lem, Data[i][2]])
        i += 1
    return FinalData

In [15]:
def TrainModel():
    global labels
    Data = TextPreprocessing()
    NoStopData = Stop(Data)
    NoEmoStopData = Emojis(NoStopData)
    NoPuncEmoStopData = ETC(NoEmoStopData)
    NoDigitPuncEmoStopData = Digits(NoPuncEmoStopData)
    data = Lemmatization(Data, NoDigitPuncEmoStopData)

    labels = labels[1:3]
    x = []
    y = []
    for i in range(0, len(data)):
        x.append(data[i][0])
        y.append(data[i][1])

    Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(x, y, test_size=0.25)

    Encoder = LabelEncoder()
    Train_Y = Encoder.fit_transform(Train_Y)
    Test_Y = Encoder.fit_transform(Test_Y)

    Tf_idfModel = TfidfVectorizer()
    Tf_idfModel.fit(x)
    Tfidf_Train_X = Tf_idfModel.transform(Train_X)
    Tfidf_Test_X = Tf_idfModel.transform(Test_X)
    return Tfidf_Train_X, Train_Y, Tfidf_Test_X, Test_Y, Tf_idfModel

In [19]:
def Naive(Tfidf_Train_X, Train_Y, Tfidf_Test_X, Test_Y):
    start = timeit.default_timer()
    NaiveModel = naive_bayes.MultinomialNB()
    NaiveModel.fit(Tfidf_Train_X, Train_Y)
    NaiveScore = NaiveModel.predict(Tfidf_Test_X)

    stop = timeit.default_timer()
    execution_time = stop - start

    print("Accuracy Score :", accuracy_score(NaiveScore, Test_Y)*100)
    print("Time required :", execution_time, "seconds")

In [22]:
TfidfTrainX, TrainY, TfidfTestX, TestY, Tf_idf = TrainModel()

In [23]:
Naive(TfidfTrainX, TrainY, TfidfTestX, TestY)

Accuracy Score : 71.54078549848943
Time required : 0.008760700000010502 seconds
