In [1]:
# Essentials
import pandas as pd
import numpy as np

# Sk learn utilities
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Clasification algorithms
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

#lstm utilities
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

In [18]:
tweets_df = pd.read_csv('3.1-labelled-tweets.csv')
tweets_df.head()


Unnamed: 0,tweets,cleaned_tweets,label
0,जुम्राको उत्पति को कथा त कास्टिङ मात्र हो हेर्...,जु म् रा उत्पति कथा स्ट िङ हेर्दै स दुनिया ुव...,0
1,स्याङ्जाको पुतलीबजार नगरपालिका गण्डकी प्रदेशको...,स्याङ्जा पुतलीबजार नगरपालिका गण्डकी प्रदेश भि ...,1
2,फेक मिडिया गरेर दुनियाँमा अफबाह फैलाउनेलाई भाट...,फे क मिडिया दुनियाँ अफ बाह फैल ाउने भा टा हान ...,0
3,चाइनाने कोरोना फैलाउछ र किटमास्क बेच्छ अमेरिका...,चाइना ने रो ना फैल ाउछ ट स्क बे च्छ अमेरिका य...,0
4,जोर र बिजोर ले अर्थतन्त्र लाईन मा आउने भए तेत्...,जोर बि जोर अर्थतन्त्र लाईन आउने ते त्रो रो ना...,0


In [19]:
features = tweets_df['cleaned_tweets']
labels = tweets_df['label'].astype("int")

In [20]:
feature_train, feature_test, label_train, label_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [21]:
print("The no of Training data: ", feature_train.shape[0])
print("The no of Testing data:",feature_test.shape[0])

The no of Training data:  21431
The no of Testing data: 7144


In [6]:
svm_model = Pipeline([('vect', TfidfVectorizer()), ('model', SVC())])
lr_model = Pipeline([('vect', TfidfVectorizer()), ('model', LogisticRegression())])
nb_model = Pipeline([('vect', TfidfVectorizer()), ('model', BernoulliNB())])
xgb_model = Pipeline([('vect', TfidfVectorizer()), ('model', XGBClassifier(objective="binary:logistic", random_state=42))])
mlp_model = Pipeline([('vect', TfidfVectorizer()), ('model', MLPClassifier(random_state=42))])
kn_model = Pipeline([('vect', TfidfVectorizer()), ('model', KNeighborsClassifier())])
dt_model = Pipeline([('vect', TfidfVectorizer()), ('model', DecisionTreeClassifier())])
rf_model = Pipeline([('vect', TfidfVectorizer()), ('model', RandomForestClassifier())])


In [7]:
models = [svm_model, lr_model, nb_model, xgb_model, mlp_model, kn_model, dt_model, rf_model]
model_notations = {0:'# Support Vector Machine', 1:'# Linear Regression', 2:'# Nayve Bayes', 3:'XGB classifier', 4:'# MLP classifier', 5:'# K Neighbors', 6:'# Decision Tree', 7:'# Random Forest'}
nepali_text=[["हामिले कोरोना लागेको मान्छेलाई सहयोग गर्नुपर्छ"],["कोरोना ले कैयों को जागिर खोसिदियो"],["कोरोनाले ज्यान लियो"]]
for i, model in enumerate(models):
    print("================================================================================================================")
    print(f"{model_notations[i]} \n---------------------------------")
    model.fit(feature_train,label_train)
    prediction = model.predict(feature_test)
    print(f'Accuracy:  {accuracy_score(prediction, label_test)}')
    print("Classification report \n---------------------------------")
    print(classification_report(label_test, prediction))
    
    print("Manual Prediction \n---------------------------------")
    for text in nepali_text:
        if np.squeeze(model.predict(text))==1: 
            print("%s: "%(np.squeeze(text)),"Positive")
        else:
            print("%s: "%(np.squeeze(text)),"Negative")
    print("\n================================================================================================================")

# Support Vector Machine 
---------------------------------
Accuracy:  0.7842945128779395
Classification report 
---------------------------------
              precision    recall  f1-score   support

           0       0.79      0.74      0.76      3386
           1       0.78      0.83      0.80      3758

    accuracy                           0.78      7144
   macro avg       0.79      0.78      0.78      7144
weighted avg       0.78      0.78      0.78      7144

Manual Prediction 
---------------------------------
हामिले कोरोना लागेको मान्छेलाई सहयोग गर्नुपर्छ Positive
कोरोना ले कैयों को जागिर खोसिदियो Negative
कोरोनाले ज्यान लियो Negative

# Linear Regression 
---------------------------------
Accuracy:  0.7516797312430011
Classification report 
---------------------------------
              precision    recall  f1-score   support

           0       0.75      0.71      0.73      3386
           1       0.75      0.79      0.77      3758

    accuracy                          

In [22]:
#Variables for lstm
data_length = len(tweets_df)
max_input_length = 48
size_of_batch = 64
epos_count = 3

In [23]:
tokenizer = Tokenizer(data_length)
tokenizer.fit_on_texts(feature_train)
feature_train = tokenizer.texts_to_sequences(feature_train)
feature_train = pad_sequences(feature_train, maxlen=max_input_length, truncating='post', padding='post')
# feature_train[0], len(feature_train[0])
# data_length

In [24]:
feature_test = tokenizer.texts_to_sequences(feature_test)
feature_test = pad_sequences(feature_test, maxlen=max_input_length, truncating='post', padding='post')
# feature_test[0], len(feature_test[0])


In [25]:
print(feature_train.shape, label_train.shape)
print(feature_test.shape, label_test.shape)

(21431, 48) (21431,)
(7144, 48) (7144,)


In [26]:
le = LabelEncoder()
label_train = le.fit_transform(label_train)
label_test = le.transform(label_test)

In [27]:
lstm_model = Sequential()
# feature_train, feature_test = feature_train.toarray(), feature_test.toarray()
lstm_model.add(Embedding(data_length, 100, input_length = max_input_length, trainable=True))
lstm_model.add(LSTM(100, dropout=0.1, return_sequences=True))
lstm_model.add(LSTM(100, dropout=0.1))
lstm_model.add(Dense(1, activation="sigmoid"))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 48, 100)           2857500   
                                                                 
 lstm_2 (LSTM)               (None, 48, 100)           80400     
                                                                 
 lstm_3 (LSTM)               (None, 100)               80400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 3,018,401
Trainable params: 3,018,401
Non-trainable params: 0
_________________________________________________________________


In [29]:
history = lstm_model.fit(feature_train, label_train, validation_data = (feature_test, label_test), epochs=epos_count, batch_size=size_of_batch)
#jati epos dherai bhayo teti ramro hunchha like 10000 atleast

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [30]:
scores = lstm_model.evaluate(feature_test, label_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 81.47%


In [35]:
for text in nepali_text:
    text_tokened = tokenizer.texts_to_sequences(text)
    text_padded = pad_sequences(text_tokened, maxlen=max_input_length, truncating='post', padding='post')
    if np.squeeze(lstm_model.predict(text_padded))>0.5: 
        print("\n%s: "%(np.squeeze(text)),"Positive\n")
    else:
        print("\n%s: "%(np.squeeze(text)),"Negative\n")


हामिले कोरोना लागेको मान्छेलाई सहयोग गर्नुपर्छ:  Positive


कोरोना ले कैयों को जागिर खोसिदियो:  Negative


कोरोनाले ज्यान लियो:  Negative



In [32]:
lstm_prediction = lstm_model.predict(feature_test)
lstm_prediction = (lstm_prediction>0.5)

print('accuracy %s' % accuracy_score(prediction, label_test))
print(classification_report(label_test, prediction))
# print(prediction.value_count())

accuracy 0.7637178051511758
              precision    recall  f1-score   support

           0       0.79      0.68      0.73      3386
           1       0.75      0.84      0.79      3758

    accuracy                           0.76      7144
   macro avg       0.77      0.76      0.76      7144
weighted avg       0.77      0.76      0.76      7144

