In [1]:
import pandas as pd
from inltk.inltk import tokenize

In [2]:
tweets_df = pd.read_csv('3.1-labelled-tweets.csv')
tweets_df.head()

Unnamed: 0,tweets,cleaned_tweets,label
0,जुम्राको उत्पति को कथा त कास्टिङ मात्र हो हेर्...,जु म् रा उत्पति कथा स्ट िङ हेर्दै स दुनिया ुव...,0
1,स्याङ्जाको पुतलीबजार नगरपालिका गण्डकी प्रदेशको...,स्याङ्जा पुतलीबजार नगरपालिका गण्डकी प्रदेश भि ...,1
2,फेक मिडिया गरेर दुनियाँमा अफबाह फैलाउनेलाई भाट...,फे क मिडिया दुनियाँ अफ बाह फैल ाउने भा टा हान ...,0
3,चाइनाने कोरोना फैलाउछ र किटमास्क बेच्छ अमेरिका...,चाइना ने रो ना फैल ाउछ ट स्क बे च्छ अमेरिका य...,0
4,जोर र बिजोर ले अर्थतन्त्र लाईन मा आउने भए तेत्...,जोर बि जोर अर्थतन्त्र लाईन आउने ते त्रो रो ना...,0


In [3]:
features = tweets_df['cleaned_tweets']
labels = tweets_df['label'].astype("int")

In [4]:
from sklearn.model_selection import train_test_split
feature_train, feature_test, label_train, label_test = train_test_split(features, labels, test_size=0.25, random_state=42)

In [5]:
print("The no of Training data: ", feature_train.shape[0])
print("The no of Testing data:",feature_test.shape[0])

The no of Training data:  21431
The no of Testing data: 7144


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
svm_model = Pipeline([('vect', TfidfVectorizer()), ('model', SVC())])

In [7]:
svm_model.fit(feature_train,label_train)
svm_prediction = svm_model.predict(feature_test)

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print('accuracy %s' % accuracy_score(svm_prediction, label_test))
print(classification_report(label_test, svm_prediction))

accuracy 0.7842945128779395
              precision    recall  f1-score   support

           0       0.79      0.74      0.76      3386
           1       0.78      0.83      0.80      3758

    accuracy                           0.78      7144
   macro avg       0.79      0.78      0.78      7144
weighted avg       0.78      0.78      0.78      7144



In [9]:
import numpy as np
nepali_text=[["हामिले कोरोना लागेको मान्छेलाई सहयोग गर्नुपर्छ"],["कोरोना ले कैयों को जागिर खोसिदियो"],["कोरोनाले ज्यान लियो"]]
for text in nepali_text:
    if np.squeeze(svm_model.predict(text))==1: 
        print("%s"%(np.squeeze(text)),"Positive")
    else:
        print("%s"%(np.squeeze(text)),"Negative")

हामिले कोरोना लागेको मान्छेलाई सहयोग गर्नुपर्छ Positive
कोरोना ले कैयों को जागिर खोसिदियो Negative
कोरोनाले ज्यान लियो Negative


In [10]:
from sklearn.naive_bayes import BernoulliNB
nb_model = Pipeline([('vect', TfidfVectorizer()), ('model', BernoulliNB())])
nb_model.fit(feature_train,label_train)
nb_prediction = nb_model.predict(feature_test)
print('accuracy %s' % accuracy_score(nb_prediction, label_test))
print(classification_report(label_test, nb_prediction))

accuracy 0.7113661814109743
              precision    recall  f1-score   support

           0       0.68      0.73      0.70      3386
           1       0.74      0.70      0.72      3758

    accuracy                           0.71      7144
   macro avg       0.71      0.71      0.71      7144
weighted avg       0.71      0.71      0.71      7144



In [11]:
from sklearn.linear_model import LogisticRegression
lr_model = Pipeline([('vect', TfidfVectorizer()), ('model', LogisticRegression())])
lr_model.fit(feature_train,label_train)
lr_prediction = lr_model.predict(feature_test)
print('accuracy %s' % accuracy_score(lr_prediction, label_test))
print(classification_report(label_test, lr_prediction))

accuracy 0.7516797312430011
              precision    recall  f1-score   support

           0       0.75      0.71      0.73      3386
           1       0.75      0.79      0.77      3758

    accuracy                           0.75      7144
   macro avg       0.75      0.75      0.75      7144
weighted avg       0.75      0.75      0.75      7144



In [12]:
from xgboost import XGBClassifier
xgb_model = Pipeline([('vect', TfidfVectorizer()), ('model', XGBClassifier(objective="binary:logistic", random_state=42))])
xgb_model.fit(feature_train,label_train)
xgb_prediction = xgb_model.predict(feature_test)
print('accuracy %s' % accuracy_score(xgb_prediction, label_test))
print(classification_report(label_test, xgb_prediction))

accuracy 0.7472004479283315
              precision    recall  f1-score   support

           0       0.77      0.67      0.72      3386
           1       0.73      0.82      0.77      3758

    accuracy                           0.75      7144
   macro avg       0.75      0.74      0.74      7144
weighted avg       0.75      0.75      0.75      7144



In [13]:
from sklearn.neural_network import MLPClassifier
mlp_model = Pipeline([('vect', TfidfVectorizer()), ('model', MLPClassifier())])
mlp_model.fit(feature_train,label_train)
mlp_prediction = mlp_model.predict(feature_test)
print('accuracy %s' % accuracy_score(mlp_prediction, label_test))
print(classification_report(label_test, mlp_prediction))

accuracy 0.7667973124300111
              precision    recall  f1-score   support

           0       0.76      0.75      0.75      3386
           1       0.78      0.78      0.78      3758

    accuracy                           0.77      7144
   macro avg       0.77      0.77      0.77      7144
weighted avg       0.77      0.77      0.77      7144



In [14]:
#lstm
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [23]:
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
def tfidf_features(feature_train, feature_test):
    feature_train_tfidf = tfidf_vectorizer.fit_transform(feature_train).toarray()
    feature_test_tfidf = tfidf_vectorizer.transform(feature_test).toarray()
    return feature_train_tfidf,feature_test_tfidf,tfidf_vectorizer.vocabulary_
feature_train_tfidf,feature_test_tfidf,vocabulary=tfidf_features(feature_train,feature_test)
model = Sequential()
model.add(Embedding(30000, 32))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(feature_train_tfidf, label_train, epochs=20, batch_size=64)
scores = model.evaluate(feature_test_tfidf, label_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 100)         3000000   
                                                                 
 conv1d_4 (Conv1D)           (None, None, 32)          9632      
                                                                 
 max_pooling1d_4 (MaxPooling  (None, None, 32)         0         
 1D)                                                             
                                                                 
 lstm_4 (LSTM)               (None, 128)               82432     
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                      

ValueError: in user code:

    File "C:\Users\PREDATOR\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\PREDATOR\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\PREDATOR\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\PREDATOR\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\PREDATOR\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\PREDATOR\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\backend.py", line 6106, in pool2d
        x = tf.compat.v1.nn.max_pool(

    ValueError: Exception encountered when calling layer "max_pooling1d_4" (type MaxPooling1D).
    
    Negative dimension size caused by subtracting 2 from 1 for '{{node sequential_4/max_pooling1d_4/MaxPool}} = MaxPool[T=DT_FLOAT, data_format="NHWC", explicit_paddings=[], ksize=[1, 2, 1, 1], padding="VALID", strides=[1, 2, 1, 1]](sequential_4/max_pooling1d_4/ExpandDims)' with input shapes: [?,1,1,32].
    
    Call arguments received by layer "max_pooling1d_4" (type MaxPooling1D):
      • inputs=tf.Tensor(shape=(None, 1, 32), dtype=float32)


In [20]:
prediction = model.predict(feature_test_tfidf)



In [21]:
# print('accuracy %s' % accuracy_score(prediction, label_test))
# print(classification_report(label_test, prediction))
# scores
prediction

array([[0.524761],
       [0.524761],
       [0.524761],
       [0.524761],
       ...,
       [0.524761],
       [0.524761],
       [0.524761],
       [0.524761]], dtype=float32)