In [1]:
# Core dependencies: pandas for loading the dataset, re for text clean-up, nltk for NLP helpers.
import pandas as pd
import re
import nltk

# Download the 'punkt' tokenizer package, then import word_tokenize under the short alias `wt`.
nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt 

# Download the 'stopwords' corpus, then import its reader.
nltk.download('stopwords')
from nltk.corpus import stopwords

# One shared Porter stemmer instance, reused by the preprocessing loop.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Load the training set into a DataFrame; lines=True reads the file as one JSON record per line.
# NOTE(review): '/content/...' is an absolute path — presumably a Colab upload location; adjust when running elsewhere.
dataset = pd.read_json('/content/train.jsonl', lines=True)

In [0]:
# Build the stop-word set ONCE. The membership test below runs for every token of
# every claim, so rebuilding the set inside the loop repeats the same work each time.
english_stopwords = set(stopwords.words('english'))

# `data` collects one cleaned, space-joined string per row of `dataset`, in row order.
data = []

# The first column of `dataset` is used as the claim text.
for claim in dataset.iloc[:, 0]:
    # replace every non-alphanumeric character with a space
    claim = re.sub('[^A-Za-z0-9]', ' ', claim)

    # lowercase so that e.g. "Go" and "go" are counted as the same word
    claim = claim.lower()

    # split the cleaned text into tokens
    tokenized_claim = wt(claim)

    # drop stop words, then stem whatever remains
    claim_processed = [
        stemmer.stem(word)
        for word in tokenized_claim
        if word not in english_stopwords
    ]

    data.append(" ".join(claim_processed))


In [4]:
# Imports used by this cell (feature extraction, splitting, model, metrics).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Fixed seed for the train/test split so the metrics below are reproducible
# after a kernel restart; without it every run evaluates on a different split.
RANDOM_STATE = 42

# Create the feature matrix from the 1000 most frequently occurring words.
# Claims are the rows and the selected features are the columns; the sparse
# result is converted to a dense array before being handed to the classifier.
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(data).toarray()
# Labels are taken from the fourth column of the dataset.
y = dataset.iloc[:, 3]
print('x shape',X.shape)
print('y shape',y.shape)

# Split into train and test sets (default split proportions, seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# Train the Naive Bayes model.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict labels for the held-out set.
y_pred = classifier.predict(X_test)

# Confusion matrix and per-class precision/recall/F1.
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print('Confusion Matrix: ',cm)
print('Classification Report: ',cr)

# Overall accuracy on the held-out set.
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy Score: ', accuracy)

x shape (145449, 1000)
y shape (145449,)
Confusion Matrix:  [[3084 2663 3205]
 [1512 3717 2254]
 [4815 5706 9407]]
Classification Report:                   precision    recall  f1-score   support

NOT ENOUGH INFO       0.33      0.34      0.34      8952
        REFUTES       0.31      0.50      0.38      7483
       SUPPORTS       0.63      0.47      0.54     19928

      micro avg       0.45      0.45      0.45     36363
      macro avg       0.42      0.44      0.42     36363
   weighted avg       0.49      0.45      0.46     36363

Accuracy Score:  0.44572780023650416
