## Imports

In [None]:
!pip install sentence_transformers
!pip install --upgrade --no-cache-dir gdown

In [None]:
import torch 
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers.readers import InputExample
from sklearn.metrics import classification_report as clfr
from sentence_transformers import SentenceTransformer as STF

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Read preprocessed data

In [None]:
!gdown 1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
!gdown 1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX

Downloading...
From: https://drive.google.com/uc?id=1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
To: /content/WELFake_Dataset.csv
100% 245M/245M [00:01<00:00, 220MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pFFcunSiNS6PCGd9c_MvvPukJqZp1lHs
To: /content/fake_or_real_news.csv
100% 30.7M/30.7M [00:00<00:00, 115MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX
To: /content/data.csv
100% 12.6M/12.6M [00:00<00:00, 129MB/s]


In [None]:
data2 = pd.read_csv("/content/data.csv")
data = pd.read_csv("/content/WELFake_Dataset.csv")

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
data2.head()

In [None]:
data = data.dropna()
data2 = data2.dropna()

In [None]:
data = data[:14000]
data2 = data2[:2000]

### Train Test split (80:20)

In [None]:
x_train = data['text']
y_train = data['label']

In [None]:
x_test = data2['Body']
y_test = data2['Label']

#### train class and test class distribution 

In [None]:
label_train= np.unique(y_train,return_counts=True)
label_test= np.unique(y_test,return_counts=True)
print(f"Train distribution {label_train}, Test Distribuition {label_test}")

Train distribution (array([0, 1]), array([6700, 7300])), Test Distribuition (array([0, 1]), array([1056,  944]))


## Generate sentence embeddings

In [None]:
embedding_model = STF('bert-base-uncased')

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
x_train.isna().values.any()

False

In [None]:
x_train_vec = embedding_model.encode(x_train.to_numpy(), convert_to_tensor = True)

In [None]:
x_test_vec = embedding_model.encode(x_test.to_numpy(), convert_to_tensor = True)

In [None]:
x_train_vec.shape

torch.Size([14000, 768])

##ML models


In [None]:
x_train_vec = x_train_vec.cpu()
x_test_vec = x_test_vec.cpu()

### logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression as LGR

In [None]:
lr_clf = LGR(random_state=0).fit(x_train_vec,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
pred_values_lr_train = lr_clf.predict(x_train_vec)
pred_values_lr_test = lr_clf.predict(x_test_vec)

In [None]:
print(f" Logistic Regression performance on train data -> \n{clfr(y_train,pred_values_lr_train)}")
print("\n")
print(f" Logistic Regression performance on test data -> \n{clfr(y_test,pred_values_lr_test)}")

 Logistic Regression performance on train data -> 
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      6700
           1       0.96      0.95      0.95      7300

    accuracy                           0.95     14000
   macro avg       0.95      0.95      0.95     14000
weighted avg       0.95      0.95      0.95     14000



 Logistic Regression performance on test data -> 
              precision    recall  f1-score   support

           0       0.19      0.15      0.17      1056
           1       0.24      0.29      0.26       944

    accuracy                           0.22      2000
   macro avg       0.21      0.22      0.22      2000
weighted avg       0.21      0.22      0.21      2000



### K-Nearest Neighbors

In [None]:
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train_vec,y_train)

In [None]:
pred_values_knn_train = knn_clf.predict(x_train_vec)
pred_values_knn_test = knn_clf.predict(x_test_vec)


In [None]:
print(f" KNN performance on train data -> \n{clfr(y_train,pred_values_knn_train)}")
print("\n")
print(f" KNN Regression performance on test data -> \n{clfr(y_test,pred_values_knn_test)}")

 KNN performance on train data -> 
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      6700
           1       0.96      0.94      0.95      7300

    accuracy                           0.95     14000
   macro avg       0.95      0.95      0.95     14000
weighted avg       0.95      0.95      0.95     14000



 KNN Regression performance on test data -> 
              precision    recall  f1-score   support

           0       0.34      0.33      0.34      1056
           1       0.27      0.28      0.27       944

    accuracy                           0.31      2000
   macro avg       0.31      0.30      0.31      2000
weighted avg       0.31      0.31      0.31      2000



### Decision Tree

In [None]:
dec_clf = DecisionTreeClassifier(random_state=0)
dec_clf.fit(x_train_vec,y_train)

In [None]:
pred_values_dec_train = dec_clf.predict(x_train_vec)
pred_values_dec_test = dec_clf.predict(x_test_vec)

In [None]:
print(f" Decision tree performance on train data -> \n{clfr(y_train,pred_values_dec_train)}")
print("\n")
print(f" Decision performance on test data -> \n{clfr(y_test,pred_values_dec_test)}")

 Decision tree performance on train data -> 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6700
           1       1.00      1.00      1.00      7300

    accuracy                           1.00     14000
   macro avg       1.00      1.00      1.00     14000
weighted avg       1.00      1.00      1.00     14000



 Decision performance on test data -> 
              precision    recall  f1-score   support

           0       0.25      0.17      0.20      1056
           1       0.30      0.40      0.34       944

    accuracy                           0.28      2000
   macro avg       0.27      0.29      0.27      2000
weighted avg       0.27      0.28      0.27      2000



### SVM

In [None]:
svm_clf = SVC(C=0.8)
svm_clf.fit(x_train_vec, y_train)

In [None]:
pred_values_svm_train = svm_clf.predict(x_train_vec)
pred_values_svm_test = svm_clf.predict(x_test_vec)

In [None]:
print(f"SVM performance on train data -> \n{clfr(y_train,pred_values_svm_train)}")
print("\n")
print(f"SVM performance on test data -> \n{clfr(y_test,pred_values_svm_test)}")

SVM performance on train data -> 
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      6700
           1       0.95      0.94      0.94      7300

    accuracy                           0.94     14000
   macro avg       0.94      0.94      0.94     14000
weighted avg       0.94      0.94      0.94     14000



SVM performance on test data -> 
              precision    recall  f1-score   support

           0       0.15      0.11      0.13      1056
           1       0.23      0.30      0.26       944

    accuracy                           0.20      2000
   macro avg       0.19      0.20      0.19      2000
weighted avg       0.19      0.20      0.19      2000

