## Imports

In [None]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.4-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.4


In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as clfr

In [None]:
!gdown 1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
!gdown 1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX

Downloading...
From: https://drive.google.com/uc?id=1Nk7eeRzyIAzqkdviopWIy72XGVlu7MTS
To: /content/WELFake_Dataset.csv
100% 245M/245M [00:02<00:00, 100MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pFFcunSiNS6PCGd9c_MvvPukJqZp1lHs
To: /content/fake_or_real_news.csv
100% 30.7M/30.7M [00:00<00:00, 90.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_3e8jv8uG4zRkHEimaH1VwHxEc_5k2JX
To: /content/data.csv
100% 12.6M/12.6M [00:00<00:00, 29.7MB/s]


### Read preprocessed data

In [None]:
data2 = pd.read_csv("/content/data.csv")
data = pd.read_csv("/content/WELFake_Dataset.csv")

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
data2.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [None]:
data = data.dropna()
data2 = data2.dropna()

data = data[:14000]
data2 = data2[:2000]

In [None]:
x_train = data['text']
y_train = data['label']

x_test = data2['Body']
y_test = data2['Label']

### Train Test split (80:20)

In [None]:
label_train= np.unique(y_train,return_counts=True)
label_test= np.unique(y_test,return_counts=True)
print(f"Train distribution {label_train}, Test Distribuition {label_test}")

Train distribution (array([0, 1]), array([6700, 7300])), Test Distribuition (array([0, 1]), array([1056,  944]))


## Setting up tfidf vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_obj = TfidfVectorizer().fit(x_train)

x_train_vec = tfidf_obj.transform(x_train)
x_test_vec = tfidf_obj.transform(x_test)

In [None]:
svd = TruncatedSVD(n_components=20)
svd.fit(x_train_vec)

In [None]:
x_train_vec = svd.transform(x_train_vec)
x_test_vec = svd.transform(x_test_vec)

##ML models


### logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression as LGR

In [None]:
lr_clf = LGR(random_state=0).fit(x_train_vec,y_train)

In [None]:
pred_values_lr_train = lr_clf.predict(x_train_vec)
pred_values_lr_test = lr_clf.predict(x_test_vec)

In [None]:

print(f" Logistic Regression performance on train data -> \n{clfr(y_train,pred_values_lr_train)}")

print("\n\n")

print(f" Logistic Regression performance on test data -> \n{clfr(y_test,pred_values_lr_test)}")

 Logistic Regression performance on train data -> 
              precision    recall  f1-score   support

           0       0.87      0.82      0.85      6700
           1       0.85      0.89      0.87      7300

    accuracy                           0.86     14000
   macro avg       0.86      0.85      0.86     14000
weighted avg       0.86      0.86      0.86     14000




 Logistic Regression performance on test data -> 
              precision    recall  f1-score   support

           0       0.10      0.06      0.07      1056
           1       0.28      0.41      0.33       944

    accuracy                           0.22      2000
   macro avg       0.19      0.23      0.20      2000
weighted avg       0.18      0.22      0.19      2000



### K-Nearest Neighbors

In [None]:
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train_vec,y_train)

In [None]:
pred_values_knn_train = knn_clf.predict(x_train_vec)
pred_values_knn_test = knn_clf.predict(x_test_vec)

In [None]:
print(f" KNN performance on train data -> \n{clfr(y_train,pred_values_knn_train)}")
print("\n\n")
print(f" KNN Regression performance on test data -> \n{clfr(y_test,pred_values_knn_test)}")

 KNN performance on train data -> 
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      6700
           1       0.90      0.93      0.92      7300

    accuracy                           0.91     14000
   macro avg       0.91      0.91      0.91     14000
weighted avg       0.91      0.91      0.91     14000




 KNN Regression performance on test data -> 
              precision    recall  f1-score   support

           0       0.23      0.18      0.20      1056
           1       0.27      0.34      0.30       944

    accuracy                           0.25      2000
   macro avg       0.25      0.26      0.25      2000
weighted avg       0.25      0.25      0.25      2000



### Decision Tree

In [None]:
dec_clf = DecisionTreeClassifier(random_state=0)
dec_clf.fit(x_train_vec,y_train)

In [None]:
pred_values_dec_train = dec_clf.predict(x_train_vec)
pred_values_dec_test = dec_clf.predict(x_test_vec)

In [None]:
print(f" Decision tree performance on train data -> \n{clfr(y_train,pred_values_dec_train)}")
print("\n\n")
print(f" Decision performance on test data -> \n{clfr(y_test,pred_values_dec_test)}")

 Decision tree performance on train data -> 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6700
           1       1.00      1.00      1.00      7300

    accuracy                           1.00     14000
   macro avg       1.00      1.00      1.00     14000
weighted avg       1.00      1.00      1.00     14000




 Decision performance on test data -> 
              precision    recall  f1-score   support

           0       0.23      0.16      0.19      1056
           1       0.29      0.38      0.33       944

    accuracy                           0.27      2000
   macro avg       0.26      0.27      0.26      2000
weighted avg       0.26      0.27      0.25      2000



### SVM

In [None]:
svm_clf = SVC(C=0.8)
svm_clf.fit(x_train_vec, y_train)

In [None]:
pred_values_svm_train = svm_clf.predict(x_train_vec)
pred_values_svm_test = svm_clf.predict(x_test_vec)

In [None]:
print(f"SVM performance on train data -> \n{clfr(y_train,pred_values_svm_train)}")
print("\n\n")
print(f"SVM performance on test data -> \n{clfr(y_test,pred_values_svm_test)}")

SVM performance on train data -> 
              precision    recall  f1-score   support

           0       0.91      0.85      0.88      6700
           1       0.87      0.92      0.89      7300

    accuracy                           0.89     14000
   macro avg       0.89      0.88      0.89     14000
weighted avg       0.89      0.89      0.89     14000




SVM performance on test data -> 
              precision    recall  f1-score   support

           0       0.09      0.05      0.07      1056
           1       0.26      0.37      0.31       944

    accuracy                           0.21      2000
   macro avg       0.18      0.21      0.19      2000
weighted avg       0.17      0.21      0.18      2000

