In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.svm import LinearSVC #a classifier that works best for text data

'''
TfidfVectorizer scores terms by combining two quantities: TF (Term Frequency) and IDF (Inverse Document Frequency).

TF => how often a term occurs in a document.
IDF => in its basic form, the logarithm of the total number of documents divided by the number of documents that contain the term.

Multiplying TF by IDF gives each term a score, which highlights the most important and distinctive terms of an article.
'''

'\nTfidfVectorizer scores terms by combining two quantities: TF (Term Frequency) and IDF (Inverse Document Frequency).\n\nTF => how often a term occurs in a document.\nIDF => in its basic form, the logarithm of the total number of documents divided by the number of documents that contain the term.\n\nMultiplying TF by IDF gives each term a score, which highlights the most important and distinctive terms of an article.\n'

In [None]:
# Load the labelled news articles from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="news_dataset.csv")

In [None]:
# Inspect the loaded frame; the saved output shows `label` and `text` columns.
data

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...
...,...,...
3724,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...
3725,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...
3726,FAKE,The Bengaluru City Police’s official Twitter h...
3727,REAL,"Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\..."


In [None]:
# Encode the label as a binary target: 0 for "REAL", 1 for every other value.
# The vectorized comparison replaces a per-row Python lambda and yields the
# same values and integer dtype.

data['fake'] = data['label'].ne("REAL").astype("int64")

In [None]:
# Re-inspect the frame to confirm the new `fake` column sits next to `label`.
data

Unnamed: 0,label,text,fake
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...,0
1,FAKE,A four-minute-long video of a woman criticisin...,1
2,FAKE,"Republic Poll, a fake Twitter account imitatin...",1
3,REAL,"Delhi teen finds place on UN green list, turns...",0
4,REAL,Delhi: A high-level meeting underway at reside...,0
...,...,...,...
3724,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...,0
3725,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...,0
3726,FAKE,The Bengaluru City Police’s official Twitter h...,1
3727,REAL,"Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\...",0


In [None]:
# Features are the raw article text; the target is the binary `fake` flag.
x = data['text']
y = data['fake']

In [None]:
# Preview the feature Series (article text).
x

0       Payal has accused filmmaker Anurag Kashyap of ...
1       A four-minute-long video of a woman criticisin...
2       Republic Poll, a fake Twitter account imitatin...
3       Delhi teen finds place on UN green list, turns...
4       Delhi: A high-level meeting underway at reside...
                              ...                        
3724    19:17 (IST) Sep 20\n\nThe second round of coun...
3725    19:17 (IST) Sep 20\n\nThe second round of coun...
3726    The Bengaluru City Police’s official Twitter h...
3727    Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\...
3728    Read Also\n\nRead Also\n\nAdvocate Ishkaran Bh...
Name: text, Length: 3729, dtype: object

In [None]:
# Preview the target Series (0 = REAL, 1 = FAKE).
y

0       0
1       1
2       1
3       0
4       0
       ..
3724    0
3725    0
3726    1
3727    0
3728    0
Name: fake, Length: 3729, dtype: int64

In [None]:
# Hold out 20% of the articles for evaluation and train on the remaining 80%.
# random_state pins the shuffle so the split, and every score computed from it,
# is reproducible across kernel restarts; stratify=y keeps the REAL/FAKE ratio
# the same in both partitions.
# NOTE(review): rows 3724 and 3725 look identical in the preview; duplicated
# articles can land on both sides of the split and inflate accuracy. Confirm
# and de-duplicate before trusting the score.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
len(x_train) # number of training documents (the 80% partition)

2983

In [None]:
len(x_test) # number of held-out test documents (the 20% partition)

746

In [None]:
# Learn the TF-IDF vocabulary and IDF weights from the training articles only,
# then reuse the fitted vectorizer on the test articles so that no test-set
# statistics leak into training.
# stop_words="english" drops common English words; max_df=0.7 ignores terms
# that occur in more than 70% of the training documents.
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
# Any missing article text becomes an empty document. A bare string cast would
# turn it into the literal token "nan", which would then enter the vocabulary.
x_train_vectorized = vectorizer.fit_transform(x_train.fillna("").astype(str))
x_test_vectorized = vectorizer.transform(x_test.fillna("").astype(str))

In [None]:
# Display the fitted training TF-IDF matrix.
# NOTE(review): the saved output for this cell is a NameError, so it was
# presumably last run on a kernel where the vectorizing cell had not been
# executed. Restart the kernel and run all cells to refresh it.
x_train_vectorized

NameError: ignored

In [None]:
# Train a linear support-vector classifier on the TF-IDF features.
# random_state pins the solver's internal data shuffling so that repeated runs
# produce the same fitted model.
clf = LinearSVC(random_state=42)
clf.fit(x_train_vectorized, y_train)

In [None]:
# Accuracy on the held-out test set (about 99.4% in the recorded run).
clf.score(x_test_vectorized, y_test)

0.9946380697050938

In [None]:
# Count the correctly labelled test articles directly from the predictions.
# Multiplying len(y_test) by a hand-rounded accuracy (0.9946) under-reports the
# count: it gave 741.97 in the recorded run, while the unrounded accuracy
# 0.99463... over 746 articles corresponds to exactly 742 correct.
int((clf.predict(x_test_vectorized) == y_test).sum())

741.9716000000001

In [None]:
# Read the article to classify from disk as UTF-8 text.
with open("mynews.txt", mode="r", encoding="utf-8") as news_file:
    text = news_file.read()

In [None]:
# Wrap the single article in a list, because the fitted vectorizer expects an
# iterable of documents, and project it onto the training vocabulary.
vectorized_text = vectorizer.transform(raw_documents=[text])

In [None]:
# Classify the custom article; per the `fake` encoding, 1 means FAKE and 0 means REAL.
clf.predict(vectorized_text)

array([1])