In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.svm import LinearSVC #a classifier that works best for text data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Location of the raw news dataset; later cells read its 'text' and 'label' columns.
DATASET_PATH = "news_dataset.csv"
data = pd.read_csv(DATASET_PATH)

In [None]:
def tokenize(column):
    """Split one text string into word tokens, keeping only alphabetic ones.

    Despite its name, ``column`` is a single string (the function is applied
    element-wise to a text column).

    Returns:
        list[str]: tokens from ``nltk.word_tokenize`` for which
        ``str.isalpha()`` is true; punctuation, numbers and mixed tokens
        are discarded.
    """
    alphabetic_tokens = []
    for token in nltk.word_tokenize(column):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [None]:
# Remove rows without a label. The filter has to be applied to the frame:
# calling dropna(inplace=True) on the data['label'] selection only affects
# that selected Series and leaves every row of `data` in place.
data = data.dropna(subset=['label'])

# Force every text entry to str so it can be tokenized.
# NOTE(review): a missing text value becomes the literal string 'nan' here.
data['text'] = data['text'].astype(str)

# One list of alphabetic tokens per article.
data['tokenized'] = data['text'].apply(tokenize)

In [None]:
def punctuation_to_features(df, column):
    """Replace selected punctuation marks inside a text column with words.

    Every occurrence of ``!``, ``?``, ``'`` and ``"`` inside each string of
    ``df[column]`` is replaced by a space-padded word so the mark survives
    later tokenization as an ordinary token.

    ``Series.replace`` with a plain string only matches cells whose *entire*
    value equals that string, so it never touches punctuation embedded in a
    sentence. ``Series.str.replace`` with ``regex=False`` performs the intended
    literal substring replacement (and keeps ``?`` from being read as a regex
    metacharacter).

    Args:
        df: DataFrame holding the text column; it is modified in place.
        column: name of a column of strings in ``df``.

    Returns:
        The updated ``df[column]`` Series.
    """
    replacements = {
        '!': ' exclamation ',
        '?': ' question ',
        '\'': ' quotation ',
        '\"': ' quotation ',
    }

    updated = df[column]
    for mark, word in replacements.items():
        updated = updated.str.replace(mark, word, regex=False)

    df[column] = updated
    return df[column]

In [None]:
# NOTE(review): data['tokenized'] was already derived from data['text'] in an
# earlier cell, and data['text'] is later overwritten from data['porter_stemmed'],
# so as the notebook is ordered the result of this step does not reach any
# downstream column.
data['text'] = punctuation_to_features(df=data, column='text')

In [None]:
def remove_stopwords(tokenized_column):
    """Drop English stop words from a list of tokens.

    The NLTK stop-word list is lowercase, while the incoming tokens have not
    been lower-cased yet, so the membership test is done on ``word.lower()``.
    Otherwise capitalised stop words such as "The" or "A" slip through.
    Surviving tokens are returned with their original casing.

    Args:
        tokenized_column: iterable of string tokens for one document.

    Returns:
        list[str]: the tokens that are not English stop words.
    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if word.lower() not in stops]

In [None]:
# Filter stop words out of each document's token list (applied element-wise
# to the 'tokenized' column).
data['stopwords_removed'] = data['tokenized'].apply(remove_stopwords)

In [None]:
from nltk.stem.porter import PorterStemmer

def apply_stemming(tokenized_column):
    """Porter-stem each token of one document.

    Args:
        tokenized_column: iterable of string tokens.

    Returns:
        list[str]: the lower-cased Porter stem of every token, in order.
    """
    porter = PorterStemmer()
    stems = []
    for token in tokenized_column:
        stems.append(porter.stem(token).lower())
    return stems

In [None]:
# Stem the stop-word-filtered tokens of every document.
data['porter_stemmed'] = data['stopwords_removed'].apply(apply_stemming)

In [None]:
def rejoin_words(tokenized_column):
    """Join a sequence of string tokens into one space-separated string."""
    separator = " "
    return separator.join(tokenized_column)

In [None]:
# Replace the raw text with the cleaned, stemmed tokens joined back into a
# single string per document (the form the vectorizer consumes).
data['text'] = data['porter_stemmed'].apply(rejoin_words)

In [None]:
# Encode the label as a binary target: 0 when the label is exactly "REAL",
# 1 for every other value. Note that this catch-all also maps a missing
# label, or a differently-cased one, to 1.
data['fake'] = (data['label'] != "REAL").astype('int64')

In [None]:
# Model input: the preprocessed text. Target: the binary 'fake' flag.
x = data['text']
y = data['fake']

In [None]:
# Preview the preprocessed, stemmed text used as the model input.
x

0       payal accus filmmak anurag kashyap behav inapp...
1       a video woman criticis govern amend act ralli ...
2       republ poll fake twitter account imit arnab re...
3       delhi teen find place un green list turn glass...
4       delhi a meet underway resid rajya sabha chairm...
                              ...                        
3724    ist sep the second round countrywid serosurvey...
3725    ist sep the second round countrywid serosurvey...
3726    the bengaluru citi polic offici twitter handl ...
3727    sep ist sourc meet neelkantha bhanu prakash wo...
3728    read also read also advoc ishkaran bhandari re...
Name: text, Length: 3729, dtype: object

In [None]:
# Preview the binary target (0 = label "REAL", 1 = any other label).
y

0       0
1       1
2       1
3       0
4       0
       ..
3724    0
3725    0
3726    1
3727    0
3728    0
Name: fake, Length: 3729, dtype: int64

In [None]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# A fixed random_state makes the split, and therefore every accuracy reported
# further down, reproducible across kernel restarts. Outputs recorded from an
# earlier unseeded run may differ slightly from a re-run.
# NOTE(review): the preview of `x` shows rows 3724 and 3725 starting with the
# same text; if the dataset contains duplicate articles they can end up on
# both sides of the split and inflate test accuracy. TODO confirm and
# de-duplicate before splitting.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Cast both splits to unicode strings before vectorizing.
train_text = x_train.astype('U')
test_text = x_test.astype('U')

# TF-IDF features: English stop words are removed and terms that appear in
# more than 70% of the training documents are ignored. The vocabulary and IDF
# weights are learned from the training split only and then reused, unchanged,
# to transform the test split.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")
x_train_vectorized = vectorizer.fit_transform(train_text)
x_test_vectorized = vectorizer.transform(test_text)

In [None]:
# Linear support-vector classifier on the TF-IDF features.
# random_state is fixed so repeated runs give the same fitted model, matching
# the seeded DecisionTreeClassifier used later in this notebook.
clf = LinearSVC(random_state=0)
clf.fit(x_train_vectorized, y_train)

In [None]:
# Mean accuracy of the LinearSVC on the held-out test split
# (about 99.7% in the recorded run).
clf.score(X=x_test_vectorized, y=y_test)

0.9973190348525469

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes baseline trained on the same TF-IDF features.
# NOTE(review): alpha is left at its default; force_alpha presumably only
# matters for very small alpha values. Confirm against the scikit-learn docs.
clf = MultinomialNB(force_alpha=True)
clf.fit(X=x_train_vectorized, y=y_train)

In [None]:
# Mean accuracy of the naive Bayes model on the held-out test split
# (about 95.8% in the recorded run).
clf.score(X=x_test_vectorized, y=y_test)

0.9584450402144772

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision-tree baseline on the same TF-IDF features; random_state is fixed
# so the fitted tree is reproducible.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X=x_train_vectorized, y=y_train)

In [None]:
# Mean accuracy of the decision tree on the held-out test split
# (about 98.8% in the recorded run).
clf.score(X=x_test_vectorized, y=y_test)

0.9879356568364611