In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
fake_news=pd.read_csv('Fake.csv')

In [None]:
real_news=pd.read_csv('True.csv')

In [None]:
fake_news.head()

In [None]:
fake_news.columns


In [None]:
real_news.head()

In [None]:
fake_news['class']='Fake'
real_news['class']='Real'

In [None]:
fake_news.head()

In [None]:
real_news.head()

In [None]:
fake_news['subject'].value_counts()

In [None]:
real_news['subject'].value_counts()

In [None]:
fake_news['date'].value_counts()

In [None]:
# we are going to deal with the 'title' and 'text' columns in this project,
# so we are going to drop the columns 'subject' and 'date'
fake_news.drop(['subject','date'],axis=1,inplace=True)
real_news.drop(['subject','date'],axis=1,inplace=True)


In [None]:
# Stack the two labelled frames into one and shuffle the rows so the classes
# are interleaved. A fixed random_state makes the shuffle repeat exactly on
# every run, so the outputs below are reproducible.
news = pd.concat([fake_news, real_news], ignore_index=True, sort=False)
news = news.sample(frac=1, random_state=42).reset_index(drop=True)
news

In [None]:
news['text']=news['title']+news['text']
news.drop('title',axis=1,inplace=True)
news.head()

In [None]:
news.describe()

In [None]:
news.groupby('class').describe().transpose()

In [None]:
news['length']=news['text'].apply(len)
news.head()

In [None]:
#EDA

news['length'].plot(bins=100, kind='hist',figsize=(14,7),)

In [None]:
news.length.describe()

In [None]:
news[news['length'] == 51892]['text'].iloc[0]

In [None]:
news[news['length'] == 51892]

In [None]:
news.hist(column='length', by='class', bins=70,figsize=(12,7))

In [None]:
#Text Pre-Processing
import string

In [None]:
import nltk

In [None]:
# Fetch only the stop-word corpus used below. The interactive
# nltk.download_shell() prompt waits for keyboard input, which stalls
# "Restart Kernel -> Run All".
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
# Peek at the first ten English stop words.
stopwords.words('english')[:10]

In [None]:
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache of the NLTK English stop words (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(mess, stop_words=None):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text

    Parameters
    ----------
    mess : str
        Raw text to tokenize.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to the NLTK English stop-word
        list, which is loaded once and reused on later calls.

    Returns
    -------
    list of str
        Whitespace-separated tokens with punctuation stripped, original
        casing preserved, and stop words (compared case-insensitively)
        removed.
    """
    if stop_words is None:
        # Loading the corpus per token (as stopwords.words() inside the list
        # comprehension would) re-reads the word list and scans it linearly
        # for every word; a cached set makes each lookup constant-time.
        stop_words = _english_stopwords()

    # Strip punctuation characters in a single pass.
    nopunc = mess.translate(_PUNCT_TABLE)

    # Keep only the tokens that are not stop words.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [None]:
news.head()

In [None]:
news['text'].head(5).apply(text_process)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the documents for evaluation. random_state pins the split so
# every model below is trained and scored on the same rows from run to run;
# stratify keeps the Fake/Real proportion the same in both partitions.
nws_train, nws_test, class_train, class_test = train_test_split(
    news['text'],
    news['class'],
    test_size=0.3,
    random_state=42,
    stratify=news['class'],
)

print(len(nws_train), len(nws_test), len(nws_train) + len(nws_test))
    
    

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [None]:
pipeline.fit(nws_train,class_train)

In [None]:
predictionsNB = pipeline.predict(nws_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(classification_report(predictionsNB,class_test))

In [None]:
print(confusion_matrix(predictionsNB,class_test))

In [None]:
from sklearn.svm import SVC

In [None]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', SVC()),  # train on TF-IDF vectors w/ SVM classifier
])

In [None]:
pipeline.fit(nws_train,class_train)

In [None]:
predictionsSVM = pipeline.predict(nws_test)

In [None]:
print(classification_report(predictionsSVM,class_test))

In [None]:
print(confusion_matrix(predictionsSVM,class_test))

In [None]:

# Candidate hyper-parameters for the RBF-kernel SVM grid search:
# five values of C crossed with five values of gamma.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', GridSearchCV(SVC(),param_grid,refit=True,verbose=3)),  # train on TF-IDF vectors w/ GridS classifier
])

In [None]:
pipeline.fit(nws_train,class_train)

In [None]:
grid_predictions = pipeline.predict(nws_test)

In [None]:
print(confusion_matrix(class_test,grid_predictions))

In [None]:
print(classification_report(class_test,grid_predictions))