### Imports

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import pickle 
import numpy as np
import time

#### Reading the Data 

In [2]:
# Load the fake/real news dataset; only the first 100 rows are read.
news_csv_url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_3629/fake_or_real_news.csv'
fake_data = pd.read_csv(news_csv_url, nrows=100, low_memory=True)

In [3]:
# Preview the first five rows of the loaded frame.
fake_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Target variable: the FAKE / REAL label column.
response = fake_data["label"]

In [5]:
# Bag-of-words encoder: 1- to 4-grams with English stop words removed.
count_vectorizer = CountVectorizer(
    ngram_range=(1, 4),
    stop_words="english",
)

#### Train test split

In [6]:
# Hold out 33% of the articles for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    fake_data["text"],
    response,
    random_state=53,
    test_size=0.33,
)

In [7]:
# Peek at the training document whose index label is 1 (label-based lookup,
# not the second row of the split).
X_train.loc[1]

'Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr \nThere are two fundamental truths in this world: Paul Ryan desperately wants to be president. And Paul Ryan will never be president. Today proved it. \nIn a particularly staggering example of political cowardice, Paul Ryan re-re-re-reversed course and announced that he was back on the Trump Train after all. This was an aboutface from where he was a few weeks ago. He had previously declared he would not be supporting or defending Trump after a tape was made public in which Trump bragged about assaulting women. Suddenly, Ryan was appearing at a pro-Trump rally and boldly declaring that he already sent in his vote to make him President of the United States. It was a surreal moment. The figurehead of the Republican Party dosed himself in gasoline, got up on a stage on a chilly afternoon in Wisconsin, and lit a match. . @SpeakerRyan says he voted for @realDonaldTrump : “Republicans, it is time to come home” ht

In [8]:
# Learn the n-gram vocabulary from the training texts and encode them as a
# sparse document-term count matrix in one step.
count_train = count_vectorizer.fit_transform(X_train)

In [9]:
# Show the shape / sparsity summary of the training matrix.
count_train

<67x83297 sparse matrix of type '<class 'numpy.int64'>'
	with 94064 stored elements in Compressed Sparse Row format>

In [None]:
# Encode the test texts with the already-fitted vectorizer (transform only,
# so the vocabulary comes from the training data).
count_test = count_vectorizer.transform(X_test)

In [None]:
# Show the summary of the encoded test matrix.
count_test

#### Naive Bayes Training

In [None]:
# Multinomial Naive Bayes classifier, constructed with no explicit
# hyperparameters (library defaults).
nb_classifier = MultinomialNB()

In [None]:
# Train the classifier on the training count matrix and its labels.
nb_classifier.fit(count_train, Y_train)

In [None]:
# Predict a label for every held-out test document.
pred = nb_classifier.predict(count_test)

In [None]:
# Fraction of test documents whose predicted label matches the true label.
score = metrics.accuracy_score(y_true=Y_test, y_pred=pred)

In [None]:
# Display the test accuracy.
score

In [None]:
# Confusion matrix with rows/columns ordered FAKE, then REAL.
cm = metrics.confusion_matrix(
    y_true=Y_test,
    y_pred=pred,
    labels=["FAKE", "REAL"],
)

In [None]:
# Display the confusion matrix.
cm

In [None]:
# Persist the trained classifier. The context manager closes the file handle
# deterministically instead of leaving an open handle behind.
with open('naiiveBayes.pkl', 'wb') as model_file:
    pickle.dump(nb_classifier, model_file)

# Persist the *fitted* vectorizer next to the model: predictions on new text
# are only valid when it is encoded with the same vocabulary the classifier
# was trained on, so the two artifacts must be saved (and loaded) as a pair.
with open('count_vector.pkl', 'wb') as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

In [None]:
# A fresh, unfitted vectorizer configured with 1- to 4-grams and English
# stop-word removal.
vect = CountVectorizer(
    ngram_range=(1, 4),
    stop_words="english",
)

#### Streaming

In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc
import pickle 
import time
from sklearn.naive_bayes import MultinomialNB

#### Load the model

In [2]:
# Load the trained classifier and its vectorizer from disk.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# a trusted training run. The context managers make sure both file handles
# are closed once loading finishes.
with open('../models/naiiveBayes.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('../models/count_vector.pkl', 'rb') as vectorizer_file:
    count_vectorizer = pickle.load(vectorizer_file)

#### Set spark configurations

In [3]:
# Spark settings for this session: application name and web UI port.
conf = SparkConf()
conf.setAll([
    ("spark.app.name", "MyApp"),
    ("spark.ui.port", "36000"),
])

<pyspark.conf.SparkConf at 0x10fddd5c0>

#### Create spark context

In [4]:
# Local Spark context with two worker threads.
# `conf` must be passed by keyword: the third positional parameter of
# SparkContext is `sparkHome`, so passing it positionally would hand the
# SparkConf object to the wrong parameter and silently drop its settings.
sc = SparkContext(master="local[2]", appName="Simple App", conf=conf)

In [5]:
# Streaming context on top of `sc` with a batch duration of 10.
ssc = StreamingContext(sparkContext=sc, batchDuration=10)

In [6]:
# Text stream read from a TCP socket on the local machine.
tweets = ssc.socketTextStream(hostname="127.0.0.1", port=5555)

#### Do prediction on stream of tweets 

In [7]:
# Pair every incoming tweet with the model's prediction for it.
# The tweet is wrapped in a one-element list because the vectorizer expects
# an iterable of documents; this avoids depending on pandas, which the
# streaming section does not import.
transformed = tweets.map(
    lambda tweet: (tweet, model.predict(count_vectorizer.transform([tweet])))
)

In [8]:
# Print a sample of each batch: first the (tweet, prediction) pairs, then the
# raw tweets as received.
transformed.pprint()
tweets.pprint()

#### Start of the stream

In [9]:
# Start consuming the stream and block until it terminates.
ssc.start()
try:
    ssc.awaitTermination()
finally:
    # Interrupting the kernel (KeyboardInterrupt) only unblocks this cell; the
    # streaming job would otherwise keep running in the background. Stop the
    # streaming context but keep the SparkContext alive so a new
    # StreamingContext can be created without restarting the kernel.
    ssc.stop(stopSparkContext=False, stopGraceFully=False)

KeyboardInterrupt: 