In [None]:
import pandas as pd
import numpy as np
import pika
import json
import redis
import re
import unicodedata
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import scale, normalize
from sklearn.preprocessing import Imputer
from sklearn import metrics
import matplotlib.pyplot as plt
import random
import tweep
from kafka import KafkaProducer

In [None]:
# Connection settings for the Redis instance that buffers incoming tweets.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# Placeholder value: replace with the name of the Redis list to push tweets to.
REDIS_LIST = '<your_relis_list_name>'
# Number of messages successfully pushed to Redis; updated by callback().
counter = 0

# Try to connect to the RabbitMQ server
try:
    connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()

    # Create a queue called <your_queue_name>
    channel.queue_declare(queue='<your_queue_name>')

except Exception as err:
    print (err)
    # Re-raise: without `connection`/`channel` the statements below would only
    # fail later with a NameError that hides the real cause.
    raise

# try to connect to the redis server
try:
    r = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=0)
    # Building the client does not contact the server; ping() forces a round
    # trip so an unreachable Redis is reported here and not on the first lpush.
    r.ping()
except Exception as err:
    print (err)
    raise


def callback(ch, method, properties, body):
    """Handle one RabbitMQ message: echo it and push it onto the Redis list.

    The signature is the one expected by ``channel.basic_consume``; only
    ``body`` (the raw message bytes) is used. Each successful push increments
    the module-level ``counter``; when it reaches 1000 the channel and the
    connection are closed.
    """
    global counter
    # Decode once and reuse the text for both the echo and the Redis push.
    message = body.decode()
    print(message)

    try:
        r.lpush(REDIS_LIST, message)
        counter += 1
    except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt through, so
        # CTRL+C still stops the consumer; the cause is printed with the notice.
        print ('Problem adding data to Redis.', err)

    # stop after 1000 tweets (optional - can be removed)
    if counter == 1000:
        channel.close()
        connection.close()


# Register `callback` for the queue. auto_ack=True means messages are
# acknowledged automatically instead of by an explicit ack in the callback.
channel.basic_consume(
    queue='<your_queue_name>',
    on_message_callback=callback,
    auto_ack=True,
)

print(' [*] Waiting for messages. To exit press CTRL+C')

# Enter the consume loop; callback() runs once per delivered message.
channel.start_consuming()

In [None]:
# Read the whole Redis list: start index 0 through end index -1 (last element).
result = r.lrange(name=REDIS_LIST, start=0, end=-1)
# Pattern for @-mentions: captures the characters after '@' up to the next
# whitespace or ':'.
p = re.compile(r'@([^\s:]+)')

In [None]:
# Decode the Redis entries to text and drop every entry that contains a
# backslash (e.g. a JSON escape sequence such as \uXXXX or \n).
# NOTE(review): this discards the whole entry, not just the escaped characters
# — confirm that is the intent.
newList = []
for entry in result:
    # Entries that are already `str` are passed through unchanged. This keeps
    # the cell re-runnable: `result` is overwritten with decoded strings below,
    # and calling .decode() on them again would raise AttributeError.
    text = entry.decode("utf-8") if isinstance(entry, bytes) else entry
    if "\\" not in text:
        newList.append(text)

result = newList

In [None]:
#convert redis entries to dataframe
# (DataFrame column, JSON key) pairs, in final column order. 'Tweets' has to
# stay first because the tweet-cleaning loop addresses it as column 0.
column_to_key = [
    ('Tweets', 'text'),
    ('User', 'user'),
    ('User_statuses_count', 'user_statuses_count'),
    ('user_followers', 'user_followers'),
    ('User_location', 'user_location'),
    ('User_verified', 'user_verified'),
    ('fav_count', 'fav_count'),
    ('rt_count', 'rt_count'),
    ('tweet_date', 'tweet_date'),
    ('user_friends', 'user_friends'),
    ('listed_count', 'listed_count'),
]

# Collect one record per entry and build the frame once; growing a DataFrame
# row by row with df.loc is quadratic in the number of tweets.
records = []
for x in result:
    d = json.loads(x)
    records.append({column: d.get(key) for column, key in column_to_key})

# Passing the column list explicitly guarantees that every column exists even
# when `result` is empty. dtype=object skips dtype inference so that fillna(0)
# below can place 0 in any column.
df = pd.DataFrame(records, columns=[column for column, _ in column_to_key], dtype=object)

# Assign the result back instead of using inplace=True on a column selection:
# that form is a chained in-place update, which pandas does not guarantee to
# propagate to `df`.
df['fav_count'] = df['fav_count'].replace('None', 0)
df['rt_count'] = df['rt_count'].replace('None', 0)
df = df.fillna(0)

In [None]:
#clean greek tweets - Greek Stemmer (https://github.com/DimitrisCC/GrPolitics_Twitter_SentAnalysis/blob/5c165306f3cb00d001942013fd252589614a13f9/preprocessing.py)

class GreekAnalyzer:
    """Pre-processing pipeline for Greek text.

    Wraps a ``Sentence`` and exposes ``clean``, which strips accents, links,
    hashtags/mentions, every non-Greek character and the given stop words, and
    finally removes common suffixes from each remaining term.
    """

    # Suffix tables used by Sentence.stem; the three-letter table is tried
    # first, then the two-letter one, then the one-letter one.
    one_suff = ('Α', 'Ο', 'Ε', 'Η', 'Ω', 'Υ', 'Ι')
    three_suff = ('ΟΥΣ', 'ΕΙΣ', 'ΕΩΝ', 'ΟΥΝ')
    two_suff = ('ΟΣ', 'ΗΣ', 'ΕΣ', 'ΩΝ', 'ΟΥ', 'ΟΙ', 'ΑΣ', 'ΩΣ', 'ΑΙ', 'ΥΣ', 'ΟΝ', 'ΑΝ', 'ΕΙ')

    class Sentence:
        """A string that is cleaned step by step as part of pre-processing.

        The text is upper-cased on construction. Every ``strip_*`` method and
        ``stem`` returns a new ``Sentence`` so the calls can be chained.
        """

        # A term that is a plain number (optional sign, decimals, exponent).
        _NUMERIC = re.compile("^[+-]?(\\d+(\\.\\d*)?|\\.\\d+)([eE][+-]?\\d+)?$")
        # Case-insensitive because the text held by a Sentence is upper-cased.
        _LINK = re.compile(r'https?://\S+', flags=re.IGNORECASE)

        def __init__(self, sentence):
            self.sentence = str(sentence).upper()

        def __repr__(self):
            return str(self.sentence)

        # Default argument values are evaluated at function define-time,
        # but self is an argument only available at function call time.
        # Thus arguments in the argument list cannot refer each other,
        # which is why every method takes sentence=None and falls back to
        # self.sentence inside the body.

        def strip_accents(self, sentence=None):
            """Return a Sentence without combining marks (accents, diaeresis)."""
            if sentence is None:
                sentence = self.sentence
            # NFD splits an accented letter into base letter + combining mark;
            # the marks (Unicode category 'Mn') are then dropped.
            return GreekAnalyzer.Sentence(''.join(c for c in unicodedata.normalize('NFD', sentence)
                                                  if unicodedata.category(c) != 'Mn'))

        def strip_specialcharacters_numbers(self, sentence=None):
            """Return a Sentence keeping only Greek letters and spaces."""
            if sentence is None:
                sentence = self.sentence
            return GreekAnalyzer.Sentence(re.sub(r'[^Α-Ωα-ω ]', '', sentence, flags=re.MULTILINE))

        def strip_links(self, sentence=None):
            """Return a Sentence with every http(s) URL removed.

            URLs are matched anywhere in the text and regardless of case. The
            previous pattern was lower-case and anchored to the start of a
            line, so it could not match the upper-cased text of a Sentence.
            """
            if sentence is None:
                sentence = self.sentence
            return GreekAnalyzer.Sentence(GreekAnalyzer.Sentence._LINK.sub('', sentence))

        def strip_tags(self, sentence=None):
            """Return a Sentence with #hashtags and @mentions removed."""
            if sentence is None:
                sentence = self.sentence
            return GreekAnalyzer.Sentence(re.sub(r'#\w*|@\w*', '', sentence, flags=re.MULTILINE))

        def stem(self, sentence=None):
            """Return a Sentence whose terms have their suffix removed.

            Numeric terms are dropped. Terms shorter than four characters are
            kept unchanged. At most one suffix is removed per term.
            """
            if sentence is None:
                sentence = self.sentence
            stems = []
            for term in sentence.split():
                # Skip numeric terms; returning here would discard the rest of
                # the sentence and hand back a str instead of a Sentence.
                if GreekAnalyzer.Sentence._NUMERIC.match(term):
                    continue
                # Remove first level suffixes only if the term is 4 letters or more
                if len(term) >= 4:
                    # Remove the 3 letter suffixes
                    if term.endswith(GreekAnalyzer.three_suff):
                        term = term[:-3]
                    # Remove the 2 letter suffixes
                    elif term.endswith(GreekAnalyzer.two_suff):
                        term = term[:-2]
                    # Remove the 1 letter suffixes
                    elif term.endswith(GreekAnalyzer.one_suff):
                        term = term[:-1]
                stems.append(term)
            return GreekAnalyzer.Sentence(' '.join(stems))

        def strip_stopwords(self, sentence=None, stop_words=None):
            """Return a Sentence with every whole-word stop word removed.

            Matching is case-sensitive. The text held by a Sentence is
            upper-cased, so stop words only match when given in upper case.
            ``stop_words=None`` leaves the text unchanged.
            """
            if sentence is None:
                sentence = self.sentence
            if stop_words is None:
                return GreekAnalyzer.Sentence(sentence)
            for w in stop_words:
                # re.escape: a stop word is literal text, not a regex fragment.
                sentence = re.sub(r'\b' + re.escape(w) + r'\b', '', sentence)
            return GreekAnalyzer.Sentence(sentence)

    def __init__(self, sentence):
        if isinstance(sentence, GreekAnalyzer.Sentence):
            self.sentence = sentence
        else:
            self.sentence = GreekAnalyzer.Sentence(sentence)

    def clean(self, sentence=None, stop_words=None):
        """Run the full cleaning pipeline and return the result as ``str``.

        sentence:   text or Sentence to clean; defaults to the one given to
                    the constructor.
        stop_words: optional iterable of words to remove before stemming.
        """
        if sentence is None:
            sentence = self.sentence
        if not isinstance(sentence, GreekAnalyzer.Sentence):
            # Wrap plain text. The previous code recursed with `stop_words`
            # passed positionally, which bound it to `sentence` and cleaned
            # the stop-word list instead of the text.
            sentence = GreekAnalyzer.Sentence(sentence)
        return str(sentence
                   .strip_accents()
                   .strip_links()
                   .strip_tags()
                   .strip_specialcharacters_numbers()
                   .strip_stopwords(stop_words=stop_words).stem()
                   )


#Loading stopwords
# One stop word per line; surrounding whitespace is stripped and blank lines
# are ignored. The `with` block closes the file even if reading fails.
with open('greekstopwords.txt', 'rt', encoding="utf8") as fstopwords:
    stopwords = [w.strip() for w in fstopwords if w.strip() != '']

# Discard the first non-blank line (presumably a header or a BOM-prefixed
# entry — TODO confirm). Slicing does not raise when the file is empty.
stopwords = stopwords[1:]


def clean_tweets(tweets: dict):
    """Add a 'clean_text' entry holding the cleaned form of each tweets['text'] item.

    Every text is run through GreekAnalyzer.clean with the module-level
    `stopwords`. `tweets` is modified in place and also returned.
    """
    tweets['clean_text'] = [
        GreekAnalyzer(raw_text).clean(stop_words=stopwords)
        for raw_text in tweets['text']
    ]
    return tweets


def format_time(time):
    """Return str(time), prefixed with "0" when it is exactly one character long."""
    text = str(time)
    return "0" + text if len(text) == 1 else text
    
# Replace every value in column 0 of df with its cleaned form.
for row_pos in range(len(df)):
    raw_text = df.iloc[row_pos, 0]
    df.iloc[row_pos, 0] = GreekAnalyzer(raw_text).clean(stop_words=stopwords)

In [None]:
# Step 2 Fake account detection

train_data = pd.read_csv('kaggle_train.csv')
# Alias, not a copy: columns later added to test_data also appear on df.
test_data = df

# Feature columns as named in the training CSV, paired position by position
# with the tweet-frame columns that are used in their place.
# NOTE(review): 'fav_count' stands in for 'favourites_count' — confirm the two
# measure the same quantity.
feature_names = ['followers_count', 'friends_count', 'listedcount',
                 'favourites_count', 'statuses_count', 'verified']
tweet_columns = ['user_followers', 'user_friends', 'listed_count',
                 'fav_count', 'User_statuses_count', 'User_verified']

train_label = train_data[['bot']]

# Select the features, then pass them through sklearn's normalize().
train_attr = normalize(train_data[feature_names])
test_attr = normalize(
    test_data[tweet_columns].rename(columns=dict(zip(tweet_columns, feature_names)))
)

In [None]:
# Fit Gaussian Naive Bayes on the training features and labels.
# DataFrame.as_matrix() no longer exists in current pandas; .values gives the
# same array on old and new versions. ravel() turns the single-column label
# array into the 1-D shape that fit() expects.
nb = GaussianNB().fit(train_attr, train_label.values.ravel())
predicted = nb.predict(test_attr)
pred = np.array(predicted)

# test_data is the tweet frame; store the prediction next to each tweet.
test_data["pred"] = pred

# NOTE(review): the training label column is named 'bot'. If 1 means "is a
# bot", the two captions below are swapped — confirm against the dataset.
count1 = (test_data["pred"] == 1).sum()
print("Real Accounts : ",count1) 
count0 = (test_data["pred"] == 0).sum()
print("Fake accounts : ",count0) 

In [None]:
# Keep only the rows predicted as class 1, which this notebook treats as real users.
# NOTE(review): the classifier is trained on a column named 'bot'; if 1 means
# "is a bot" this keeps the wrong rows — confirm against the dataset.
is_kept = test_data["pred"] == 1
test_data = test_data.loc[is_kept]

In [None]:
# Producer for the local Kafka broker; every value is JSON-encoded and sent as
# UTF-8 bytes.
producer = KafkaProducer(bootstrap_servers='localhost:9092', value_serializer=lambda v: json.dumps(v).encode('utf-8'))

tweets = test_data["Tweets"]

#send tweets to Spark
try:
    for i in tweets:
        print(i)
        producer.send("<your_kafka_topic_name>", i)
finally:
    # Flush once after the loop instead of after every message: a per-message
    # flush blocks on each send and prevents the producer from batching. The
    # `finally` still pushes out whatever was queued if a send raises.
    producer.flush()

print("-------------All tweets are headed to spark streaming------------------")