# Proyecto Final - Código: Deteccion_de_Bots_en_Twitter_mediante_grafos_y_Machine_Learning

## 1. API Twitter - Acceso a usuarios, tweets

Claves recogidas de la API de Twitter: https://developer.twitter.com/en/docs

In [1]:
import tweepy
import json

# Twitter API credentials. They are intentionally blank here and must be
# filled in before any request is made.
consumer_key = consumer_secret = ""
access_token = access_token_secret = ""

# OAuth handler: application credentials first, then the user access token.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# API client used by the following cells.
api = tweepy.API(auth)
# Alternative client that waits when the rate limit is hit:
# api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

Acceso a los tweets del timeline de inicio (home timeline) de la cuenta autenticada:

In [None]:
# Fetch one page of the home timeline and print the text of every status.
public_tweets = api.home_timeline()
for status in public_tweets:
    print(status.text)

Acceso a la información en formato json de un usuario, por ejemplo @nike:

In [None]:
# Fetch a single user profile and pretty-print its raw JSON payload.
# api.me() would return the authenticated account instead:
# data = api.me()
# print(json.dumps(data._json, indent=2))

# The handle is passed explicitly as screen_name (as the other cells do) so
# it can never be interpreted as a numeric user id.
data = api.get_user(screen_name="nike")
print(json.dumps(data._json, indent=2))

Obtener followers de un usuario:

In [None]:
# Single-page variant, kept for reference:
# data = api.followers(screen_name="nike")
# for user in data:
#     print(json.dumps(user._json, indent=2))

# The Twitter API only hands out paginated results (groups of 20).
# tweepy.Cursor walks the pages; items(50) caps how many followers are read.
follower_cursor = tweepy.Cursor(api.followers, screen_name="nike")
for follower in follower_cursor.items(50):
    print(json.dumps(follower._json, indent=2))

Obtener followees o friends (usuarios a los que sigue nuestra cuenta objetivo) con Cursor:

In [None]:
# Print up to 50 of the accounts followed by the target user, as JSON.
friend_cursor = tweepy.Cursor(api.friends, screen_name="nike")
for friend in friend_cursor.items(50):
    print(json.dumps(friend._json, indent=2))

Obtener el timeline de tweets de un usuario; se especifica cuántos con Cursor:

In [None]:
# Print the first 2 statuses of the user's timeline (extended tweet mode), as JSON.
timeline_cursor = tweepy.Cursor(api.user_timeline, screen_name="nike", tweet_mode="extended")
for status in timeline_cursor.items(2):
    print(json.dumps(status._json, indent=2))

Buscar tweets que contengan una cadena (q="algo"):

In [None]:
# Search for statuses matching the query and print the full text of the first 10.
search_cursor = tweepy.Cursor(api.search, q="algo", tweet_mode="extended")
for status in search_cursor.items(10):
    # print(json.dumps(status._json, indent=2))  # whole payload, for debugging
    payload = status._json
    print(payload["full_text"])

## 2. Funciones Auxiliares

### 2.1 Funciones para obtener ficheros json:

Se dispone de varias bases de datos, como dev.json, que dispone de 2365 entradas. Se quieren obtener los screen_name en un txt: "screen_names.txt"

In [None]:
import json

# Dump the screen_name of every record in the dataset to screen_names.txt,
# one handle per line.
# JSON is read as UTF-8 explicitly instead of relying on the platform default.
with open("Twitbot20/dev.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# `with` closes the output file even if a record lacks the expected keys.
with open("screen_names.txt", "w") as f:
    f.writelines(user['profile']['screen_name'] + "\n" for user in data)

Se recogen en un json (SNlabels.json) los screen_name y label (1 si es bot, 0 si es humano):

In [None]:
import json

# Build SNlabels.json: one {'screen_name', 'label'} entry per dataset record.
# The accumulator is deliberately NOT called `list`: rebinding that name at
# notebook level hides the builtin, so any later cell calling list(...) would
# fail with "TypeError: 'list' object is not callable".
with open("Twitbot20/dev.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

labeled_users = [
    {'screen_name': user['profile']['screen_name'], 'label': user['label']}
    for user in data
]

# `with` guarantees the output file is flushed and closed.
with open("SNlabels.json", "w") as f:
    json.dump(labeled_users, f)

In [56]:
# Sanity check: print how many entries SNlabels.json holds.
with open("SNlabels.json", "r") as json_file:
    data = json.load(json_file)
print(len(data))

2365


### 2.2 Saneamiento de los ficheros

Se obtiene un json con las cuentas que NO han sido eliminadas: "SNlabelsSANEADO.json"

In [None]:
import json
import time

# Build SNlabelsSANEADO.json with the accounts the API can still resolve.
# The accumulator is deliberately NOT called `list`, so the builtin stays
# usable in every later cell.
with open("SNlabels.json", "r") as json_file:
    data = json.load(json_file)

live_users = []
for user in data:
    # Throttle requests so the API rate limit is not exhausted.
    time.sleep(1)
    try:
        # Explicit screen_name: an all-digit handle must not be read as a user id.
        api.get_user(screen_name=user['screen_name'])
    except Exception as err:
        # Best effort: a failed lookup drops the account, but the reason is
        # reported and KeyboardInterrupt is no longer swallowed.
        print(user['screen_name'], "falló:", repr(err))
        continue
    live_users.append({'screen_name': user['screen_name'], 'label': user['label']})
    print(user['screen_name'], user['label'])

with open("SNlabelsSANEADO.json", "w") as f:
    json.dump(live_users, f)

In [70]:
# Sanity check: print how many entries SNlabelsSANEADO.json holds.
with open("SNlabelsSANEADO.json", "r") as json_file:
    data = json.load(json_file)
print(len(data))

2196


Se obtienen un total de 2196 que no han sido eliminadas, entre las que hay bots y humanos. Todavía hay cuentas que no devolverán resultados (grafos dirigidos) porque llevan inactivas cierto tiempo. Estos casos se eliminarán sobre la marcha.

### 2.3 Clases adicionales

In [30]:
# https://github.com/Gabrieliam/Twitter-graph-classification
# https://towardsdatascience.com/python-detecting-twitter-bots-with-graphs-and-machine-learning-41269205ab07
# https://botometer.osome.iu.edu/bot-repository/datasets.html

import tweepy
import pandas as pd
import numpy as np
import ast
import csv
import sys
import igraph
import json
import operator
import re
import matplotlib.pyplot as plt
import xgboost as xgb
import networkx as nx
from karateclub import Graph2Vec

Clase `TweetGrabber` (fuente: https://gist.github.com/jdmoore7/51c048195b4fa1ddbd2ad3c56598a886):

In [None]:
#https://gist.github.com/jdmoore7/51c048195b4fa1ddbd2ad3c56598a886
class TweetGrabber():
    """Small wrapper around a tweepy API client that dumps tweets to CSV files."""

    def __init__(self,myApi,sApi,at,sAt):
        """Create the authenticated tweepy client.

        Args:
            myApi: consumer key handed to ``tweepy.OAuthHandler``.
            sApi: consumer secret handed to ``tweepy.OAuthHandler``.
            at: access token handed to ``set_access_token``.
            sAt: access token secret handed to ``set_access_token``.
        """
        import tweepy
        self.tweepy = tweepy
        auth = tweepy.OAuthHandler(myApi, sApi)
        auth.set_access_token(at, sAt)
        self.api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    def strip_non_ascii(self,string):
        '''Return *string* keeping only characters whose code point is in 1..126.'''
        return ''.join(c for c in string if 0 < ord(c) < 127)

    def _write_tweets(self, tweets, csv_prefix, fieldnames):
        """Write one CSV row per tweet to ``<csv_prefix>.csv``.

        Only the columns listed in *fieldnames* are extracted, in that order.
        """
        import csv

        # One extractor per supported column; only the requested ones are run.
        extractors = {
            'tweet_id': lambda tw: tw.id_str,
            'tweet_text': lambda tw: self.strip_non_ascii(tw.full_text),
            'date': lambda tw: tw.created_at.strftime('%m/%d/%Y'),
            'user_id': lambda tw: tw.user.id_str,
            'follower_count': lambda tw: tw.user.followers_count,
            'retweet_count': lambda tw: tw.retweet_count,
            'user_mentions': lambda tw: tw.entities['user_mentions'],
        }

        # user_mentions is written unfiltered and may contain non-ASCII text,
        # so the encoding is fixed to UTF-8 instead of the platform default
        # (pandas.read_csv also defaults to UTF-8 when reading the file back).
        with open(f'{csv_prefix}.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for tweet in tweets:
                writer.writerow({name: extractors[name](tweet) for name in fieldnames})

    def keyword_search(self,keyword,csv_prefix):
        """Search tweets matching *keyword* and store them in ``<csv_prefix>.csv``."""
        API_results = self.api.search(q=keyword,rpp=1000,show_user=True,tweet_mode='extended')
        self._write_tweets(
            API_results,
            csv_prefix,
            ['tweet_id', 'tweet_text', 'date', 'user_id', 'follower_count',
             'retweet_count', 'user_mentions'],
        )

    def user_search(self,user,csv_prefix):
        """Store the timeline of *user* in ``<csv_prefix>.csv``.

        The column order matters: RetweetParser reads user_mentions and
        retweet_count by position (columns 4 and 5).
        """
        API_results = self.tweepy.Cursor(self.api.user_timeline,id=user,tweet_mode='extended').items()
        self._write_tweets(
            API_results,
            csv_prefix,
            ['tweet_id', 'tweet_text', 'date', 'user_id', 'user_mentions', 'retweet_count'],
        )

Clase `RetweetParser` (fuente: https://gist.github.com/jdmoore7/f062916705494b51d8d625a910e9ea81):

In [None]:
class RetweetParser():
    """Turn a user's tweet table into a weighted mention edge list on disk.

    Constructing the object does all the work: it reads *data* and writes
    ``<user>.csv`` with the columns user_a, user_b, log_retweet.
    """

    def __init__(self,data,user):
        """Build the edge list and write it to ``<user>.csv``.

        Args:
            data: DataFrame whose column 4 holds the stringified mention list
                and column 5 the retweet count (the layout written by
                TweetGrabber.user_search).
            user: handle of the account; used as the source node and as the
                output file prefix.
        """
        import ast
        import csv
        self.user = user

        edge_list = []

        for _, row in data.iterrows():
            # Explicit positional access: plain row[4] on a label-indexed
            # Series relies on a deprecated positional fallback in pandas.
            raw_mentions = row.iloc[4]
            # Skip rows whose mention field is too short to hold an entry
            # (an empty list is serialised as '[]').
            if len(raw_mentions) <= 5:
                continue
            weight = np.log(row.iloc[5] + 1)
            # Parse the mention list once per tweet instead of once per pair.
            names = [item['screen_name'] for item in ast.literal_eval(raw_mentions)]
            for pos, name in enumerate(names):
                # Edge from the account to every mentioned user...
                edge_list.append((user, name, weight))
                # ...and between every pair of users mentioned together.
                for other in names[pos + 1:]:
                    edge_list.append((name, other, weight))

        with open(f'{self.user}.csv', 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['user_a', 'user_b', 'log_retweet']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for user_a, user_b, log_retweet in edge_list:
                writer.writerow({
                    'user_a': user_a,
                    'user_b': user_b,
                    'log_retweet': log_retweet,
                })

Clase `TweetGraph` (fuente: https://gist.github.com/jdmoore7/074bc5adec23d2291cab945ab62c8a01):

In [None]:
class TweetGraph():
    """Undirected, weighted igraph graph loaded from an edge-list CSV."""

    def __init__(self,edge_list):
        """Read the CSV at *edge_list* and build the graph from its rows."""
        import igraph
        import pandas as pd
        edge_records = pd.read_csv(edge_list).to_records(index=False)
        self.tuple_graph = igraph.Graph.TupleList(edge_records, weights=True, directed=False)

    def e_centrality(self):
        """Return (vertex name, eigenvector centrality) pairs, highest centrality first."""
        scores = self.tuple_graph.eigenvector_centrality()
        centrality_by_name = dict(zip(self.tuple_graph.vs['name'], scores))
        return sorted(centrality_by_name.items(), key=lambda pair: pair[1], reverse=True)

## 3. Programa principal

In [None]:
import csv
import os

# Twitter client used to download each account's timeline. Credentials are
# left blank here: myApi/sApi are the consumer key/secret and at/sAt the
# access token/secret.
t = TweetGrabber(
    myApi = "",
    sApi = "",
    at = "",
    sAt = "")

# Data entry could be implemented instead:
# screen_name = sys.argv[1]

# Load the sanitised account list.
import pandas as pd
df = pd.read_json(r'SNlabelsSANEADO.json')

# Handles of every account in the sanitised JSON.
screen_names = df['screen_name'].tolist()

# Row labels of the accounts for which no graph could be produced. They are
# removed from df in one go after the loop, so the bookkeeping cannot drift
# out of step with the rows.
rows_to_drop = []

for row_label, screen_name in df['screen_name'].items():
    if os.path.exists(screen_name + ".gml"):
        print(screen_name + ".gml already exists.")
        print("*Se ha completado el user " + screen_name +"*")
        continue

    try:
        print("*Trabajando en " + screen_name + "*")
        # Store the account's tweets (with their mentions) in <screen_name>.csv.
        t.user_search(user=screen_name, csv_prefix=screen_name)

        # Load that CSV to feed RetweetParser.
        userFrame = pd.read_csv(screen_name + ".csv")

        # RetweetParser overwrites the CSV with the weighted edge list.
        r = RetweetParser(userFrame, screen_name)

        # An edge list holding only the header means the account never
        # interacts: remove the file and mark the account for removal.
        with open(screen_name + ".csv", newline='') as f:
            lines = sum(1 for _ in csv.reader(f))

        if lines<2:
            rows_to_drop.append(row_label)
            os.remove(screen_name + ".csv")
            print("***Se salta el user " + screen_name + "***")
            continue

        # Build the weighted, undirected igraph graph.
        log_graph = TweetGraph(edge_list=screen_name + ".csv")

        # Add a 'size' attribute to each vertex based on its eigenvector
        # centrality. Multiplying by a constant only makes the plot easier to
        # read; it is applied to all vertices alike.
        for key, value in log_graph.e_centrality():
            log_graph.tuple_graph.vs.find(name=key)['size'] = value*20

        # Save the graph as GML.
        log_graph.tuple_graph.write_gml(f=screen_name+".gml")

        # Save a picture of the graph.
        style = {}
        style["edge_curved"] = False
        style["vertex_label"] = log_graph.tuple_graph.vs['name']
        style["vertex_label_size"] = 5

        igraph.plot(log_graph.tuple_graph, **style, target = screen_name+".png")

    except Exception as err:
        # Report why the account failed instead of hiding it, and drop it as
        # well: SNlabelsV3.json must only keep accounts that have a graph.
        rows_to_drop.append(row_label)
        print(screen_name + " falló: " + repr(err))
        continue

    print("*Se ha completado el user " + screen_name +"*")

# DataFrame.drop returns a new frame, so the result has to be assigned back.
df = df.drop(index=rows_to_drop)
print(len(df))
# SNlabelsV3.json keeps the accounts for which a graph has been created.
df.to_json('SNlabelsV3.json', orient = 'split', compression = 'infer', index = True)

## 4. Entrenamiento del modelo

### 4.1 Creación de grafos

Se crea el grafo embedded para cada @user (bot o humano). Se guarda en un csv:

In [11]:
# cambiar por SNlabelsV3.json cuando se tenga para menos iteraciones

# Per-account embedding frames are collected here and concatenated once at
# the end (DataFrame.append no longer exists in pandas 2.x).
dfm = None
frames = []

with open('SNlabelsSANEADO.json', 'r') as f:
    data = json.load(f)

for i in data:
    screen_name = i['screen_name']
    gml_path = screen_name + ".gml"
    try:
        with open(gml_path, 'r') as igraph_gml:
            lof = igraph_gml.readlines()

        # Label the graph as a multigraph, once. readlines() keeps the
        # trailing newline, so the line is compared after stripping it;
        # otherwise the marker would be inserted again on every run.
        if lof[4].strip() != "multigraph 1":
            lof.insert(4, "multigraph 1\n")
            with open(gml_path, 'w') as igraph_gml:
                igraph_gml.writelines(lof)

        # Read the GML with NetworkX and relabel the nodes with sequential
        # integers, which is the labelling Graph2Vec requires.
        H = nx.read_gml(gml_path, label='name')
        convertedgraph = nx.convert_node_labels_to_integers(H)

        # Graph2Vec with 64 feature columns and otherwise default parameters.
        # NOTE(review): a separate model is fitted for every single graph, so
        # the embeddings of different accounts come from independently
        # trained models. Confirm this is intended.
        embedding_model = Graph2Vec(dimensions=64)
        embedding_model.fit([convertedgraph])
        embeddingsframe = pd.DataFrame(embedding_model.get_embedding())
        # The label is stored in an extra column named '64'.
        embeddingsframe['64'] = i['label']
    except FileNotFoundError:
        # No graph was generated for this account: nothing to embed.
        continue
    except Exception as err:
        # Report the failure instead of hiding it, then move on.
        print(screen_name + " falló: " + repr(err))
        continue
    frames.append(embeddingsframe)

if frames:
    dfm = pd.concat(frames, ignore_index=True, sort=False)
    dfm.to_csv(r'../vectors_and_labels.csv', index=False)
else:
    print("No se generó ningún embedding; no se escribe vectors_and_labels.csv")

In [12]:
# Notebook display of dfm, the frame holding the embeddings and the '64' label column.
dfm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0.012712,0.014901,0.012703,-0.012195,-0.013502,-0.007205,-0.006814,0.007326,-0.010604,-0.013454,...,0.013784,0.005810,-0.014770,-0.003902,0.005710,-0.000912,0.007801,0.005611,0.007043,0
1,0.291312,0.341479,0.291104,-0.279474,-0.309417,-0.165109,-0.156161,0.167894,-0.243019,-0.308323,...,0.315874,0.133135,-0.338477,-0.089411,0.130846,-0.020889,0.178775,0.128592,0.161396,1
2,0.023448,0.027486,0.023431,-0.022495,-0.024905,-0.013290,-0.012569,0.013514,-0.019561,-0.024817,...,0.025425,0.010716,-0.027244,-0.007197,0.010532,-0.001681,0.014390,0.010350,0.012991,0
3,0.161980,0.189875,0.161864,-0.155398,-0.172047,-0.091807,-0.086831,0.093355,-0.135127,-0.171439,...,0.175637,0.074028,-0.188205,-0.049716,0.072755,-0.011615,0.099405,0.071502,0.089742,1
4,0.016057,0.018822,0.016045,-0.015404,-0.017055,-0.009101,-0.008607,0.009254,-0.013395,-0.016994,...,0.017411,0.007338,-0.018656,-0.004928,0.007212,-0.001151,0.009854,0.007088,0.008896,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1907,0.023691,0.027771,0.023674,-0.022729,-0.025164,-0.013428,-0.012700,0.013654,-0.019764,-0.025075,...,0.025689,0.010827,-0.027527,-0.007272,0.010641,-0.001699,0.014539,0.010458,0.013126,0
1908,0.012976,0.015211,0.012967,-0.012449,-0.013783,-0.007355,-0.006956,0.007479,-0.010825,-0.013734,...,0.014070,0.005930,-0.015077,-0.003983,0.005828,-0.000930,0.007963,0.005728,0.007189,1
1909,0.445575,0.522309,0.445258,-0.427470,-0.473268,-0.252542,-0.238855,0.256802,-0.371709,-0.471596,...,0.483143,0.203637,-0.517716,-0.136759,0.200135,-0.031951,0.273445,0.196688,0.246863,1
1910,0.012581,0.014748,0.012572,-0.012070,-0.013363,-0.007131,-0.006744,0.007251,-0.010496,-0.013316,...,0.013642,0.005750,-0.014618,-0.003862,0.005651,-0.000902,0.007721,0.005554,0.006970,0


### 4.2 Entrenamiento del modelo

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

# NOTE(review): the embedding cell writes '../vectors_and_labels.csv' while
# this cell reads from '../tfe (copia)/'. Confirm which copy is the right one.
dataset = pd.read_csv('../tfe (copia)/vectors_and_labels.csv')

# Split the data into inputs (embedding vectors) and target (labels).
X = dataset.drop(columns=['64'])
Y = dataset['64']

# Hold out a test set so the model is scored on labels it has never seen.
seed = 42
test_size = 0.4
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Early stopping needs a validation set to monitor. It is carved out of the
# training data (not the test data) so the final accuracy stays unbiased.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

# Build the classifier and fit it; training stops once the validation metric
# has not improved for 10 consecutive rounds.
model = xgb.XGBClassifier(objective="binary:hinge", random_state=42, learning_rate = 0.05, n_estimators = 5000, early_stopping_rounds = 10)
model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)

# Score the model on the held-out test set and print the accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)




In [None]:
# first neural network with keras tutorial
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

## 5. Predicción para un usuario

### 5.1 Creación del grafo (csv y gml) del usuario

Se crea el grafo del usuario y se procesa para poder realizar la predicción:

In [108]:
# Handle of the account to classify. It must not carry surrounding whitespace:
# the value is sent to the API and used as the prefix of the csv/gml/png files.
screen_name = "Killensteak"

In [None]:
# Twitter client (credentials left blank).
t = TweetGrabber(myApi="", sApi="", at="", sAt="")

csv_path = screen_name + ".csv"

# Download the timeline to <screen_name>.csv, then let RetweetParser
# overwrite that file with the weighted edge list.
t.user_search(user=screen_name, csv_prefix=screen_name)
userFrame = pd.read_csv(csv_path)
r = RetweetParser(userFrame, screen_name)

# Build the graph and size every vertex by its eigenvector centrality.
log_graph = TweetGraph(edge_list=csv_path)
graph = log_graph.tuple_graph
for vertex_name, centrality in log_graph.e_centrality():
    graph.vs.find(name=vertex_name)['size'] = centrality*20

graph.write_gml(f=screen_name+".gml")

# Plot the graph to <screen_name>.png.
style = {
    "edge_curved": False,
    "vertex_label": graph.vs['name'],
    "vertex_label_size": 5,
}
igraph.plot(graph, **style, target = screen_name+".png")

In [111]:
gml_path = screen_name + ".gml"
with open(gml_path, 'r') as igraph_gml:
    lof = igraph_gml.readlines()

# Label the graph as a multigraph, once. readlines() keeps the trailing
# newline, so the line is compared after stripping it; otherwise the marker
# would be inserted again every time this cell is run.
if lof[4].strip() != "multigraph 1":
    lof.insert(4, "multigraph 1\n")
    with open(gml_path, 'w') as igraph_gml:
        igraph_gml.writelines(lof)

# Read the GML with NetworkX and relabel the nodes with sequential integers
# before handing the graph to Graph2Vec.
H = nx.read_gml(gml_path, label='name')
convertedgraph = nx.convert_node_labels_to_integers(H)

# Embed the graph with 64 dimensions and wrap the result in a DataFrame.
embedding_model = Graph2Vec(dimensions=64)
embedding_model.fit([convertedgraph])
embeddingsframe = pd.DataFrame(embedding_model.get_embedding())

In [None]:
# Notebook display of the embedding frame computed for the user.
embeddingsframe

In [None]:
# Notebook display of the handle being classified.
screen_name

In [None]:
# Run the trained classifier on the user's embedding and print "<handle>: <label>".
pred = model.predict(embeddingsframe)
predicted_label = pred[0]
print(f"{screen_name}: {predicted_label}")