In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle


# cargar los dos sets de tweets en dataframes
df1 = pd.read_excel('IsisFanboy.xlsx')
df1bis = pd.read_excel('islamicNewsClean.xlsx')

df1 = pd.concat([df1,df1bis])

df2 = pd.read_excel('randomClean.xlsx')



# agregar una columna 'tema' que identifique el tema de cada set de tweets
df1['tema'] = 'terrorismo'
df2['tema'] = 'otro'

# concatenar los dataframes
df = pd.concat([df1, df2])

print(df.head())  # debug

# dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(df['Contenido'].values.astype(str), df['tema'], test_size=0.2, random_state=42)


# crear un vectorizador Tfidf para convertir el texto a vectores numéricos
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# crear un modelo de regresión logística
model = LogisticRegression()

# entrenar el modelo con los datos de entrenamiento
model.fit(X_train_vec, y_train)

# predecir los temas de los tweets de prueba
y_pred = model.predict(X_test_vec)

# evaluar la precisión del modelo
accuracy = accuracy_score(y_test, y_pred)
print('\n ---------------------------------------- \nPrecisión del modelo:', accuracy)

# Guardar el modelo en un archivo 'model.pkl'
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Guardar el vectorizador en un archivo 'vectorizer.pkl'
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


            Fecha          Usuario Displayed name  \
0  1/6/2015 21:07  GunsandCoffee70  GunsandCoffee   
1  1/6/2015 21:27  GunsandCoffee70  GunsandCoffee   
2  1/6/2015 21:29  GunsandCoffee70  GunsandCoffee   
3  1/6/2015 21:37  GunsandCoffee70  GunsandCoffee   
4  1/6/2015 21:45  GunsandCoffee70  GunsandCoffee   

                                           Contenido Ubicacion        tema  
0  'A MESSAGE TO THE TRUTHFUL IN SYRIA - SHEIKH A...       NaN  terrorismo  
1  SHEIKH FATIH AL JAWLANI 'FOR THE PEOPLE OF INT...       NaN  terrorismo  
2  FIRST AUDIO MEETING WITH SHEIKH FATIH AL JAWLA...       NaN  terrorismo  
3  SHEIKH NASIR AL WUHAYSHI (HA), LEADER OF AQAP:...       NaN  terrorismo  
4  AQAP: 'RESPONSE TO SHEIKH BAGHDADIS STATEMENT ...       NaN  terrorismo  

 ---------------------------------------- 
Precisión del modelo: 0.9109201015823403


In [3]:
def clasificar_tweets(lista_tweets):
    # crear vectores numéricos a partir del texto de los tweets
    lista_vec = vectorizer.transform(lista_tweets)

    # hacer predicciones de la clase de los tweets
    predicciones = model.predict(lista_vec)

    # identificar cuáles tweets son de la categoría "es_terrorismo"
    es_terrorismo = predicciones == 'otro'

    # devolver los tweets clasificados como "terrorismo"
    return [t for i, t in enumerate(lista_tweets) if es_terrorismo[i]]

tweets = pd.read_excel("20230315_235314_tweets.xlsx")
print(len(tweets))
tweets = tweets['Contenido'].values.astype(str)
tweets_terrorismo = clasificar_tweets(tweets)
print(len(tweets_terrorismo))

for i, tweet in enumerate(tweets_terrorismo, start=1):
    print(f"{i}. {tweet}")


1000
321
1. You need a good 7 minute track for these walk outs. It's about a mile to the octagon #UFC193
2. bare in mind it was Friday the 13th yesterday.
3. Dirty martini type of night. Happy birthday to my FAV @blamelex 💃🏼❤️😘 @ 45 Jermyn Street London https://t.co/zWt7hjqOw8
4. got a £5 tip tonight which is pretty good for that place
5. Loving Sherbet #DipDabs two more wins with them, this time Spotify vouchers, Wayhey!!
6. We arrive home safe! Thank you @Eurostar for assisting and dealing with this change so well. Grateful
7. @TheCoutinhoRole No brainer: Suarez.
Should have included assist stats.
#LFC
8. @hubcycleworks 😂 Lol Classic! 🚴👍🏼
9. @AndybBenson night 😘
10. J Cole's music is actually flawless. I never skip a Cole song
11. @elthamians Thank you for a terrific reunion tonight. Always lovely to lurk around Central Hall! @simonmcgrahan
12. Hope you're up there partying Steve! You're a guy that will never be forgotten and will always be in our hearts, RIP Steve, never forgotten!
