In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import gensim.downloader as api
# Load the pre-trained "glove-wiki-gigaword-200" word vectors through gensim's downloader API
glove_model = api.load("glove-wiki-gigaword-200")
# Width of each word vector; reused below to size the embedding matrix and layer
embedding_dim = glove_model.vector_size



In [None]:
# Sanity check on the embedding width (the run recorded below printed 200)
print(embedding_dim)

200


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK 'punkt' tokenizer data
nltk.download('punkt')

# Source spreadsheets, one per news collection
url_pal = "https://github.com/rosenfa/ai/blob/master/news-pal.xlsx?raw=true"
url_israel = "https://github.com/rosenfa/ai/blob/master/news-israel.xlsx?raw=true"

# Read each sheet, drop rows with any missing cell, then attach the class label
# (False -> news-pal.xlsx, True -> news-israel.xlsx)
df_pal = pd.read_excel(url_pal).dropna().assign(target=False)
df_israel = pd.read_excel(url_israel).dropna().assign(target=True)

# Stack both collections under a fresh 0..n-1 index, then drop any row that
# still contains a missing value after the concatenation
df_combined = pd.concat([df_pal, df_israel], ignore_index=True).dropna()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Assuming df_combined is your DataFrame

# Define a function to remove stopwords
def remove_stopwords(text, stopwords=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, each token is compared
    case-insensitively (via ``token.lower()``) against the stop-word
    collection, and the surviving tokens are re-joined with single spaces.
    Punctuation is not stripped, so a token such as ``"the,"`` is kept.

    Args:
        text: String to filter.
        stopwords: Optional collection of lower-case stop words. When omitted,
            scikit-learn's ``ENGLISH_STOP_WORDS`` is used, which matches the
            previously hard-coded behaviour.

    Returns:
        The filtered text as a single space-separated string.
    """
    if stopwords is None:
        # Resolve the default lazily so callers can supply their own list
        stopwords = ENGLISH_STOP_WORDS
    return ' '.join(
        token for token in text.split() if token.lower() not in stopwords
    )

# Strip stop words from every article body, overwriting the 'Content' column
df_combined['Content'] = df_combined['Content'].map(remove_stopwords)


In [None]:
from sklearn.model_selection import train_test_split

# Features are the article text; labels are the boolean 'target' column
X, y = df_combined['Content'], df_combined['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from keras.layers import TextVectorization

# Text-to-integer layer: vocabulary capped at 20000 tokens, every output row has length 200
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
# Feed the training texts in batches of 128 so the vocabulary is learned from the training split only
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)

In [None]:
def _texts_to_token_ids(texts):
    """Run ``texts`` through ``vectorizer`` and return the ids as a NumPy array."""
    column = np.array([[doc] for doc in texts])  # one string per row
    return vectorizer(column).numpy()


# Replace the raw strings in each split with their integer token-id arrays.
# NOTE: X_train / X_test are overwritten here, so this cell is meant to run
# once per kernel session.
X_train = _texts_to_token_ids(X_train)
X_test = _texts_to_token_ids(X_test)

In [None]:
# Peek at the first ten vocabulary entries; in the recorded output the first two are '' and '[UNK]'
vectorizer.get_vocabulary()[:10]

['',
 '[UNK]',
 'gaza',
 'israeli',
 'israel',
 'hamas',
 'palestinian',
 'war',
 'hospital',
 'israels']

In [None]:
# Vectorize one ad-hoc sentence to see how its words map to vocabulary ids
output = vectorizer([["the gaza fired on the man"]])
# Show the first six ids of that single row
output.numpy()[0, :6]

array([ 263,    2,  548, 1176,  263,  177])

In [None]:
# Vocabulary list learned by the vectorizer, in index order
voc = vectorizer.get_vocabulary()
# Reverse lookup: token -> its position in the vocabulary
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# Sample tokens, presumably for ad-hoc lookups — not referenced by any cell visible here (TODO confirm or remove)
test = ["gaza", "fired", "on", "israel"]

In [None]:
vocab_size = len(word_index)
# Two rows more than the vocabulary; word_index only covers 0..len(voc)-1,
# so the surplus rows are never written and stay all-zero.
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits, misses = 0, 0

# Copy the GloVe vector of every vocabulary word that GloVe knows into its row;
# words without a GloVe entry keep their all-zero row and count as a miss.
for word, i in word_index.items():
    if word not in glove_model:
        misses += 1
        continue
    embedding_matrix[i] = glove_model[word]
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 3479 words (756 misses)


In [None]:
# print(word_index.items())
# print(embedding_matrix[10])  # Note: [UNK] plays the role of OOV (out-of-vocabulary) from the lecture

In [None]:
from keras.layers import Embedding

# Embedding lookup whose weights start as the GloVe-derived matrix built above
embedding_layer = Embedding(
    num_tokens,  # number of rows in the lookup table
    embedding_dim,  # width of each embedding vector
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)

In [None]:
from tensorflow.keras import layers

# Input: integer token-id sequences of unspecified length
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Three Conv1D stages (128 filters, kernel width 5); the first two are followed
# by max-pooling, the last by a global max-pool that collapses the sequence axis.
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# Dense classification head with dropout for regularisation
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Two mutually exclusive classes trained with sparse categorical cross-entropy:
# the head must emit a probability distribution over the classes, so use
# softmax. Sigmoid would score each unit independently and the two outputs
# would not sum to 1.
preds = layers.Dense(2, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 200)         847400    
                                                                 
 conv1d_3 (Conv1D)           (None, None, 128)         128128    
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, None, 128)         0         
 g1D)                                                            
                                                                 
 conv1d_4 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_3 (MaxPoolin  (None, None, 128)         0         
 g1D)                                                      

In [None]:
# Labels are single class ids per sample (the boolean 'target' column), not one-hot
# vectors, hence the sparse variant of categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"],
)
# NOTE(review): the held-out test split is also passed as validation data, so the
# per-epoch validation metrics are computed on the test set.
model.fit(X_train, y_train, batch_size=128, epochs=30, validation_data=(X_test, y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7ee13c1fe980>