In [39]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import gensim.downloader as api
glove_model = api.load("glove-wiki-gigaword-300")
embedding_dim = glove_model.vector_size

In [40]:
print(embedding_dim)

300


In [58]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK resources
nltk.download('punkt')

# Load the first Excel file (news-pal.xlsx)
url_pal = "https://github.com/rosenfa/ai/blob/master/news-pal.xlsx?raw=true"
df_pal = pd.read_excel(url_pal)

# Load the second Excel file (news-israel.xlsx)
url_israel = "https://github.com/rosenfa/ai/blob/master/news-israel.xlsx?raw=true"
df_israel = pd.read_excel(url_israel)

df_israel.dropna(inplace=True)
df_pal.dropna(inplace=True)


# Add a 'target' column to both dataframes
df_pal['target'] = False  # Assuming 'false' for news-pal.xlsx
df_israel['target'] = True   # Assuming 'true' for news-israel.xlsx


# Concatenate the two dataframes
df_combined = pd.concat([df_pal, df_israel], ignore_index=True)
df_combined.dropna(inplace=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Assuming df_combined is your DataFrame

# Define a function to remove stopwords
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in ENGLISH_STOP_WORDS]
    return ' '.join(filtered_words)

# Apply the remove_stopwords function to the 'Content' column
df_combined['Content'] = df_combined['Content'].apply(remove_stopwords)


In [43]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and labels (y)
X = df_combined['Content']
y = df_combined['target']  # Replace 'target_column' with the actual name of your target column

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now you have your training and test sets ready to be used for further processing or modeling


In [44]:
from keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)

In [45]:
X_tra = vectorizer(np.array([[s] for s in X_train])).numpy()
X_te = vectorizer(np.array([[s] for s in X_test])).numpy()

In [46]:
vectorizer.get_vocabulary()[:10]

['',
 '[UNK]',
 'gaza',
 'israeli',
 'israel',
 'hamas',
 'palestinian',
 'war',
 'hospital',
 'israels']

In [47]:
output = vectorizer([["the gaza fired on the man"]])
output.numpy()[0, :6]

array([ 263,    2,  548, 1176,  263,  177])

In [48]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [49]:
test = ["gaza", "fired", "on", "israel"]

In [50]:
vocab_size = len(word_index)
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = 0
misses = 0

# Prepare embedding matrix
for word, i in word_index.items():
    if word in glove_model:
      hits += 1
      embedding_matrix[i] = glove_model[word]
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 3479 words (756 misses)


In [51]:
# print(word_index.items())
# print(print(embedding_matrix[10])) #Note that UNK is like OOV from the Lecture

In [52]:
from keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [53]:
from keras import layers

def createModel():
  int_sequences_input = keras.Input(shape=(None,), dtype="int64")
  embedded_sequences = embedding_layer(int_sequences_input)
  x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
  x = layers.MaxPooling1D(5)(x)
  x = layers.Conv1D(128, 5, activation="relu")(x)
  x = layers.MaxPooling1D(5)(x)
  x = layers.Conv1D(128, 5, activation="relu")(x)
  x = layers.GlobalMaxPooling1D()(x)
  x = layers.Dense(128, activation="relu")(x)
  x = layers.Dropout(0.5)(x)
  preds = layers.Dense(2, activation="sigmoid")(x)
  model = keras.Model(int_sequences_input, preds)
  model.summary()
  return model

In [54]:
model = createModel()
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"],
)
model.fit(X_tra, y_train, batch_size=128, epochs=30, validation_data=(X_te, y_test))

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 300)         1271100   
                                                                 
 conv1d_6 (Conv1D)           (None, None, 128)         192128    
                                                                 
 max_pooling1d_4 (MaxPoolin  (None, None, 128)         0         
 g1D)                                                            
                                                                 
 conv1d_7 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_5 (MaxPoolin  (None, None, 128)         0         
 g1D)                                                      

<keras.src.callbacks.History at 0x78ec1e4c1990>

**Try to explain why this model did as good as it did (really hard to do!!!)  Please add a document and push it separately or add a long text box at the end when you explain what features were in this model and why they did what they to give you your accuracy. **

**The features in this model that help give such a high accuracy was firstly, the transfer learning. We have a model that has already proves succesful. Secondly, we changed the final layer to a sigmoid function for binary classification. Thirdly, we removed teh stopwords which also helped us remove unecessary information which can influence our model negativley.**

Now use Chi squared

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Assuming df_combined is your DataFrame with a column 'Content'
content_data = df_combined['Content']

# Convert text data to a matrix of token counts
vz = CountVectorizer()
X = vz.fit_transform(content_data)

# Apply Chi2 feature selection to get the top 4000 most important words
selector = SelectKBest(score_func=chi2, k=4000)
X_selected = selector.fit_transform(X, df_combined['target'])  # Assuming you have a target column, change it accordingly

# Get the indices of the top 4000 words
selected_word_indices = selector.get_support(indices=True)

# Get the names of the selected words
selected_words = [vz.get_feature_names_out()[i] for i in selected_word_indices]

# Function to filter words in a string
def filter_content(content):
    filtered_words = [word for word in content.split() if word in selected_words]
    return ' '.join(filtered_words)

# Apply the filter_content function to the 'Content' column
df_combined['Content'] = df_combined['Content'].apply(filter_content)

# Split the data into features (X) and labels (y)
X = df_combined['Content']
y = df_combined['target']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_tra = vectorizer(np.array([[s] for s in X_train])).numpy()
X_te = vectorizer(np.array([[s] for s in X_test])).numpy()

In [56]:
model = createModel()
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"],
)
model.fit(X_tra, y_train, batch_size=128, epochs=30, validation_data=(X_te, y_test))

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 300)         1271100   
                                                                 
 conv1d_9 (Conv1D)           (None, None, 128)         192128    
                                                                 
 max_pooling1d_6 (MaxPoolin  (None, None, 128)         0         
 g1D)                                                            
                                                                 
 conv1d_10 (Conv1D)          (None, None, 128)         82048     
                                                                 
 max_pooling1d_7 (MaxPoolin  (None, None, 128)         0         
 g1D)                                                      

<keras.src.callbacks.History at 0x78ec05549720>