In [None]:
# Mount Google Drive (Colab only) so the CSV files read later from
# /content/drive/MyDrive are reachable on the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nltk
# Download the "punkt" tokenizer data.
# NOTE(review): no visible cell calls nltk afterwards (sentence splitting is done
# with spaCy) — confirm whether this download is still needed.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
import spacy
import random

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:

# Load the validation, test and training splits from the mounted Drive folder.
# The training preview shows the columns: id, article, highlights.
valid = pd.read_csv("/content/drive/MyDrive/cnnvalidation.csv")
test = pd.read_csv("/content/drive/MyDrive/cnntest.csv")
train = pd.read_csv("/content/drive/MyDrive/cnntrain.csv")

In [None]:
# Preview the first rows of the training split to check the loaded columns.
train.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:


# Shared spaCy pipeline; preprocess_text() uses it to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Split a text into sentences using the module-level spaCy pipeline ``nlp``.

    Args:
        text: The string to segment.

    Returns:
        list[str]: One string per sentence reported by ``doc.sents``, in
        document order.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(str(span))
    return sentences


def preprocess_dataset(n_samples=100):
  """Sentence-split the leading rows of every dataset split.

  Reads the module-level ``train``, ``test`` and ``valid`` DataFrames and applies
  ``preprocess_text`` to their ``article`` and ``highlights`` columns.

  Args:
    n_samples: Number of leading rows to process per split. Defaults to 100,
      the value that used to be hard-coded. Pass ``None`` to process every row.

  Returns:
    tuple: ``(train_article, train_highlights, test_article, test_highlights,
    valid_article, valid_highlights)``; each element is a pandas Series whose
    values are lists of sentence strings.
  """
  processed = []
  # Iteration order fixes the order of the returned tuple: split first, then column.
  for frame in (train, test, valid):
    for column in ("article", "highlights"):
      processed.append(frame[column][:n_samples].apply(preprocess_text))
  return tuple(processed)



# Build the sentence-split train/test/validation series at module level.
# NOTE(review): main() calls preprocess_dataset() again, so this work is repeated
# whenever main() runs.
train_article, train_highlights, test_article, test_highlights, valid_article, valid_highlights = preprocess_dataset()

In [None]:

# Function to create training data
def create_training_data(article, highlights):
    """Label each article sentence by whether it also appears in the highlights.

    Each sentence of ``article`` yields exactly one sample: label 1 when the
    sentence is contained in ``highlights``, label 0 otherwise. Previously every
    sentence was emitted with label 1 and the non-highlight sentences were added
    a second time with label 0, so most sentences carried contradictory labels.

    Args:
        article: Iterable of sentence strings from one article.
        highlights: Container of sentence strings from that article's highlights.
            Membership is tested with ``in``, i.e. by exact equality for a list.

    Returns:
        tuple: ``(sentences, labels)`` as two equally long tuples in a random
        (shuffled) order. Both are empty tuples when ``article`` is empty.
    """
    samples = [(sentence, 1 if sentence in highlights else 0) for sentence in article]

    # zip(*[]) yields nothing to unpack, so an empty article is handled explicitly.
    if not samples:
        return (), ()

    random.shuffle(samples)
    sentences, labels = zip(*samples)
    return sentences, labels

def prepare_training_data(train_article, train_highlights,valid_article, valid_highlights):
  """Flatten per-article samples into training and validation sentence/label lists.

  Every (article, highlights) pair is passed to ``create_training_data`` and the
  resulting sentences and labels are concatenated. All supplied pairs are used;
  the earlier version always indexed positions 0..99, which failed for fewer
  than 100 rows and silently ignored any beyond that.

  Args:
    train_article: Sequence (list or pandas Series) of sentence lists, one per
      training article.
    train_highlights: Matching sequence of highlight sentence lists.
    valid_article: Same as ``train_article`` for the validation split.
    valid_highlights: Same as ``train_highlights`` for the validation split.

  Returns:
    tuple: ``(train_sentences, train_labels, val_sentences, val_labels)``, four
    flat lists.
  """
  train_sentences, train_labels = _flatten_samples(train_article, train_highlights)
  val_sentences, val_labels = _flatten_samples(valid_article, valid_highlights)
  return train_sentences, train_labels, val_sentences,val_labels


def _flatten_samples(articles, highlights):
  """Run create_training_data on each (article, highlights) pair and concatenate the results."""
  sentences, labels = [], []
  # zip pairs the two sequences positionally and stops at the shorter one.
  for article_sentences, highlight_sentences in zip(articles, highlights):
    pair_sentences, pair_labels = create_training_data(article_sentences, highlight_sentences)
    sentences.extend(pair_sentences)
    labels.extend(pair_labels)
  return sentences, labels

# Module-level training/validation samples.
# NOTE(review): when main() calls train_model() it does not pass labels, so
# train_model() picks up `train_labels` / `val_labels` from module scope — i.e.
# whatever these names are bound to at that moment.
train_sentences, train_labels, val_sentences,val_labels = prepare_training_data(train_article, train_highlights,valid_article, valid_highlights)

In [None]:





# Define the model architecture
#max_sequence = max(len(i) for i in (train_article+train_highlights))
#input_layer = Input(shape=(149,100))  # Variable input shape

#bi_lstm = Bidirectional(LSTM(128, return_sequences=True))(input_layer)
#output_layer = Dense(1, activation='sigmoid')(bi_lstm)

#model = tf.keras.Model(inputs=input_layer, outputs=output_layer)


def create_model(max_len=100, vocab_size=1000, embedding_dim=128, lstm_units=128):
  """Build and compile a BiLSTM binary classifier over token-id sequences.

  Args:
    max_len: Length of each padded input sequence. Must match the sequence
      length produced by the tokenizer (default 100).
    vocab_size: Size of the embedding vocabulary. Must be at least the
      tokenizer's ``max_tokens`` (default 1000).
    embedding_dim: Width of the token embeddings (default 128).
    lstm_units: Units per LSTM direction (default 128).

  Returns:
    A compiled Keras ``Model`` mapping ``(batch, max_len)`` integer inputs to a
    ``(batch, 1)`` sigmoid probability.
  """
  input_layer = Input(shape=(max_len,))
  embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
  # return_sequences stays at its default (False) so the BiLSTM emits a single
  # vector per sentence. With return_sequences=True the Dense layer produced one
  # prediction per token, which does not match the one-label-per-sentence targets.
  bi_lstm = Bidirectional(LSTM(lstm_units))(embedding_layer)
  output_layer = Dense(1, activation='sigmoid')(bi_lstm)

  model = Model(inputs=input_layer, outputs=output_layer)

  # Binary cross-entropy pairs with the single sigmoid output unit.
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model


In [None]:

def tokenizing_data(train_sentences,val_sentences, train_labels, val_labels,
                    max_tokens=1000, max_len=100):
  """Convert sentences to fixed-length integer sequences.

  A ``TextVectorization`` layer is fitted on the training sentences only and
  then applied to both splits.

  Args:
    train_sentences: Sequence of training sentence strings.
    val_sentences: Sequence of validation sentence strings.
    train_labels: Accepted for call compatibility; not used or returned. The
      previous version converted it to an array locally and then discarded it.
    val_labels: Accepted for call compatibility; not used or returned.
    max_tokens: Vocabulary size cap for the vectorizer (default 1000).
    max_len: Output sequence length (default 100).

  Returns:
    tuple: ``(train_sequences, val_sequences)`` as returned by
    ``pad_sequences``.
  """
  tokenizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=max_len)
  tokenizer.adapt(np.array(train_sentences))

  train_sequences = tokenizer(np.array(train_sentences))
  val_sequences = tokenizer(np.array(val_sentences))

  # The vectorizer is already configured with output_sequence_length=max_len, so
  # this call is presumably redundant for the values; it is kept so the returned
  # object type stays the same as before.
  train_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
  val_sequences = pad_sequences(val_sequences, maxlen=max_len, padding='post', truncating='post')
  return train_sequences,val_sequences


In [None]:
def train_model(train_sequences, val_sequences,model, train_labels=None, val_labels=None,
                batch_size=32, epochs=10):
  """Fit ``model`` on the training sequences and validate on the validation ones.

  Args:
    train_sequences: Training inputs passed to ``model.fit`` as ``x``.
    val_sequences: Validation inputs.
    model: Object exposing a Keras-style ``fit`` method.
    train_labels: Training targets. When omitted, the module-level
      ``train_labels`` is used, which is what this function always did before.
    val_labels: Validation targets. When omitted, the module-level
      ``val_labels`` is used.
    batch_size: Mini-batch size (default 32).
    epochs: Number of epochs (default 10).

  Returns:
    Whatever ``model.fit`` returns (the training history).

  Raises:
    KeyError: If labels are omitted and the module-level fallback is undefined.
  """
  # Backward compatibility: existing callers pass no labels and rely on the
  # module-level names. Prefer passing labels explicitly so they are guaranteed
  # to line up with the sequences.
  if train_labels is None:
    train_labels = globals()["train_labels"]
  if val_labels is None:
    val_labels = globals()["val_labels"]

  # Coerce to (N, 1) arrays; labels may arrive as plain Python lists.
  train_targets = np.asarray(train_labels).reshape(-1, 1)
  val_targets = np.asarray(val_labels).reshape(-1, 1)

  history = model.fit(
      train_sequences, train_targets,
      validation_data=(val_sequences, val_targets),
      batch_size=batch_size,
      epochs=epochs,
      verbose=1
  )
  return history

In [None]:
def evaluate_model(model,val_sequences,val_labels):
  """Evaluate ``model`` on the validation data and print loss and accuracy.

  Args:
    model: Object exposing a Keras-style ``evaluate(x, y)`` method whose result
      is indexable, with the loss at position 0 and the accuracy at position 1.
    val_sequences: Validation inputs.
    val_labels: Validation targets.

  Returns:
    None. The two metrics are written to standard output.
  """
  metrics = model.evaluate(val_sequences, val_labels)

  # Both values are looked up before anything is printed.
  report = (
      ("Validation Loss:", metrics[0]),
      ("Validation Accuracy:", metrics[1]),
  )
  for caption, value in report:
    print(caption, value)

In [None]:
def main():
  """Run the full pipeline: preprocess, build samples, tokenize, train, evaluate.

  Side effects:
    Rebinds the module-level ``train_labels`` and ``val_labels`` to the labels
    built in this run, as ``(N, 1)`` NumPy arrays.
  """
  # train_model() is called without labels and resolves `train_labels` /
  # `val_labels` from module scope. The samples are reshuffled on every
  # prepare_training_data() call, so the labels built here must be published to
  # module scope; otherwise the model would be fitted against labels from an
  # earlier, differently shuffled run.
  global train_labels, val_labels

  train_article, train_highlights, test_article, test_highlights, valid_article, valid_highlights = preprocess_dataset()
  train_sentences, train_labels, val_sentences,val_labels = prepare_training_data(train_article, train_highlights,valid_article, valid_highlights)

  # One target per sentence, shaped (N, 1), as arrays rather than Python lists.
  train_labels = np.asarray(train_labels).reshape(-1, 1)
  val_labels = np.asarray(val_labels).reshape(-1, 1)

  model = create_model()
  train_sequences,val_sequences = tokenizing_data(train_sentences,val_sentences, train_labels, val_labels)
  train_model(train_sequences, val_sequences,model)
  evaluate_model(model,val_sequences, val_labels)
main()