# **Text Cleaning, Tokenization, and Data Merging Pipeline**

### Description:
This script loads two datasets—`scripts` and `final_summary`. It cleans the summary text in `final_summary` (removing special characters and extra spaces), tokenizes the summaries using the RoBERTa tokenizer, and saves the result to `processed_final_summary.csv`. It then merges the two datasets with an outer join, matching `Movie Name` in `scripts` against `film_id` in `final_summary`, and saves the combined data to `merged_dataset.csv` for further analysis or model input.

In [None]:
import pandas as pd
import re
from transformers import RobertaTokenizer

# Read both input tables. Each one has its "Unnamed: 0" column discarded
# when present (presumably a leftover index column from an earlier export);
# errors='ignore' makes the drop a no-op when the column is absent.
scripts = pd.read_csv('processed_scripts.csv').drop(
    columns=["Unnamed: 0"], errors='ignore'
)
final_summary = pd.read_csv('final_summary.csv').drop(
    columns=["Unnamed: 0"], errors='ignore'
)

# Function to clean text
# Patterns are compiled once here so they are not looked up again per row.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_text(text):
    """Return *text* reduced to ASCII letters, digits and single spaces.

    Every character other than ``a-z``, ``A-Z``, ``0-9`` and whitespace is
    removed (digits are kept; punctuation and non-ASCII letters are
    dropped), runs of whitespace collapse to one space, and leading and
    trailing whitespace is stripped.

    Args:
        text: The string to clean. ``None`` and float NaN are treated as
            missing text. Any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # `text != text` is true only for NaN; this avoids the TypeError that
    # re.sub would raise on a missing value.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = _NON_ALNUM_RE.sub("", text)  # Keep alphanumerics and whitespace
    text = _WHITESPACE_RUN_RE.sub(" ", text)  # Collapse whitespace runs
    return text.strip()

# Apply cleaning to the 'final_summary' column.
# Missing summaries (NaN from read_csv) become "" instead of being passed to
# the regex-based cleaner, which only accepts strings.
final_summary['cleaned_summary'] = final_summary['final_summary'].apply(
    lambda summary: "" if pd.isna(summary) else clean_text(summary)
)

# Initialize RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize cleaned summaries, padding/truncating each one to exactly 512 ids.
# No return_tensors: for a single string the tokenizer then returns plain
# Python lists, which is what gets stored below. This avoids building a
# one-row tensor per summary only to convert it straight back to a list.
tokenized_data = final_summary['cleaned_summary'].apply(
    lambda x: tokenizer(
        x,
        padding='max_length',
        truncation=True,
        max_length=512,
    )
)

# Extract and store token IDs and attention masks (one list per row)
final_summary['input_ids'] = tokenized_data.apply(lambda x: x['input_ids'])
final_summary['attention_mask'] = tokenized_data.apply(lambda x: x['attention_mask'])

# Tokenize text and extract tokens
def tokenize_text(text):
    """Split *text* into sub-word tokens using the module-level ``tokenizer``.

    Args:
        text: The string to tokenize. ``None`` and float NaN are treated as
            missing text.

    Returns:
        The result of ``tokenizer.tokenize(text)``, or an empty list for
        missing input.
    """
    # `text != text` is true only for NaN; missing values are not handed to
    # the tokenizer, which expects a string.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return tokenizer.tokenize(text)  # Tokenize text into subwords

# Store the sub-word tokens of each summary in 'processed_final_summary'.
# NOTE(review): this tokenizes the raw 'final_summary' text, whereas
# 'input_ids' are built from 'cleaned_summary' — confirm that is intended.
final_summary['processed_final_summary'] = final_summary['final_summary'].map(tokenize_text)

# Persist the enriched summary table (without the index column)
final_summary.to_csv('processed_final_summary.csv', index=False)

# Full outer join: keep every row from both tables, pairing rows where
# scripts["Movie Name"] equals final_summary["film_id"].
merged_df = scripts.merge(
    final_summary,
    how="outer",
    left_on="Movie Name",
    right_on="film_id",
)

# Show the joined table
print(merged_df)

# Write the joined table to disk
merged_df.to_csv('merged_dataset.csv', index=False)

# Report the number of missing values per column of the joined table
missing_values = merged_df.isna().sum()
print(missing_values)
