# Kickstarter Campaign Data Preprocessing
This notebook cleans and prepares Kickstarter campaign data for NLP modeling.

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle


In [None]:
# Load the raw campaign table and keep only the text/outcome pair.
# NOTE(review): `url` is not defined in any cell shown here -- confirm it is
# assigned before this cell runs.
KEPT_COLUMNS = ['blurb', 'final_status']
df = pd.read_csv(url).loc[:, KEPT_COLUMNS]
# Drop rows with a missing value in either column, then exact duplicate rows.
df = df.dropna()
df = df.drop_duplicates()
df.head()

In [None]:
# Clean and normalize text
# Compiled once at definition time so the per-row call does no pattern lookup.
_URL_PATTERN = re.compile(r"http\S+")
_NON_LETTER_PATTERN = re.compile(r"[^a-z\s]")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(text):
    """Normalize a blurb to lowercase a-z words separated by single spaces.

    Steps, in order: lowercase, delete every token starting with ``http``
    (up to the next whitespace), delete every character that is not a-z or
    whitespace, collapse whitespace runs to one space and trim the ends.

    Removed characters are deleted, not replaced by a space, so words joined
    only by punctuation fuse together ("state-of-the-art" becomes
    "stateoftheart").

    Args:
        text: The raw blurb. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing text) are treated as empty.

    Returns:
        The cleaned string; ``""`` when nothing survives or the input is not
        a string.
    """
    # Guard first: .lower() would raise AttributeError on None / NaN.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # URLs go before the letter filter, otherwise "http://a.b" would leave
    # the fused word "httpab" behind.
    text = _URL_PATTERN.sub("", text)
    text = _NON_LETTER_PATTERN.sub("", text)
    return _WHITESPACE_PATTERN.sub(" ", text).strip()

# Run every blurb through clean_text and store the result in a new column,
# leaving the raw 'blurb' column untouched.
df['clean_blurb'] = df['blurb'].map(clean_text)
df.head()

In [None]:
# Turn each cleaned blurb into a fixed-length sequence of integer word ids.
MAX_LEN = 64  # every sequence is padded or cut to this many tokens

clean_blurbs = df['clean_blurb']

# Words not seen during fitting are mapped to the "<OOV>" token.
# NOTE(review): the vocabulary is fit on every row, including rows that the
# later train/test split assigns to the test set -- confirm this is intended.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(clean_blurbs)

sequences = tokenizer.texts_to_sequences(clean_blurbs)

# 'post' for both options: zeros are appended at the end of short sequences
# and long sequences lose their tail rather than their head.
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [None]:
# Binary target: 1 where final_status equals the string 'successful', else 0.
# NOTE(review): assumes final_status holds status strings; if the column is
# numeric (e.g. 0/1) every label comes out as 0 -- confirm against the data.
df['label'] = df['final_status'].eq('successful').astype('int64')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Bundle the fitted tokenizer and the four split arrays into one pickle file.
# The label Series are stored as plain arrays via .values, so their pandas
# index is not saved.
artifacts = {
    "tokenizer": tokenizer,
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train.values,
    "y_test": y_test.values,
}
with open("preprocessing_outputs.pkl", "wb") as out_file:
    pickle.dump(artifacts, out_file)