In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import os

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on; packages that are already
# installed are reported as up-to-date and not downloaded again.
nltk.download('punkt')       # Punkt sentence-tokenizer models (legacy pickled format)
# Recent NLTK releases (3.9+) make word_tokenize load its Punkt data from the
# separate 'punkt_tab' package, so fetch it too; without it a fresh
# environment raises LookupError when the pre-processing cell runs.
nltk.download('punkt_tab')
nltk.download('stopwords')   # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')     # WordNet data used by WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hasibullah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hasibullah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hasibullah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:

file_path = 'casey-next-data-short-survey.csv'

# Encodings to try, in order: UTF-8 first, Latin-1 as the fallback.
primary_encoding, fallback_encoding = 'utf-8', 'latin1'

try:
    df = pd.read_csv(file_path, encoding=primary_encoding)
except UnicodeDecodeError:
    # The file is not valid UTF-8; read it again with the fallback encoding.
    df = pd.read_csv(file_path, encoding=fallback_encoding)

# Data Cleaning

## Handle missing values

In [5]:
# Report how many values are missing in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Free-text survey questions; a missing answer in any of these becomes ''
text_columns = [
    'What kind of place would you like Casey to be in 2041?',
    'If you had the power to change just one thing in the City of Casey what would it be?',
    'What three words would you use to describe your Vision for the City of Casey?',
    'What\'s most important to you?',
    'Other'
]
# NOTE(review): the missing-value report also lists 'Unnamed: 3' and
# 'Unnamed: 4' right after the "three words" question. They presumably hold
# the second and third word of that answer and are not part of text_columns,
# so they never reach the corpus -- confirm against the raw CSV.

# Fill all text columns in a single column-wise operation
df[text_columns] = df[text_columns].fillna('')

# Keep only the rows where every demographic field is present
demographic_columns = [
    'What is your suburb?',
    'Postcode',
    'Ward',
    'What is your age?',
    'What is your Gender',
]
df.dropna(subset=demographic_columns, inplace=True)


What kind of place would you like Casey to be in 2041?                                    17
If you had the power to change just one thing in the City of Casey what would it be?      36
What three words would you use to describe your Vision for the City of Casey?            124
Unnamed: 3                                                                               400
Unnamed: 4                                                                               642
What's most important to you?                                                           1580
Other                                                                                   2414
What is your suburb?                                                                      14
Postcode                                                                                 688
Ward                                                                                       5
What is your age?                                                     

## Normalize Text Data

In [6]:
# Lower-case every free-text column in one column-wise pass
df[text_columns] = df[text_columns].apply(lambda column: column.str.lower())

# Text Pre-processing
## Define Pre-processing function

In [7]:
# Shared NLP resources, built once and reused for every response
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean a single free-text survey response.

    Every character that is not an ASCII letter or whitespace is replaced by
    a space, the result is tokenized, English stop words are dropped, and
    each remaining token is lemmatized with WordNet.

    Parameters
    ----------
    text : str
        The raw response. A value that is not a string (for example a NaN
        left behind by a missing answer) is treated as an empty response.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces, or '' if none remain.
    """
    if not isinstance(text, str):
        return ''
    # Substitute a space rather than deleting the character, so words that are
    # separated only by punctuation ("safe,clean", "family-friendly") stay
    # separate tokens instead of fusing into "safeclean" / "familyfriendly".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Drop stop words (compared in lower case, so the filter also works when
    # the caller has not lower-cased the text first) and lemmatize the rest
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.lower() not in stop_words]
    # Join tokens back into a single string
    return ' '.join(tokens)

## Apply the pre-processing function


In [8]:
# Run every response in every free-text column through preprocess_text
df[text_columns] = df[text_columns].apply(lambda column: column.map(preprocess_text))

## Concatenate Text Fields

In [9]:
# Merge each respondent's text answers into one string. Empty answers are
# skipped so they do not leave doubled, leading or trailing spaces (or a
# whitespace-only string when every answer is blank). One value is still
# produced per row, so the row count is unchanged.
df['combined_text'] = df[text_columns].apply(
    lambda answers: ' '.join(answer for answer in answers if answer),
    axis=1,
)

## Prepare Data for LLM Pre-training

In [10]:
# Destination of the corpus: one combined response per line
output_file = 'casey_next_pretraining_corpus.txt'

# Stream every combined response to disk, each terminated by a newline
with open(output_file, 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(text + '\n' for text in df['combined_text'])


In [11]:
# Count the lines in the corpus file. The with-block guarantees the file
# handle is closed as soon as counting is finished.
with open(output_file, 'r', encoding='utf-8') as corpus_file:
    num_lines = sum(1 for _ in corpus_file)
print(f"Number of lines in the pretraining corpus: {num_lines}")

Number of lines in the pretraining corpus: 1757


In [12]:
# Demographic fields exported next to the text corpus, without the index
metadata_columns = [
    'What is your suburb?',
    'Postcode',
    'Ward',
    'What is your age?',
    'What is your Gender',
]
metadata = df.loc[:, metadata_columns]
metadata.to_csv('casey_next_metadata.csv', index=False)
