In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
required_nltk_resources = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']

def download_nltk_resources():
    """Download required NLTK resources with error handling."""
    for resource in required_nltk_resources:
        try:
            print(f"Downloading {resource}...")
            nltk.download(resource, quiet=True)
            print(f"Successfully downloaded {resource}")
        except Exception as e:
            print(f"Error downloading {resource}: {e}")
            raise

            
# Download resources
try:
    download_nltk_resources()
except Exception as e:
    print(f"Failed to download required NLTK resources: {e}")
    raise

    
# Load the data
try:
    features_df = pd.read_csv('data/train_features.csv')
    labels_df = pd.read_csv('data/train_labels.csv')
except FileNotFoundError as e:
    print(f"Error loading data files: {e}")
    raise

 

Downloading punkt...
Successfully downloaded punkt
Downloading stopwords...
Successfully downloaded stopwords
Downloading wordnet...
Successfully downloaded wordnet
Downloading omw-1.4...
Successfully downloaded omw-1.4


In [2]:
   
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Preprocess text by converting to lowercase, removing special characters,
    tokenizing, removing stop words, and lemmatizing.
    
    Args:
        text (str): Input text to preprocess
        
    Returns:
        str: Preprocessed text
    """
    try:
        # Convert to lowercase and handle non-string input
        text = str(text).lower()
        
        # Remove special characters and punctuation
        text = re.sub(r'[^\w\s]', '', text)
        
        # Tokenize the text
        tokens = word_tokenize(text)
        
        # Remove stop words and lemmatize
        processed_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        
        return ' '.join(processed_tokens)
    except Exception as e:
        print(f"Error preprocessing text: {e}")
        return ""


In [3]:

try:
    # Combine and preprocess NarrativeLE and NarrativeCME
    print("Starting text preprocessing...")
    features_df['processed_narrative'] = (features_df['NarrativeLE'].fillna('') + ' ' + 
                                        features_df['NarrativeCME'].fillna(''))
    
    
    # Process texts with progress indicator
    total_rows = len(features_df)
    for idx, row in enumerate(features_df['processed_narrative']):
        if idx % 1000 == 0:  # Print progress every 1000 rows
            print(f"Processing row {idx}/{total_rows}")
        features_df.at[idx, 'processed_narrative'] = preprocess_text(row)

        
    # Merge features and labels
    print("\nMerging features and labels...")
    merged_df = pd.merge(features_df, labels_df, on='uid')

    
    # Display the first few rows of the processed data
    print("\nFirst few rows of processed data:")
    print(merged_df[['uid', 'processed_narrative']].head())

    
    # Save the processed data
    print("\nSaving processed data...")
    merged_df.to_csv('processed_data.csv', index=False)
    print("Data preparation complete. Processed data saved to 'processed_data.csv'.")

except Exception as e:
    print(f"Error in data processing pipeline: {e}")
    

Starting text preprocessing...
Processing row 0/4000
Processing row 1000/4000
Processing row 2000/4000
Processing row 3000/4000

Merging features and labels...

First few rows of processed data:
    uid                                processed_narrative
0  aaaf  v xx xx shot motor vehiclethe v mother called ...
1  aaby  v xxxx v found basement residence hanging stra...
2  aacl  v xxxx v found residence unresponsive result g...
3  aacn  victim xx xx recently returned village residin...
4  aadb  xx xx v found deceased home grandparent hangin...

Saving processed data...
Data preparation complete. Processed data saved to 'processed_data.csv'.


In [4]:
import pandas as pd
import numpy as np
import os

def prepare_submission(processed_data_path, submission_format_path='data\submission_format.csv'):
    """
    Prepare processed data according to the required submission format with enhanced NA handling.
    """
    try:
        # Load the processed data and submission format
        print("Loading data files...")
        processed_df = pd.read_csv(processed_data_path)
        submission_template = pd.read_csv(submission_format_path)
        
        print(f"Processed data shape: {processed_df.shape}")
        print(f"Template shape: {submission_template.shape}")
        
        # Check for NA values in processed data
        print("\nNA values in processed data:")
        print(processed_df.isna().sum())
        
        binary_columns = [
            'DepressedMood', 'MentalIllnessTreatmentCurrnt', 'HistoryMentalIllnessTreatmnt',
            'SuicideAttemptHistory', 'SuicideThoughtHistory', 'SubstanceAbuseProblem',
            'MentalHealthProblem', 'DiagnosisAnxiety', 'DiagnosisDepressionDysthymia',
            'DiagnosisBipolar', 'DiagnosisAdhd', 'IntimatePartnerProblem',
            'FamilyRelationship', 'Argument', 'SchoolProblem', 'RecentCriminalLegalProblem',
            'SuicideNote', 'SuicideIntentDisclosed', 'DisclosedToIntimatePartner',
            'DisclosedToOtherFamilyMember', 'DisclosedToFriend'
        ]
        
        categorical_columns = ['InjuryLocationType', 'WeaponType1']
        
        # Create submission dataframe with same structure as template
        submission_df = pd.DataFrame()
        submission_df['uid'] = submission_template['uid']
        
        # Handle binary columns with more aggressive NA handling
        print("\nProcessing binary columns...")
        for col in binary_columns:
            try:
                if col in processed_df.columns:
                    # Replace any non-finite values with 0
                    processed_df[col] = processed_df[col].replace([np.inf, -np.inf], np.nan)
                    processed_df[col] = processed_df[col].fillna(0)
                    # Convert to integer through boolean to ensure 0/1 values
                    submission_df[col] = processed_df[col].astype(float).fillna(0).astype(bool).astype(int)
                    print(f"Processed {col} - unique values: {submission_df[col].unique()}")
                else:
                    print(f"Warning: Column {col} not found in processed data")
                    submission_df[col] = 0
            except Exception as e:
                print(f"Error processing binary column {col}: {e}")
                raise
        
        # Handle categorical columns with more aggressive NA handling
        print("\nProcessing categorical columns...")
        for col in categorical_columns:
            try:
                if col in processed_df.columns:
                    # Replace any non-finite values
                    processed_df[col] = processed_df[col].replace([np.inf, -np.inf], np.nan)
                    
                    if col == 'InjuryLocationType':
                        processed_df[col] = processed_df[col].fillna(1)
                        submission_df[col] = processed_df[col].astype(float).fillna(1).astype(int)
                        submission_df.loc[~submission_df[col].between(1, 6), col] = 1
                    elif col == 'WeaponType1':
                        processed_df[col] = processed_df[col].fillna(12)
                        submission_df[col] = processed_df[col].astype(float).fillna(12).astype(int)
                        submission_df.loc[~submission_df[col].between(1, 12), col] = 12
                    
                    print(f"Processed {col} - unique values: {submission_df[col].unique()}")
                else:
                    print(f"Warning: Column {col} not found in processed data")
                    submission_df[col] = 1 if col == 'InjuryLocationType' else 12
            except Exception as e:
                print(f"Error processing categorical column {col}: {e}")
                raise
        
        # Verify data types
        print("\nVerifying data types...")
        submission_df['uid'] = submission_df['uid'].astype(str)
        for col in binary_columns + categorical_columns:
            submission_df[col] = pd.to_numeric(submission_df[col], errors='coerce').fillna(0).astype(int)
        
        # Ensure column order matches template
        submission_df = submission_df[submission_template.columns]
        
        # Final verifications
        print("\nPerforming final verifications...")
        print("Checking for any remaining NA values:")
        print(submission_df.isna().sum())
        
        assert len(submission_df) == len(submission_template), "Submission has incorrect number of rows"
        
        return submission_df
    
    except Exception as e:
        print(f"Error preparing submission: {e}")
        raise



In [5]:
import os

def save_submission(submission_df, file_name='submission.csv'):
    """
    Save the submission dataframe to a CSV file. Clears the file if it's not empty.
    """
    try:
        # Check if the file exists and is not empty
        if os.path.exists(file_name):
            if os.path.getsize(file_name) > 0:
                print(f"Clearing contents of {file_name} as it is not empty...")
                open(file_name, 'w').close()  # Clear the file
        
        # Save the submission DataFrame to the cleared (or new) file
        submission_df.to_csv(file_name, index=False)
        print(f"Submission file saved as {file_name}")
    except Exception as e:
        print(f"Error saving submission: {e}")
        raise

# Example of calling the functions
try:
    print("Starting submission preparation...")
    print(f"Current working directory: {os.getcwd()}")

    # Check if files exist
    if not os.path.exists('processed_data.csv'):
        print("Warning: processed_data.csv not found in current directory")
    if not os.path.exists('submission_format.csv'):
        print("Warning: submission_format.csv not found in current directory")
    
    submission_df = prepare_submission('processed_data.csv')
    save_submission(submission_df)
    
except Exception as e:
    print(f"Error in submission pipeline: {e}")

    # Print detailed information about the data if available
    if 'submission_df' in locals():
        print("\nSubmission DataFrame Info:")
        print(submission_df.info())

# Check processed data if it exists
if os.path.exists('processed_data.csv'):
    df = pd.read_csv('processed_data.csv')
    print("\nProcessed data info:")
    print(df.info())


Starting submission preparation...
Current working directory: C:\Users\jpark\Downloads
Loading data files...
Processed data shape: (4000, 27)
Template shape: (1000, 24)

NA values in processed data:
uid                             0
NarrativeLE                     0
NarrativeCME                    0
processed_narrative             0
DepressedMood                   0
MentalIllnessTreatmentCurrnt    0
HistoryMentalIllnessTreatmnt    0
SuicideAttemptHistory           0
SuicideThoughtHistory           0
SubstanceAbuseProblem           0
MentalHealthProblem             0
DiagnosisAnxiety                0
DiagnosisDepressionDysthymia    0
DiagnosisBipolar                0
DiagnosisAdhd                   0
IntimatePartnerProblem          0
FamilyRelationship              0
Argument                        0
SchoolProblem                   0
RecentCriminalLegalProblem      0
SuicideNote                     0
SuicideIntentDisclosed          0
DisclosedToIntimatePartner      0
DisclosedToOtherFam