In [37]:
# imports

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JennyXu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JennyXu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# Run this cell first to download the necessary NLTK data
import nltk
print("Downloading NLTK data packages...")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
print("Downloads complete.")

Downloading NLTK data packages...
Downloads complete.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JennyXu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JennyXu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JennyXu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
# read data into dataframe

lspi_jira_csv_file_path = Path('data/lspi-jira-ticket-data.csv')

lspi_jira_raw = pd.read_csv(lspi_jira_csv_file_path)
lspi_jira_raw = lspi_jira_raw.iloc[:, :49]

  lspi_jira_raw = pd.read_csv(lspi_jira_csv_file_path)


In [40]:
import pandas as pd
import numpy as np
import re
import contractions
import math

# This script assumes you have a pandas DataFrame named 'lspi_jira_raw' already loaded in your environment.

# --- 1. The Simpler, NLTK-Free Cleaning Function ---
def simple_clean_text(text):
    """
    A simplified cleaning function that does NOT use NLTK.
    It focuses on removing machine-generated noise, which is ideal for LLM pre-processing.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\[\^.*?\]', '', text)
    text = re.sub(r'\[\~accountid:\w+\]', '', text)
    text = re.sub(r'\{color:.*?\}', '', text)
    text = re.sub(r'\[.*?\|.*?\]', '', text)
    text = re.sub(r'\!.*?\!', '', text)
    text = re.sub(r'h\d\.', '', text)
    text = re.sub(r'\{[^{}]*\}', '', text, flags=re.DOTALL)
    text = contractions.fix(text)
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    return cleaned_text

# --- 2. Apply Text Cleaning and Combine Text Columns ---
print("Cleaning and combining 'Summary' and 'Description' columns...")
lspi_jira_raw['Summary'] = lspi_jira_raw['Summary'].astype(str).fillna('')
lspi_jira_raw['Description'] = lspi_jira_raw['Description'].astype(str).fillna('')
lspi_jira_raw['Cleaned_Summary'] = lspi_jira_raw['Summary'].apply(simple_clean_text)
lspi_jira_raw['Cleaned_Description'] = lspi_jira_raw['Description'].apply(simple_clean_text)
lspi_jira_raw['Combined_Text'] = (lspi_jira_raw['Cleaned_Summary'] + ' ' + lspi_jira_raw['Cleaned_Description']).str.strip()

# --- 3. Consolidate Label Columns ---
print("Consolidating label columns...")
# Dynamically find all columns that start with 'Labels'
label_cols = [col for col in lspi_jira_raw.columns if col.startswith('Labels')]

if label_cols:
    # Apply a function row-wise to combine the labels
    lspi_jira_raw['Consolidated_Labels'] = lspi_jira_raw.apply(
        lambda row: ', '.join(row[label_cols].dropna().astype(str).unique()),
        axis=1
    )
    # Replace any empty strings in the new column with 'no_label'
    lspi_jira_raw.loc[lspi_jira_raw['Consolidated_Labels'] == '', 'Consolidated_Labels'] = 'no_label'
    print("Label consolidation complete.")
else:
    print("Warning: No columns starting with 'Labels' were found. Skipping label consolidation.")
    # Create the column with 'no_label' for all rows so the next step doesn't fail
    lspi_jira_raw['Consolidated_Labels'] = 'no_label'


# --- 4. Filter, Chunk, and Save 'no_label' Data to Files ---
print("Exporting 'no_label' data to files...")

# Step 4.1: Filter for 'no_label' rows and get the text
no_label_df = lspi_jira_raw[lspi_jira_raw['Consolidated_Labels'] == 'no_label']
texts_to_save = no_label_df['Combined_Text']

# Step 4.2: Check if there is anything to save
if not texts_to_save.empty:
    # Step 4.3: Calculate chunk size
    num_files = 5
    total_rows = len(texts_to_save)
    # Use math.ceil to ensure all rows are included, even with remainders
    chunk_size = math.ceil(total_rows / num_files)
    print(f"Found {total_rows} rows with 'no_label'. Splitting into up to {num_files} files with max {chunk_size} rows each.")

    # Step 4.4: Loop to create each file
    for i in range(num_files):
        # Define the start and end index for the chunk
        start_index = i * chunk_size
        end_index = start_index + chunk_size
        
        # Get the chunk of text
        chunk = texts_to_save.iloc[start_index:end_index]
        
        # If the chunk is empty (e.g., we have fewer rows than chunks), stop.
        if chunk.empty:
            break
            
        # Define the filename
        filename = f"no_label_chunk_{i+1}.txt"
        
        # Write the chunk to a text file
        with open(filename, 'w', encoding='utf-8') as f:
            # Join all text entries in the chunk with two newlines for separation
            file_content = '\n\n'.join(chunk)
            f.write(file_content)
            
        print(f"Successfully saved {len(chunk)} rows to {filename}")

else:
    print("No rows with 'no_label' were found to export.")

Cleaning and combining 'Summary' and 'Description' columns...
Consolidating label columns...
Label consolidation complete.
Exporting 'no_label' data to files...
Found 1024 rows with 'no_label'. Splitting into up to 5 files with max 205 rows each.
Successfully saved 205 rows to no_label_chunk_1.txt
Successfully saved 205 rows to no_label_chunk_2.txt
Successfully saved 205 rows to no_label_chunk_3.txt
Successfully saved 205 rows to no_label_chunk_4.txt
Successfully saved 204 rows to no_label_chunk_5.txt


In [42]:
# combine labeled ticket csvs

files = [
    'jira-tickets-labeled/ticket_labels.csv',
    'jira-tickets-labeled/ticket_labels_2.csv',
    'jira-tickets-labeled/ticket_labels_3.csv',
    'jira-tickets-labeled/ticket_labels_4.csv',
    'jira-tickets-labeled/ticket_labels_5.csv'
]

master_df = pd.read_csv(files[0])
master_cols = list(master_df.columns)

dfs = [master_df]

for f in files[1:]:
    df = pd.read_csv(f)
    if list(df.columns) != master_cols:
        raise ValueError(f"Column mismatch in {f}: {df.columns.tolist()} vs {master_cols}")
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

combined_df.to_csv('combined.csv', index=False)
print(f"Combined {len(files)} files into combined.csv with {len(combined_df)} rows.")


Combined 5 files into combined.csv with 1024 rows.
