# Project 02: Data Processing - Advanced Data Manipulation and Transformation

This notebook demonstrates advanced data processing techniques including:
- Automatic delimiter detection for CSV files
- Custom scaling and encoding strategies
- Data type conversions and categorical handling
- Regular expressions for data extraction
- One-hot encoding for categorical variables

## Dataset
The project uses `proj2_data.csv` with custom formatting and a scaling reference file `proj2_scale.txt`.

## 1. Import Required Libraries

In [1]:
import csv
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Set up paths (both are relative to the notebook's working directory)
DATA_PATH = Path('data')  # directory the input files are read from
OUTPUT_PATH = Path('output')  # directory the generated pickles/reports are written to
# exist_ok=True keeps this cell safe to re-run when the directory already exists
OUTPUT_PATH.mkdir(exist_ok=True)

print("Libraries imported successfully")

Libraries imported successfully


## 2. Smart CSV Loading with Automatic Delimiter Detection

In [None]:
def detect_csv_delimiter(filepath, possible_delimiters=('|', ';', ',', '\t')):
    """Guess the field delimiter of a CSV file from its first 1024 characters.

    Args:
        filepath: Path of the file to inspect; it is opened as UTF-8 text.
        possible_delimiters: Iterable of single-character candidates. Only
            these characters are considered as delimiters. An empty iterable
            lets the sniffer consider any character.

    Returns:
        The detected delimiter as a one-character string.

    Raises:
        csv.Error: If no delimiter can be determined from the sample.
    """
    # csv.Sniffer only honours a candidate list that is passed to sniff();
    # assigning a `delimiters` attribute on the instance has no effect, so the
    # candidates are joined into the string form that sniff() expects.
    candidates = ''.join(possible_delimiters) or None

    # Read a sample of the file to detect delimiter
    with open(filepath, 'r', encoding='utf-8') as f:
        sample = f.read(1024)

    return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter

# Load the main dataset with automatic delimiter detection
csv_file = DATA_PATH / 'proj2_data.csv'

if csv_file.exists():
    # Sniff the separator first, then hand it to pandas
    delimiter = detect_csv_delimiter(csv_file)
    print(f"Detected delimiter: '{delimiter}'")
    
    # Load with detected delimiter and European decimal format
    # (decimal=',' tells pandas to treat ',' as the decimal point)
    df_original = pd.read_csv(csv_file, delimiter=delimiter, decimal=',')
    
    print(f"Dataset loaded successfully with shape: {df_original.shape}")
    print(f"Columns: {list(df_original.columns)}")
    
    # Save original processing result
    df_original.to_pickle(OUTPUT_PATH / 'original_data.pkl')
else:
    print(f"Warning: {csv_file} not found. Creating sample data for demonstration.")
    # Create sample data for demonstration
    # NOTE: unlike the branch above, this fallback frame is not pickled and
    # `delimiter` is left undefined.
    df_original = pd.DataFrame({
        'category_a': ['low', 'medium', 'high', 'low', 'medium'],
        'category_b': ['poor', 'fair', 'good', 'excellent', 'good'],
        'numeric_text': ['10,5', '20,3', '15,7', '8,2', '12,9'],
        'mixed_col': ['alpha', 'beta', 'gamma', 'alpha', 'beta']
    })
    print("Created sample dataset for demonstration")

Detected delimiter: '|'
Dataset loaded successfully with shape: (12, 11)
Columns: ['full_name', 'field', 'language', 'code', 'task_1', 'task_2', 'task_3', 'tasks_avg', 'task_grade', 'jury_score', 'final_grade']


In [3]:
# Display the loaded data
# The first rows give a quick look at the raw values before any transformation
print("Original Dataset:")
print(df_original.head())
# The dtypes show which columns were parsed as numbers and which stayed as object
print("\nData Types:")
print(df_original.dtypes)

Original Dataset:
          full_name   field   language code  task_1  task_2  task_3  \
0  Rowan Harrington  drones     python  wej     3.1     2.0     4.4   
1        Nash Wyatt  racing       java  sfe     4.2     2.0     2.0   
2    Jadiel Ramirez   media  cplusplus  vaw     4.0     4.9     3.0   
3    Makaila Atkins  racing      swift  ugt     4.1     5.0     4.8   
4    Melanie Fuller  racing     python  owb     2.7     2.0     2.0   

   tasks_avg    task_grade jury_score  final_grade  
0   3.166667   dostateczny    3,5 pts  dostateczny  
1   2.733333  bardzo dobry        5 p       mierny  
2   3.966667         dobry        3.5       mierny  
3   4.633333         dobry          2  dostateczny  
4   2.233333  bardzo dobry      pts 2       mierny  

Data Types:
full_name       object
field           object
language        object
code            object
task_1         float64
task_2         float64
task_3         float64
tasks_avg      float64
task_grade      object
jury_score      o

## 3. Custom Scaling System Implementation

In [None]:
def load_custom_scale(scale_file_path):
    """Read an ordinal scale from a text file that lists one category per line.

    Lines are stripped of surrounding whitespace and blank lines are ignored.
    The remaining categories are ranked 1, 2, 3, ... in file order, so blank
    lines never leave gaps in the ranks. If a category is listed more than
    once, its last position wins.

    Args:
        scale_file_path: Path of the scale file; it is opened as UTF-8 text.

    Returns:
        dict mapping each category name to its 1-based rank. When the file
        does not exist, a message is printed and a default four-level scale
        ('poor' < 'fair' < 'good' < 'excellent') is returned instead.
    """
    try:
        with open(scale_file_path, 'r', encoding='utf-8') as f:
            categories = [line.strip() for line in f]
    except FileNotFoundError:
        print(f"Scale file {scale_file_path} not found. Creating default scale.")
        # Create default ordinal scale for common categories
        categories = ['poor', 'fair', 'good', 'excellent']

    # Rank only the non-empty entries so that skipped lines do not consume a rank
    non_empty = [category for category in categories if category]
    return {category: rank for rank, category in enumerate(non_empty, 1)}

# Load the scaling system
# (load_custom_scale falls back to a built-in default scale if the file is missing)
scale_file = DATA_PATH / 'proj2_scale.txt'
custom_scale = load_custom_scale(scale_file)

# Show the category -> rank mapping in insertion order
print("Custom Scale Mapping:")
for category, value in custom_scale.items():
    print(f"  {category} -> {value}")

Custom Scale Mapping:
  niedostateczny -> 1
  mierny -> 2
  dostateczny -> 3
  dobry -> 4
  bardzo dobry -> 5


In [None]:
def apply_custom_scaling(dataframe, scale_mapping):
    """Replace scale categories with their numeric ranks in every matching column.

    A column is processed when its dtype is 'object' and at least one of its
    non-null values is a key of `scale_mapping`. Values without a mapping are
    left as they were. The input frame is not modified.

    Args:
        dataframe: Source DataFrame.
        scale_mapping: dict mapping category label -> numeric rank.

    Returns:
        Tuple `(scaled_copy, scaled_column_names)`.
    """
    known_labels = set(scale_mapping.keys())
    result = dataframe.copy()
    touched = []

    for name in result.columns:
        series = result[name]
        if series.dtype != 'object':
            continue

        # Only columns sharing at least one value with the scale are rewritten
        present = set(series.dropna().unique())
        if not present.intersection(known_labels):
            continue

        # Unmapped values come back as NaN from map(); restore the originals
        result[name] = series.map(scale_mapping).fillna(series)
        touched.append(name)
        print(f"Applied scaling to column: {name}")

    return result, touched

# Apply custom scaling
# (works on a copy, so df_original keeps its text labels for the later sections)
df_scaled, scaled_cols = apply_custom_scaling(df_original, custom_scale)

print(f"\nColumns scaled: {scaled_cols}")
print("\nScaled Dataset:")
print(df_scaled.head())

# Save scaled data
df_scaled.to_pickle(OUTPUT_PATH / 'custom_scaled_data.pkl')

Applied scaling to column: task_grade
Applied scaling to column: final_grade

Columns scaled: ['task_grade', 'final_grade']

Scaled Dataset:
          full_name   field   language code  task_1  task_2  task_3  \
0  Rowan Harrington  drones     python  wej     3.1     2.0     4.4   
1        Nash Wyatt  racing       java  sfe     4.2     2.0     2.0   
2    Jadiel Ramirez   media  cplusplus  vaw     4.0     4.9     3.0   
3    Makaila Atkins  racing      swift  ugt     4.1     5.0     4.8   
4    Melanie Fuller  racing     python  owb     2.7     2.0     2.0   

   tasks_avg  task_grade jury_score  final_grade  
0   3.166667           3    3,5 pts            3  
1   2.733333           5        5 p            2  
2   3.966667           4        3.5            2  
3   4.633333           4          2            3  
4   2.233333           5      pts 2            2  


## 4. Advanced Categorical Data Handling

In [None]:
def create_categorical_columns(dataframe, scale_mapping):
    """Convert object columns of a copy of `dataframe` to pandas categoricals.

    For each column whose dtype is 'object':
      * if it shares at least one value with `scale_mapping`, it becomes an
        ordered categorical whose categories are the scale labels that occur
        in the column, sorted by their rank;
      * otherwise, if it has at most 10 distinct values, it becomes an
        unordered categorical;
      * otherwise it is left unchanged.

    Args:
        dataframe: Source DataFrame (not modified).
        scale_mapping: dict mapping category label -> numeric rank.

    Returns:
        Tuple `(converted_copy, converted_column_names)`.
    """
    scale_labels = set(scale_mapping.keys())
    result = dataframe.copy()
    converted = []

    for name in result.columns:
        column = result[name]
        if column.dtype != 'object':
            continue

        present = set(column.dropna().unique())

        if present.intersection(scale_labels):
            # Keep only the labels that actually occur, ordered by scale rank
            ordered_labels = sorted(
                (label for label in scale_mapping.keys() if label in present),
                key=scale_mapping.__getitem__,
            )
            result[name] = pd.Categorical(
                column, categories=ordered_labels, ordered=True
            )
            converted.append(name)
            print(f"Created ordered categorical for {name}: {ordered_labels}")
        elif column.nunique() <= 10:
            # Low-cardinality text column without a scale: nominal categorical
            result[name] = pd.Categorical(column)
            converted.append(name)
            print(f"Created nominal categorical for {name}")

    return result, converted

# Create categorical columns
# (starts from df_original, not df_scaled, so the text labels are still present)
df_categorical, cat_cols = create_categorical_columns(df_original, custom_scale)

print(f"\nCategorical columns created: {cat_cols}")
print("\nData types after categorical conversion:")
print(df_categorical.dtypes)

# Save categorical data
df_categorical.to_pickle(OUTPUT_PATH / 'categorical_data.pkl')

Created nominal categorical for field
Created nominal categorical for language
Created ordered categorical for task_grade: ['dostateczny', 'dobry', 'bardzo dobry']
Created ordered categorical for final_grade: ['niedostateczny', 'mierny', 'dostateczny']

Categorical columns created: ['field', 'language', 'task_grade', 'final_grade']

Data types after categorical conversion:
full_name        object
field          category
language       category
code             object
task_1          float64
task_2          float64
task_3          float64
tasks_avg       float64
task_grade     category
jury_score       object
final_grade    category
dtype: object


## 5. Number Extraction from Text Using Regular Expressions

In [None]:
def extract_numbers_from_text(text_value):
    """Return the first number found in a string as a float.

    Both ',' and '.' are accepted as decimal separators ("3,5 pts" -> 3.5).
    When the matched number contains one of each ("1,234.5" or "1.234,5"),
    the first separator is treated as a thousands separator and the last one
    as the decimal point (-> 1234.5).

    Args:
        text_value: Value to parse. Anything that is not a `str` (including
            None/NaN and already-numeric values) is returned unchanged.

    Returns:
        float for the first number found; None when the string contains no
        number or the match cannot be interpreted (for example the same
        separator repeated, as in "1.2.3").
    """
    # None/NaN are not strings either, so this single check covers missing values
    if not isinstance(text_value, str):
        return text_value

    # Pattern to match numbers with optional decimal part (comma or dot)
    number_pattern = r'[-+]?\d*[,.]?\d+(?:[,.]\d+)?'

    # Only the first number is used, so search() is enough
    match = re.search(number_pattern, text_value)
    if match is None:
        return None

    number_str = match.group(0)

    # The pattern can capture two separators. If they differ, the first one is
    # a grouping separator and must be dropped, otherwise float() would reject
    # the value and it would silently be lost.
    separators = [char for char in number_str if char in ',.']
    if len(separators) == 2 and separators[0] != separators[1]:
        number_str = number_str.replace(separators[0], '', 1)

    try:
        return float(number_str.replace(',', '.'))
    except ValueError:
        return None

def convert_text_to_numeric(dataframe):
    """Build `<column>_numeric` columns from the non-numeric columns of a frame.

    Every column that is not of a numeric dtype is passed value by value
    through `extract_numbers_from_text`. A column is kept when at least one
    of its non-null results is an int or float.

    Args:
        dataframe: Source DataFrame (not modified).

    Returns:
        Tuple `(numeric_frame, new_column_names)`. `numeric_frame` contains
        only the newly created columns; when nothing was extracted the result
        is `(pd.DataFrame(), [])`.
    """
    working = dataframe.copy()
    new_names = []

    # Process non-numeric columns
    candidates = working.select_dtypes(exclude=['number']).columns
    for source in candidates:
        parsed = working[source].apply(extract_numbers_from_text)

        # A column counts as converted once any non-null result is numeric
        present = parsed.dropna()
        found_number = (
            len(present) > 0
            and present.apply(lambda item: isinstance(item, (int, float))).any()
        )
        if not found_number:
            continue

        target = f'{source}_numeric'
        working[target] = parsed
        new_names.append(target)
        print(f"Extracted numbers from {source} -> {source}_numeric")

    # Return only the successfully converted numeric columns
    if not new_names:
        return pd.DataFrame(), []
    return working[new_names], new_names

# Extract numbers from text columns
# (df_extracted holds only the new *_numeric columns, not the original ones)
df_extracted, extracted_cols = convert_text_to_numeric(df_original)

# convert_text_to_numeric returns an empty DataFrame when nothing was extracted
if not df_extracted.empty:
    print(f"\nColumns with extracted numbers: {extracted_cols}")
    print("\nExtracted numeric data:")
    print(df_extracted.head())
    print("\nData types of extracted columns:")
    print(df_extracted.dtypes)
    
    # Save extracted numeric data
    df_extracted.to_pickle(OUTPUT_PATH / 'extracted_numeric_data.pkl')
else:
    print("No numeric values were successfully extracted from text columns.")

Extracted numbers from jury_score -> jury_score_numeric

Columns with extracted numbers: ['jury_score_numeric']

Extracted numeric data:
   jury_score_numeric
0                 3.5
1                 5.0
2                 3.5
3                 2.0
4                 2.0

Data types of extracted columns:
jury_score_numeric    float64
dtype: object


## 6. Smart One-Hot Encoding for Categorical Variables

In [None]:
def identify_encoding_candidates(dataframe, scale_mapping, max_categories=10):
    """List the object columns that are suitable for one-hot encoding.

    A column qualifies when its set of non-null values
      * has more than one and at most `max_categories` members,
      * shares no value with the keys of `scale_mapping`, and
      * consists only of purely alphabetic strings (`str.isalpha()`).

    Args:
        dataframe: DataFrame to inspect.
        scale_mapping: dict whose keys are the labels of the custom scale.
        max_categories: Largest number of distinct values still accepted.

    Returns:
        list of qualifying column names, in column order.
    """
    scale_labels = set(scale_mapping.keys())

    def _qualifies(values):
        # Needs variation, but not too many categories
        if not 1 < len(values) <= max_categories:
            return False
        # Columns covered by the custom scale are excluded
        if values.intersection(scale_labels):
            return False
        # Text only
        return all(isinstance(item, str) and item.isalpha() for item in values)

    selected = []
    for name in dataframe.columns:
        if dataframe[name].dtype != 'object':
            continue

        distinct = set(dataframe[name].dropna().unique())
        if _qualifies(distinct):
            selected.append(name)
            print(f"Column '{name}' identified for one-hot encoding")
            print(f"  Unique values: {sorted(distinct)}")

    return selected

def create_one_hot_encodings(dataframe, columns_to_encode):
    """One-hot encode each requested column into its own DataFrame.

    For every column a dummy frame is built with `pd.get_dummies` (column
    name as prefix, missing values not encoded), a short summary is printed,
    and the frame is pickled to `OUTPUT_PATH / 'one_hot_<column>.pkl'`.

    Args:
        dataframe: Source DataFrame.
        columns_to_encode: Iterable of column names to encode.

    Returns:
        dict mapping column name -> encoded DataFrame.
    """
    def _encode(name):
        source = dataframe[name]
        dummies = pd.get_dummies(source, prefix=name, prefix_sep='_', dummy_na=False)

        print(f"\nOne-hot encoding for '{name}':")
        print(f"  Original shape: {source.shape}")
        print(f"  Encoded shape: {dummies.shape}")
        print(f"  New columns: {list(dummies.columns)}")

        # Save individual encoded DataFrame
        dummies.to_pickle(OUTPUT_PATH / f'one_hot_{name}.pkl')
        return dummies

    return {name: _encode(name) for name in columns_to_encode}

# Identify and encode categorical columns
# (columns that contain scale labels are excluded by identify_encoding_candidates)
columns_to_encode = identify_encoding_candidates(df_original, custom_scale)

if columns_to_encode:
    print(f"\nColumns selected for one-hot encoding: {columns_to_encode}")
    
    # Create one-hot encodings
    # (each encoded frame is also pickled to OUTPUT_PATH by create_one_hot_encodings)
    encoded_dataframes = create_one_hot_encodings(df_original, columns_to_encode)
    
    # Display results
    for col, encoded_df in encoded_dataframes.items():
        print(f"\nEncoded DataFrame for '{col}':")
        print(encoded_df.head())
else:
    print("No suitable columns found for one-hot encoding.")

Column 'field' identified for one-hot encoding
  Unique values: ['drones', 'media', 'racing', 'robotics']
Column 'language' identified for one-hot encoding
  Unique values: ['cplusplus', 'java', 'python', 'swift']

Columns selected for one-hot encoding: ['field', 'language']

One-hot encoding for 'field':
  Original shape: (12,)
  Encoded shape: (12, 4)
  New columns: ['field_drones', 'field_media', 'field_racing', 'field_robotics']

One-hot encoding for 'language':
  Original shape: (12,)
  Encoded shape: (12, 4)
  New columns: ['language_cplusplus', 'language_java', 'language_python', 'language_swift']

Encoded DataFrame for 'field':
   field_drones  field_media  field_racing  field_robotics
0          True        False         False           False
1         False        False          True           False
2         False         True         False           False
3         False        False          True           False
4         False        False          True           False

E

## 7. Comprehensive Data Processing Pipeline

In [None]:
def comprehensive_data_processing(dataframe, scale_mapping):
    """Run the four processing stages on the same input and collect the outputs.

    Each stage works from `dataframe` itself; the stages are not chained.

    Args:
        dataframe: Source DataFrame.
        scale_mapping: dict mapping category label -> numeric rank.

    Returns:
        dict with the keys
          * 'original'          - copy of the input,
          * 'scaled'            - result of `apply_custom_scaling`,
          * 'categorical'       - result of `create_categorical_columns`,
          * 'extracted_numeric' - result of `convert_text_to_numeric`
                                  (only when it is not empty),
          * 'one_hot'           - dict of encoded frames from
                                  `create_one_hot_encodings` (only when
                                  encoding candidates were found).
    """
    results = {'original': dataframe.copy()}

    # Step 1: Apply custom scaling
    print("1. Applying custom scaling...")
    results['scaled'] = apply_custom_scaling(dataframe, scale_mapping)[0]

    # Step 2: Create categorical versions
    print("\n2. Creating categorical columns...")
    results['categorical'] = create_categorical_columns(dataframe, scale_mapping)[0]

    # Step 3: Extract numeric values
    print("\n3. Extracting numeric values from text...")
    extracted = convert_text_to_numeric(dataframe)[0]
    if not extracted.empty:
        results['extracted_numeric'] = extracted

    # Step 4: One-hot encoding
    print("\n4. Creating one-hot encodings...")
    candidates = identify_encoding_candidates(dataframe, scale_mapping)
    if candidates:
        results['one_hot'] = create_one_hot_encodings(dataframe, candidates)

    return results

# Run comprehensive processing
all_results = comprehensive_data_processing(df_original, custom_scale)

# Summary of results
print("\n" + "="*50)
print("PROCESSING PIPELINE SUMMARY")
print("="*50)

# Entries are either a single DataFrame or, for 'one_hot', a dict of DataFrames
for step_name, result in all_results.items():
    if isinstance(result, pd.DataFrame):
        print(f"{step_name.upper()}:")
        print(f"  Shape: {result.shape}")
        print(f"  Columns: {list(result.columns)}")
        print(f"  Data types: {dict(result.dtypes)}")
    elif isinstance(result, dict):
        print(f"{step_name.upper()}:")
        # One line per encoded column: its shape and the dummy column names
        for sub_name, sub_df in result.items():
            print(f"  {sub_name}: {sub_df.shape} - {list(sub_df.columns)}")
    print()

Starting comprehensive data processing pipeline...

1. Applying custom scaling...
Applied scaling to column: task_grade
Applied scaling to column: final_grade

2. Creating categorical columns...
Created nominal categorical for field
Created nominal categorical for language
Created ordered categorical for task_grade: ['dostateczny', 'dobry', 'bardzo dobry']
Created ordered categorical for final_grade: ['niedostateczny', 'mierny', 'dostateczny']

3. Extracting numeric values from text...
Extracted numbers from jury_score -> jury_score_numeric

4. Creating one-hot encodings...
Column 'field' identified for one-hot encoding
  Unique values: ['drones', 'media', 'racing', 'robotics']
Column 'language' identified for one-hot encoding
  Unique values: ['cplusplus', 'java', 'python', 'swift']

One-hot encoding for 'field':
  Original shape: (12,)
  Encoded shape: (12, 4)
  New columns: ['field_drones', 'field_media', 'field_racing', 'field_robotics']

One-hot encoding for 'language':
  Original

## 8. Data Quality Assessment and Validation

In [None]:
def assess_data_quality(processing_results):
    """Compute and print simple quality metrics for every DataFrame result.

    Args:
        processing_results: dict of step name -> result. It must contain the
            key 'original' (a DataFrame), which is the row-count baseline.
            Entries that are not DataFrames, and the 'one_hot' entry, are
            skipped.

    Returns:
        dict of step name -> metrics dict with the keys
          * 'rows', 'columns'    - frame dimensions,
          * 'completeness_pct'   - share of non-null cells, in percent
                                   (100.0 for a frame without any cells),
          * 'data_retention_pct' - rows relative to 'original', in percent
                                   (100.0 when 'original' has no rows),
          * 'memory_usage_mb'    - deep memory usage in MiB.
        All values are plain Python ints/floats so the report can be written
        with `json.dump`.

    Raises:
        KeyError: If `processing_results` has no 'original' entry.
    """
    quality_report = {}
    original_rows = len(processing_results['original'])

    print("DATA QUALITY ASSESSMENT")
    print("="*40)

    for step_name, result in processing_results.items():
        if not isinstance(result, pd.DataFrame) or step_name == 'one_hot':
            continue

        # A frame without cells would give a NaN mean, which is not valid JSON
        if result.size:
            completeness = (1 - result.isnull().mean().mean()) * 100
        else:
            completeness = 100.0

        # Guard against dividing by zero when the baseline frame is empty
        if original_rows:
            data_retention = len(result) / original_rows * 100
        else:
            data_retention = 100.0

        memory_mb = float(result.memory_usage(deep=True).sum()) / 1024**2

        quality_metrics = {
            'rows': len(result),
            'columns': len(result.columns),
            'completeness_pct': round(float(completeness), 2),
            'data_retention_pct': round(float(data_retention), 2),
            'memory_usage_mb': round(memory_mb, 2)
        }

        quality_report[step_name] = quality_metrics

        print(f"\n{step_name.upper()}:")
        for metric, value in quality_metrics.items():
            print(f"  {metric}: {value}")

    return quality_report

# Assess data quality
quality_assessment = assess_data_quality(all_results)

# Save quality report
# (`json` is imported in the notebook's import cell together with the other
# dependencies, so this cell only does the work)
with open(OUTPUT_PATH / 'quality_assessment.json', 'w', encoding='utf-8') as f:
    json.dump(quality_assessment, f, indent=2)

print("\nQuality assessment saved to output/quality_assessment.json")

DATA QUALITY ASSESSMENT

ORIGINAL:
  rows: 12
  columns: 11
  completeness_pct: 100.0
  data_retention_pct: 100.0
  memory_usage_mb: 0.01

SCALED:
  rows: 12
  columns: 11
  completeness_pct: 100.0
  data_retention_pct: 100.0
  memory_usage_mb: 0.0

CATEGORICAL:
  rows: 12
  columns: 11
  completeness_pct: 100.0
  data_retention_pct: 100.0
  memory_usage_mb: 0.0

EXTRACTED_NUMERIC:
  rows: 12
  columns: 1
  completeness_pct: 83.33
  data_retention_pct: 100.0
  memory_usage_mb: 0.0

Quality assessment saved to output/quality_assessment.json
