In [53]:
import pandas as pd
import matplotlib.pyplot as plt

In [60]:
# set the display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [61]:
# read the csv file
data = pd.read_csv('Students.csv', encoding='latin-1')

In [62]:
# data shape
print(f"Data shape: {data.shape}")

Data shape: (3614, 16)


In [63]:
# Check duplicates in original data
print("=== DUPLICATE ANALYSIS ===")
duplicates_original = data.duplicated().sum()
print(f"Original data duplicates (including all columns): {duplicates_original}")

# Check duplicates after dropping identifying columns
identifying_cols = ['Student_Name', 'State', 'College_Name', 'Stream']
existing_identifying_cols = [col for col in identifying_cols if col in data.columns]
data_without_ids = data.drop(columns=existing_identifying_cols)
duplicates_after_drop = data_without_ids.duplicated().sum()

print(f"Dropped identifying columns: {existing_identifying_cols}")
print(f"Duplicates after dropping identifying columns: {duplicates_after_drop}")

if duplicates_after_drop > 0:
    print(f"✅ {duplicates_after_drop} students have identical responses across all features")
    print("   This is normal - multiple students can have the same usage patterns")
else:
    print("✅ No duplicate response patterns found")

# Optional: Show some examples of identical response patterns
if duplicates_after_drop > 0:
    print("\n📊 Sample of identical response patterns:")
    duplicate_mask = data_without_ids.duplicated(keep=False)
    sample_duplicates = data_without_ids[duplicate_mask].head(6)
    print(f"Showing {len(sample_duplicates)} rows with identical patterns...")
    print(sample_duplicates[['Year_of_Study', 'Daily_Usage_Hours', 'Trust_in_AI_Tools', 'Impact_on_Grades']].to_string())

=== DUPLICATE ANALYSIS ===
Original data duplicates (including all columns): 0
Dropped identifying columns: ['Student_Name', 'State', 'College_Name', 'Stream']
Duplicates after dropping identifying columns: 580
✅ 580 students have identical responses across all features
   This is normal - multiple students can have the same usage patterns

📊 Sample of identical response patterns:
Showing 6 rows with identical patterns...
    Year_of_Study  Daily_Usage_Hours  Trust_in_AI_Tools  Impact_on_Grades
0               4                0.9                  2                 2
4               1                0.9                  1                 3
9               2                1.4                  3                -2
19              1                4.4                  1                 2
22              1                3.4                  1                 0
32              2                0.7                  3                 0


In [64]:
# Drop identifying columns while keeping identical response patterns
# Note: We keep rows with identical responses because they represent different students
# with the same usage patterns, which is valuable data for modeling
data_clean = data.drop(columns=['State', 'Student_Name'])

# Use Cases

In [65]:
# create a new column for Use_Cases split by a comma 
data_clean['Use_Cases_Split'] = data_clean['Use_Cases'].str.split(',')

In [66]:
import re
from collections import Counter

# First, let's extract all individual use cases from the split data
def extract_all_individual_use_cases(df, column='Use_Cases_Split'):
    """
    Extract all individual use cases from the split column
    """
    all_individual_cases = []
    for case_list in df[column].dropna():
        for case in case_list:
            if case.strip():  # Only add non-empty cases
                all_individual_cases.append(case.strip().lower())
    return all_individual_cases

# Get all individual use cases
all_cases = extract_all_individual_use_cases(data_clean)
case_counts = Counter(all_cases)

print("All individual use cases and their frequencies:")
for case, count in case_counts.most_common():
    print(f"'{case}': {count}")

All individual use cases and their frequencies:
'coding help': 766
'assignments': 749
'mcq practice': 692
'doubt solving': 638
'resume writing': 617
'content writing': 602
'learning new topics': 594
'projects': 470
'exam prep': 451
'notes': 451
'project work': 229
'exam preparation': 202


In [67]:
def standardize_individual_use_cases(df):
    """
    Standardize individual use cases using the mapping
    """
    def apply_mapping_to_list(case_list):
        # Handle None/NaN cases
        if case_list is None or (isinstance(case_list, float) and pd.isna(case_list)):
            return case_list
        
        # Handle empty list
        if not isinstance(case_list, list) or len(case_list) == 0:
            return case_list
        
        standardized_cases = []
        for case in case_list:
            if case and isinstance(case, str):  # Check if case is not None and is string
                # Clean and normalize the case
                cleaned_case = case.strip().lower()
                # Apply mapping
                standardized_case = use_case_mapping.get(cleaned_case, cleaned_case)
                # Avoid duplicates
                if standardized_case not in standardized_cases:
                    standardized_cases.append(standardized_case)
        
        return standardized_cases
    
    return df['Use_Cases_Split'].apply(apply_mapping_to_list)

# Apply standardization
data_clean['Use_Cases_Split_Standardized'] = standardize_individual_use_cases(data_clean)

# Convert back to string format for easier viewing
data_clean['Use_Cases_Standardized'] = data_clean['Use_Cases_Split_Standardized'].apply(
    lambda x: ', '.join([case.title() for case in x]) if x and isinstance(x, list) else ''
)

In [68]:
# Create binary features for each use case
def create_binary_use_case_features(df):
    """
    Create binary columns for each unique use case
    """
    # Get all unique use cases from standardized data
    all_use_cases = ['coding help', 'assignments', 'project work', 'mcq practice', 
                     'exam preparation', 'doubt solving', 'resume writing', 
                     'content writing', 'learning new topics', 'notes']
    
    # Create binary columns
    for use_case in all_use_cases:
        col_name = f'uses_{use_case.replace(" ", "_")}'
        df[col_name] = df['Use_Cases_Split_Standardized'].apply(
            lambda x: 1 if x and use_case in x else 0
        )
    
    return df

# Apply the function
data_clean = create_binary_use_case_features(data_clean)

# Show the new binary features
binary_features = [col for col in data_clean.columns if col.startswith('uses_')]
print("Binary use case features:")
for feature in binary_features:
    print(f"{feature}: {data_clean[feature].sum()} students")

Binary use case features:
uses_coding_help: 766 students
uses_assignments: 749 students
uses_project_work: 699 students
uses_mcq_practice: 692 students
uses_exam_preparation: 653 students
uses_doubt_solving: 638 students
uses_resume_writing: 617 students
uses_content_writing: 602 students
uses_learning_new_topics: 594 students
uses_notes: 451 students


In [30]:
# top 5
# data_clean.head()

# AI_Tools_Used

In [69]:
def create_ai_tools_dummy_variables(df, column_name='AI_Tools_Used'):
    """
    Create dummy variables for comma-separated AI tools column
    """
    print("=== CREATING DUMMY VARIABLES FOR AI TOOLS ===")
    
    # Method 1: Using pandas get_dummies with separator
    print("Method 1: Using pandas get_dummies")
    
    # Create dummy variables directly from comma-separated values
    # Convert to int to get 1/0 instead of True/False
    dummies = df[column_name].str.get_dummies(sep=', ').astype(int)
    
    # Add prefix to column names for clarity
    dummies.columns = [f'ai_tool_{col.lower().replace(" ", "_")}' for col in dummies.columns]
    
    print(f"Created {len(dummies.columns)} dummy variables:")
    for col in dummies.columns:
        print(f"  • {col}: {dummies[col].sum()} students use this tool")
    
    # Show the dummy variables
    print(f"\nDummy variables shape: {dummies.shape}")
    print(f"Sample of dummy variables:")
    print(dummies.head())
    
    # Verify data types
    print(f"\nData types:")
    print(f"All columns are integers: {all(dummies[col].dtype in ['int64', 'int32'] for col in dummies.columns)}")
    
    return dummies

# Apply the AI Tools dummy variables function to data_clean
ai_tools_dummies_clean = create_ai_tools_dummy_variables(data_clean, 'AI_Tools_Used') 
# Apply to your data
data_clean_v1 = pd.concat([data_clean, ai_tools_dummies_clean], axis=1)

=== CREATING DUMMY VARIABLES FOR AI TOOLS ===
Method 1: Using pandas get_dummies
Created 7 dummy variables:
  • ai_tool_bard: 151 students use this tool
  • ai_tool_chatgpt: 1557 students use this tool
  • ai_tool_claude: 171 students use this tool
  • ai_tool_copilot: 1516 students use this tool
  • ai_tool_gemini: 1409 students use this tool
  • ai_tool_midjourney: 371 students use this tool
  • ai_tool_other: 167 students use this tool

Dummy variables shape: (3614, 7)
Sample of dummy variables:
   ai_tool_bard  ai_tool_chatgpt  ai_tool_claude  ai_tool_copilot  \
0             0                0               0                0   
1             0                1               0                0   
2             0                0               0                1   
3             0                0               0                1   
4             0                0               0                0   

   ai_tool_gemini  ai_tool_midjourney  ai_tool_other  
0               1         

In [29]:
# top 5 of data clean v1
# data_clean_v1.head(5)

# Drop Necessary Columns

In [70]:
# column names of data clean 
data_clean_v1.columns

Index(['College_Name', 'Stream', 'Year_of_Study', 'AI_Tools_Used',
       'Daily_Usage_Hours', 'Use_Cases', 'Trust_in_AI_Tools',
       'Impact_on_Grades', 'Do_Professors_Allow_Use', 'Preferred_AI_Tool',
       'Awareness_Level', 'Willing_to_Pay_for_Access', 'Device_Used',
       'Internet_Access', 'Use_Cases_Split', 'Use_Cases_Split_Standardized',
       'Use_Cases_Standardized', 'uses_coding_help', 'uses_assignments',
       'uses_project_work', 'uses_mcq_practice', 'uses_exam_preparation',
       'uses_doubt_solving', 'uses_resume_writing', 'uses_content_writing',
       'uses_learning_new_topics', 'uses_notes', 'ai_tool_bard',
       'ai_tool_chatgpt', 'ai_tool_claude', 'ai_tool_copilot',
       'ai_tool_gemini', 'ai_tool_midjourney', 'ai_tool_other'],
      dtype='object')

In [71]:
# remove 'Student_Name', 'College_Name', 'Use_Cases', 'State', 'Use_Cases_Advanced', 'Use_Cases_Split','Use_Cases_Length', 'Use_Cases_Split_Standardized'
data_cleaned_final = data_clean_v1.drop(columns=['College_Name', 'Stream', 'AI_Tools_Used', 'Use_Cases', 'Use_Cases_Split', 'Use_Cases_Split_Standardized', 'Use_Cases_Standardized'])

In [43]:
# top 5 of data cleaned final
# data_cleaned_final.head(5)

# One Hot Encoding

In [72]:
def apply_appropriate_encoding(df):
    """
    Apply the correct encoding strategy for each categorical column
    """
    # Make a copy to avoid modifying original
    df = df.copy()
    
    # 1. ONE-HOT ENCODING for nominal categories
    nominal_columns = ['AI_Tools_Used', 'Preferred_AI_Tool', 'Device_Used', 'Internet_Access']
    
    for col in nominal_columns:
        if col in df.columns:
            # Create dummy variables with dtype=int to get 1/0 instead of True/False
            dummies = pd.get_dummies(df[col], prefix=col.lower(), drop_first=False, dtype=int)
            df = pd.concat([df, dummies], axis=1)
            print(f"One-hot encoded {col}: {len(dummies.columns)} features created")
    
    # 2. BINARY ENCODING for yes/no columns only
    binary_mappings = {
        'Do_Professors_Allow_Use': {'No': 0, 'Yes': 1},
        'Willing_to_Pay_for_Access': {'No': 0, 'Yes': 1}
    }
    
    for col, mapping in binary_mappings.items():
        if col in df.columns:
            df[f'{col}_encoded'] = df[col].map(mapping)
            print(f"Binary encoded {col}: 0/1")
    
    # 3. Skip Trust_in_AI_Tools and Impact_on_Grades if they're already numerical
    print(f"\nSkipping Trust_in_AI_Tools and Impact_on_Grades - already numerical")
    print(f"Trust_in_AI_Tools type: {df['Trust_in_AI_Tools'].dtype}")
    print(f"Impact_on_Grades type: {df['Impact_on_Grades'].dtype}")
    
    return df

# Apply encoding strategy
data_encoded = apply_appropriate_encoding(data_cleaned_final)

# Show the results
print("\nNew encoded columns created:")
encoded_cols = [col for col in data_encoded.columns if any(x in col for x in ['_encoded', 'stream_', 'ai_tools_used_', 'preferred_ai_tool_', 'device_used_', 'internet_access_'])]
print(encoded_cols)

# Display sample of encoded data
print("\nSample of encoded data:")
print(data_encoded[['Trust_in_AI_Tools', 'Impact_on_Grades'] + encoded_cols[:5]].head())

# Verify data types of encoded columns
print("\nData types of encoded columns:")
for col in encoded_cols[:5]:
    if col in data_encoded.columns:
        print(f"{col}: {data_encoded[col].dtype} | Unique values: {sorted(data_encoded[col].unique())}")

One-hot encoded Preferred_AI_Tool: 6 features created
One-hot encoded Device_Used: 3 features created
One-hot encoded Internet_Access: 3 features created
Binary encoded Do_Professors_Allow_Use: 0/1
Binary encoded Willing_to_Pay_for_Access: 0/1

Skipping Trust_in_AI_Tools and Impact_on_Grades - already numerical
Trust_in_AI_Tools type: int64
Impact_on_Grades type: int64

New encoded columns created:
['preferred_ai_tool_Bard', 'preferred_ai_tool_ChatGPT', 'preferred_ai_tool_Claude', 'preferred_ai_tool_Copilot', 'preferred_ai_tool_Gemini', 'preferred_ai_tool_Other', 'device_used_Laptop', 'device_used_Mobile', 'device_used_Tablet', 'internet_access_High', 'internet_access_Medium', 'internet_access_Poor', 'Do_Professors_Allow_Use_encoded', 'Willing_to_Pay_for_Access_encoded']

Sample of encoded data:
   Trust_in_AI_Tools  Impact_on_Grades  preferred_ai_tool_Bard  \
0                  2                 2                       0   
1                  3                -3                       

# Drop Unnecessary Columns

In [73]:
# Drop unnecessary columns
data_encoded_clean = data_encoded.drop(columns=[
    'Do_Professors_Allow_Use', 'Preferred_AI_Tool',
    'Willing_to_Pay_for_Access', 'Device_Used',
       'Internet_Access'
])

In [74]:
# data shape of encoded clean data
print(f"\nFinal data shape after encoding: {data_encoded_clean.shape}")


Final data shape after encoding: (3614, 36)


In [75]:
# save it as a csv file
data_encoded_clean.to_csv('Students_Cleaned_Encoded_v1.csv', index=False)

In [44]:
# top 5 of data encoded
# data_encoded.head(5)

In [48]:
# read the data from csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [49]:
data = pd.read_csv('Students_Cleaned_Encoded.csv')

In [3]:
# top 5
data.head()

Unnamed: 0,Year_of_Study,Daily_Usage_Hours,Trust_in_AI_Tools,Impact_on_Grades,Awareness_Level,uses_coding_help,uses_assignments,uses_project_work,uses_mcq_practice,uses_exam_preparation,...,preferred_ai_tool_Gemini,preferred_ai_tool_Other,device_used_Laptop,device_used_Mobile,device_used_Tablet,internet_access_High,internet_access_Medium,internet_access_Poor,Do_Professors_Allow_Use_encoded,Willing_to_Pay_for_Access_encoded
0,4,0.9,2,2,9,1,1,0,0,0,...,0,0,0,1,0,0,0,1,0,1
1,2,3.4,3,-3,6,0,0,0,0,0,...,0,1,1,0,0,0,0,1,1,0
2,2,3.6,5,0,1,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
3,2,2.9,5,2,5,0,0,0,0,0,...,1,0,1,0,0,1,0,0,1,0
4,1,0.9,1,3,8,0,0,0,0,0,...,0,1,1,0,0,0,1,0,1,1


In [19]:
# 1. Check for redundant/duplicate features
def analyze_feature_redundancy(data):
    """
    Identify potentially redundant features
    """
    print("=== FEATURE REDUNDANCY ANALYSIS ===")
    
    # Check for duplicate AI tool features
    ai_tool_features = [col for col in data.columns if col.startswith('ai_tool_')]
    preferred_tool_features = [col for col in data.columns if col.startswith('preferred_ai_tool_')]
    
    print(f"AI Tool Usage Features: {len(ai_tool_features)}")
    print(f"Preferred Tool Features: {len(preferred_tool_features)}")
    
    # Check correlation between similar features
    if 'ai_tool_chatgpt' in data.columns and 'preferred_ai_tool_ChatGPT' in data.columns:
        corr = data['ai_tool_chatgpt'].corr(data['preferred_ai_tool_ChatGPT'])
        print(f"ChatGPT usage vs preference correlation: {corr:.3f}")
    
    # Look for highly correlated features
    corr_matrix = data.corr()
    high_corr_pairs = []
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > 0.8:
                high_corr_pairs.append((
                    corr_matrix.columns[i], 
                    corr_matrix.columns[j], 
                    corr_matrix.iloc[i, j]
                ))
    
    print(f"\nHighly correlated features (>0.8):")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"  {feat1} ↔ {feat2}: {corr:.3f}")
    
    return high_corr_pairs

# Run redundancy analysis
redundant_features = analyze_feature_redundancy(data)

=== FEATURE REDUNDANCY ANALYSIS ===
AI Tool Usage Features: 7
Preferred Tool Features: 6
ChatGPT usage vs preference correlation: 0.020

Highly correlated features (>0.8):
  uses_exam_preparation ↔ uses_notes: 0.804


In [20]:
# 2. Create more meaningful feature engineering
def create_advanced_features(data):
    """
    Create more sophisticated features
    """
    print("=== ADVANCED FEATURE ENGINEERING ===")
    
    data_enhanced = data.copy()
    
    # 1. AI Tool Diversity Score
    ai_tool_cols = [col for col in data.columns if col.startswith('ai_tool_')]
    data_enhanced['ai_tool_diversity'] = data[ai_tool_cols].sum(axis=1)
    
    # 2. Use Case Diversity Score
    use_case_cols = [col for col in data.columns if col.startswith('uses_')]
    data_enhanced['use_case_diversity'] = data[use_case_cols].sum(axis=1)
    
    # 3. Technology Adoption Score (composite)
    tech_features = ['Trust_in_AI_Tools', 'Awareness_Level', 'ai_tool_diversity']
    data_enhanced['tech_adoption_score'] = data_enhanced[tech_features].sum(axis=1)
    
    # 4. Academic Focus Score
    academic_uses = ['uses_assignments', 'uses_exam_preparation', 'uses_project_work']
    data_enhanced['academic_focus'] = data_enhanced[academic_uses].sum(axis=1)
    
    # 5. Professional Focus Score
    professional_uses = ['uses_resume_writing', 'uses_content_writing', 'uses_coding_help']
    data_enhanced['professional_focus'] = data_enhanced[professional_uses].sum(axis=1)
    
    # 6. Device Quality Score
    if 'device_used_Laptop' in data.columns:
        data_enhanced['device_quality'] = (
            data_enhanced['device_used_Laptop'] * 3 +
            data_enhanced['device_used_Tablet'] * 2 +
            data_enhanced['device_used_Mobile'] * 1
        )
    
    # 7. Internet Quality Score
    if 'internet_access_High' in data.columns:
        data_enhanced['internet_quality'] = (
            data_enhanced['internet_access_High'] * 3 +
            data_enhanced['internet_access_Medium'] * 2 +
            data_enhanced['internet_access_Poor'] * 1
        )
    
    # 8. Interaction terms
    data_enhanced['trust_x_usage'] = data_enhanced['Trust_in_AI_Tools'] * data_enhanced['Daily_Usage_Hours']
    data_enhanced['year_x_usage'] = data_enhanced['Year_of_Study'] * data_enhanced['Daily_Usage_Hours']
    
    print(f"Created {len(data_enhanced.columns) - len(data.columns)} new features")
    
    new_features = [col for col in data_enhanced.columns if col not in data.columns]
    print(f"New features: {new_features}")
    
    return data_enhanced, new_features

# Apply advanced feature engineering
data_enhanced, new_features = create_advanced_features(data)

=== ADVANCED FEATURE ENGINEERING ===
Created 9 new features
New features: ['ai_tool_diversity', 'use_case_diversity', 'tech_adoption_score', 'academic_focus', 'professional_focus', 'device_quality', 'internet_quality', 'trust_x_usage', 'year_x_usage']


## 📊 Analysis of create_advanced_features Function

### ✅ **STRENGTHS:**
1. **Domain-driven features** - Creates meaningful composite scores (academic focus, professional focus)
2. **Diversity metrics** - Captures tool and use case diversity effectively
3. **Weighted scoring** - Device and internet quality use logical weights
4. **Interaction terms** - Captures relationships between variables
5. **Robust error handling** - Checks for column existence before creating features
6. **Clear documentation** - Well-commented and structured

### ⚠️ **AREAS FOR IMPROVEMENT:**
1. **Potential missing features** - Some features might not exist after correlation optimization
2. **Fixed weights** - Device/internet quality weights could be data-driven
3. **Limited interaction terms** - Could explore more meaningful interactions
4. **No normalization** - Composite scores might have different scales
5. **Missing validation** - No checks for feature quality or predictive power

### 🔧 **RECOMMENDATIONS:**
1. **Add feature existence validation** for all referenced columns
2. **Create data-driven weights** based on target correlation
3. **Add normalization** for composite scores
4. **Include more interaction terms** based on domain knowledge
5. **Add feature quality validation** after creation

In [76]:
# 🔧 IMPROVED VERSION OF create_advanced_features
def create_advanced_features_improved(data, target_col='Daily_Usage_Hours'):
    """
    Create more sophisticated and robust features with validation
    """
    print("=== IMPROVED ADVANCED FEATURE ENGINEERING ===")
    
    data_enhanced = data.copy()
    new_features_created = []
    
    # Helper function to safely create features
    def safe_feature_creation(feature_name, feature_func, description):
        try:
            data_enhanced[feature_name] = feature_func()
            new_features_created.append(feature_name)
            print(f"✅ {feature_name}: {description}")
            return True
        except Exception as e:
            print(f"❌ {feature_name}: Failed - {str(e)}")
            return False
    
    # 1. AI Tool Diversity Score (improved)
    ai_tool_cols = [col for col in data.columns if col.startswith('ai_tool_')]
    if ai_tool_cols:
        safe_feature_creation(
            'ai_tool_diversity',
            lambda: data[ai_tool_cols].sum(axis=1),
            f"Sum of {len(ai_tool_cols)} AI tools used"
        )
        
        # Normalized diversity score
        safe_feature_creation(
            'ai_tool_diversity_normalized',
            lambda: data[ai_tool_cols].sum(axis=1) / len(ai_tool_cols),
            "Normalized AI tool diversity (0-1)"
        )
    
    # 2. Use Case Diversity Score (improved)
    use_case_cols = [col for col in data.columns if col.startswith('uses_')]
    if use_case_cols:
        safe_feature_creation(
            'use_case_diversity',
            lambda: data[use_case_cols].sum(axis=1),
            f"Sum of {len(use_case_cols)} use cases"
        )
        
        safe_feature_creation(
            'use_case_diversity_normalized',
            lambda: data[use_case_cols].sum(axis=1) / len(use_case_cols),
            "Normalized use case diversity (0-1)"
        )
    
    # 3. Technology Adoption Score (improved with validation)
    tech_base_features = ['Trust_in_AI_Tools', 'Awareness_Level']
    available_tech_features = [f for f in tech_base_features if f in data.columns]
    
    if available_tech_features:
        if 'ai_tool_diversity' in data_enhanced.columns:
            available_tech_features.append('ai_tool_diversity')
        
        safe_feature_creation(
            'tech_adoption_score',
            lambda: data_enhanced[available_tech_features].sum(axis=1),
            f"Composite score from {len(available_tech_features)} features"
        )
        
        # Normalized version
        safe_feature_creation(
            'tech_adoption_score_normalized',
            lambda: (data_enhanced[available_tech_features].sum(axis=1) / 
                    data_enhanced[available_tech_features].sum(axis=1).max()),
            "Normalized technology adoption score (0-1)"
        )
    
    # 4. Academic Focus Score (improved with validation)
    academic_uses = ['uses_assignments', 'uses_exam_preparation', 'uses_project_work']
    available_academic = [f for f in academic_uses if f in data.columns]
    
    if available_academic:
        safe_feature_creation(
            'academic_focus',
            lambda: data_enhanced[available_academic].sum(axis=1),
            f"Academic focus from {len(available_academic)} features"
        )
        
        safe_feature_creation(
            'academic_focus_normalized',
            lambda: data_enhanced[available_academic].sum(axis=1) / len(available_academic),
            "Normalized academic focus (0-1)"
        )
    
    # 5. Professional Focus Score (improved with validation)
    professional_uses = ['uses_resume_writing', 'uses_content_writing', 'uses_coding_help']
    available_professional = [f for f in professional_uses if f in data.columns]
    
    if available_professional:
        safe_feature_creation(
            'professional_focus',
            lambda: data_enhanced[available_professional].sum(axis=1),
            f"Professional focus from {len(available_professional)} features"
        )
        
        safe_feature_creation(
            'professional_focus_normalized',
            lambda: data_enhanced[available_professional].sum(axis=1) / len(available_professional),
            "Normalized professional focus (0-1)"
        )
    
    # 6. Focus Balance Score (NEW)
    if 'academic_focus' in data_enhanced.columns and 'professional_focus' in data_enhanced.columns:
        safe_feature_creation(
            'focus_balance',
            lambda: abs(data_enhanced['academic_focus'] - data_enhanced['professional_focus']),
            "Balance between academic and professional focus"
        )
    
    # 7. Device Quality Score (improved with data-driven weights)
    device_cols = [col for col in data.columns if col.startswith('device_used_')]
    if len(device_cols) > 0:
        # Calculate data-driven weights based on target correlation
        if target_col in data.columns:
            device_weights = {}
            for col in device_cols:
                corr_with_target = abs(data[col].corr(data[target_col]))
                if 'laptop' in col.lower():
                    device_weights[col] = corr_with_target * 3
                elif 'tablet' in col.lower():
                    device_weights[col] = corr_with_target * 2
                else:
                    device_weights[col] = corr_with_target * 1
            
            safe_feature_creation(
                'device_quality_weighted',
                lambda: sum(data_enhanced[col] * weight for col, weight in device_weights.items()),
                f"Data-driven weighted device quality from {len(device_cols)} devices"
            )
        else:
            # Fallback to original logic
            laptop_col = next((col for col in device_cols if 'laptop' in col.lower()), None)
            tablet_col = next((col for col in device_cols if 'tablet' in col.lower()), None)
            mobile_col = next((col for col in device_cols if 'mobile' in col.lower()), None)
            
            if laptop_col or tablet_col or mobile_col:
                safe_feature_creation(
                    'device_quality',
                    lambda: ((data_enhanced[laptop_col] * 3 if laptop_col else 0) +
                            (data_enhanced[tablet_col] * 2 if tablet_col else 0) +
                            (data_enhanced[mobile_col] * 1 if mobile_col else 0)),
                    "Traditional weighted device quality"
                )
    
    # 8. Internet Quality Score (improved with validation)
    internet_cols = [col for col in data.columns if col.startswith('internet_access_')]
    if len(internet_cols) > 0:
        high_col = next((col for col in internet_cols if 'high' in col.lower()), None)
        medium_col = next((col for col in internet_cols if 'medium' in col.lower()), None)
        poor_col = next((col for col in internet_cols if 'poor' in col.lower()), None)
        
        if high_col or medium_col or poor_col:
            safe_feature_creation(
                'internet_quality',
                lambda: ((data_enhanced[high_col] * 3 if high_col else 0) +
                        (data_enhanced[medium_col] * 2 if medium_col else 0) +
                        (data_enhanced[poor_col] * 1 if poor_col else 0)),
                "Weighted internet quality score"
            )
    
    # 9. Enhanced Interaction Terms
    interaction_pairs = [
        ('Trust_in_AI_Tools', 'Daily_Usage_Hours', 'trust_x_usage'),
        ('Year_of_Study', 'Daily_Usage_Hours', 'year_x_usage'),
        ('Awareness_Level', 'Daily_Usage_Hours', 'awareness_x_usage'),
        ('Trust_in_AI_Tools', 'Awareness_Level', 'trust_x_awareness'),
    ]
    
    for feat1, feat2, new_name in interaction_pairs:
        if feat1 in data.columns and feat2 in data.columns:
            safe_feature_creation(
                new_name,
                lambda f1=feat1, f2=feat2: data_enhanced[f1] * data_enhanced[f2],
                f"Interaction between {feat1} and {feat2}"
            )
    
    # 10. Advanced Composite Features (NEW)
    if 'ai_tool_diversity' in data_enhanced.columns and 'use_case_diversity' in data_enhanced.columns:
        safe_feature_creation(
            'overall_ai_engagement',
            lambda: (data_enhanced['ai_tool_diversity'] + data_enhanced['use_case_diversity']) / 2,
            "Overall AI engagement score"
        )
    
    # 11. Experience-based Features (NEW)
    if 'Year_of_Study' in data.columns and 'Daily_Usage_Hours' in data.columns:
        safe_feature_creation(
            'experience_intensity',
            lambda: data_enhanced['Year_of_Study'] * data_enhanced['Daily_Usage_Hours'],
            "Study experience × usage intensity"
        )
    
    # 12. Feature Quality Validation
    print(f"\n=== FEATURE QUALITY VALIDATION ===")
    if target_col in data.columns:
        for feature in new_features_created:
            if feature in data_enhanced.columns:
                target_corr = abs(data_enhanced[feature].corr(data_enhanced[target_col]))
                variance = data_enhanced[feature].var()
                unique_vals = data_enhanced[feature].nunique()
                
                quality_score = "✅ Good" if target_corr > 0.01 and variance > 0 else "⚠️ Weak"
                print(f"{quality_score} {feature}: corr={target_corr:.3f}, var={variance:.3f}, unique={unique_vals}")
    
    print(f"\n✅ Successfully created {len(new_features_created)} new features")
    print(f"📊 Dataset shape: {data.shape} → {data_enhanced.shape}")
    
    return data_enhanced, new_features_created

# Test the improved function
print("Testing improved feature engineering function...")
try:
    data_enhanced_improved, new_features_improved = create_advanced_features_improved(data)
    print(f"\n🎉 Success! Created {len(new_features_improved)} features")
except Exception as e:
    print(f"❌ Error: {e}")
    print("Will fall back to original function if needed")

Testing improved feature engineering function...
=== IMPROVED ADVANCED FEATURE ENGINEERING ===
✅ tech_adoption_score: Composite score from 2 features
✅ tech_adoption_score_normalized: Normalized technology adoption score (0-1)
✅ trust_x_usage: Interaction between Trust_in_AI_Tools and Daily_Usage_Hours
✅ year_x_usage: Interaction between Year_of_Study and Daily_Usage_Hours
✅ awareness_x_usage: Interaction between Awareness_Level and Daily_Usage_Hours
✅ trust_x_awareness: Interaction between Trust_in_AI_Tools and Awareness_Level
✅ experience_intensity: Study experience × usage intensity

=== FEATURE QUALITY VALIDATION ===
⚠️ Weak tech_adoption_score: corr=0.000, var=10.955, unique=14
⚠️ Weak tech_adoption_score_normalized: corr=0.000, var=0.049, unique=14
✅ Good trust_x_usage: corr=0.666, var=29.734, unique=159
✅ Good year_x_usage: corr=0.688, var=18.448, unique=133
✅ Good awareness_x_usage: corr=0.653, var=118.602, unique=274
✅ Good trust_x_awareness: corr=0.015, var=172.532, unique=28

In [77]:
# 📊 COMPARISON: Original vs Improved Feature Engineering
print("=" * 80)
print("COMPARISON: ORIGINAL vs IMPROVED FEATURE ENGINEERING")
print("=" * 80)

# Compare the functions
try:
    # Original function results
    print("\n🔵 ORIGINAL FUNCTION RESULTS:")
    data_original, original_features = create_advanced_features(data)
    print(f"   Created features: {len(original_features)}")
    print(f"   Dataset shape: {data.shape} → {data_original.shape}")
    
    # Improved function results
    print("\n🟢 IMPROVED FUNCTION RESULTS:")
    data_improved, improved_features = create_advanced_features_improved(data)
    print(f"   Created features: {len(improved_features)}")
    print(f"   Dataset shape: {data.shape} → {data_improved.shape}")
    
    # Feature comparison
    print(f"\n📋 FEATURE COMPARISON:")
    original_set = set(original_features)
    improved_set = set(improved_features)
    
    common_features = original_set & improved_set
    only_original = original_set - improved_set
    only_improved = improved_set - original_set
    
    print(f"   Common features: {len(common_features)}")
    print(f"   Only in original: {len(only_original)}")
    print(f"   Only in improved: {len(only_improved)}")
    
    if only_improved:
        print(f"\n✨ NEW FEATURES IN IMPROVED VERSION:")
        for i, feature in enumerate(sorted(only_improved), 1):
            print(f"   {i:2d}. {feature}")
    
    # Quality assessment
    if 'Daily_Usage_Hours' in data.columns:
        print(f"\n📈 QUALITY ASSESSMENT:")
        
        # Original features quality
        original_target_corrs = []
        for feature in original_features:
            if feature in data_original.columns:
                corr = abs(data_original[feature].corr(data_original['Daily_Usage_Hours']))
                original_target_corrs.append(corr)
        
        # Improved features quality
        improved_target_corrs = []
        for feature in improved_features:
            if feature in data_improved.columns:
                corr = abs(data_improved[feature].corr(data_improved['Daily_Usage_Hours']))
                improved_target_corrs.append(corr)
        
        if original_target_corrs and improved_target_corrs:
            print(f"   Original avg target correlation: {np.mean(original_target_corrs):.4f}")
            print(f"   Improved avg target correlation: {np.mean(improved_target_corrs):.4f}")
            print(f"   Improvement: {np.mean(improved_target_corrs) - np.mean(original_target_corrs):+.4f}")
    
except Exception as e:
    print(f"❌ Error in comparison: {e}")

# Final recommendation
print("\n" + "=" * 80)
print("🎯 FINAL RECOMMENDATION")
print("=" * 80)

print("""
📝 ASSESSMENT OF ORIGINAL create_advanced_features():

✅ STRENGTHS:
• Domain-driven feature creation (academic vs professional focus)
• Logical composite scores (diversity, quality scores)
• Appropriate weighting for device/internet quality
• Clean code structure and documentation
• Basic interaction terms

⚠️ AREAS FOR IMPROVEMENT:
• Missing validation for feature existence (problematic after correlation optimization)
• Fixed weights instead of data-driven weights
• Limited interaction terms
• No normalization of composite scores
• No quality validation after feature creation

🔧 RECOMMENDATIONS:
1. ADD VALIDATION: Check if referenced columns exist before creating features
2. USE DATA-DRIVEN WEIGHTS: Base weights on target correlation, not fixed values
3. NORMALIZE SCORES: Add normalized versions of composite scores (0-1 range)
4. EXPAND INTERACTIONS: Include more meaningful interaction terms
5. VALIDATE QUALITY: Check feature quality after creation

💡 VERDICT: The original function is GOOD but needs ROBUSTNESS IMPROVEMENTS
especially when working with optimized datasets where some features might be removed.

🎖️ RECOMMENDED ACTION:
Use the IMPROVED VERSION for production, as it:
• Handles missing features gracefully
• Creates more robust and normalized features
• Provides better quality validation
• Generates additional meaningful features
""")

print("=" * 80)

COMPARISON: ORIGINAL vs IMPROVED FEATURE ENGINEERING

🔵 ORIGINAL FUNCTION RESULTS:
❌ Error in comparison: name 'create_advanced_features' is not defined

🎯 FINAL RECOMMENDATION

📝 ASSESSMENT OF ORIGINAL create_advanced_features():

✅ STRENGTHS:
• Domain-driven feature creation (academic vs professional focus)
• Logical composite scores (diversity, quality scores)
• Appropriate weighting for device/internet quality
• Clean code structure and documentation
• Basic interaction terms

⚠️ AREAS FOR IMPROVEMENT:
• Missing validation for feature existence (problematic after correlation optimization)
• Fixed weights instead of data-driven weights
• Limited interaction terms
• No normalization of composite scores
• No quality validation after feature creation

🔧 RECOMMENDATIONS:
1. ADD VALIDATION: Check if referenced columns exist before creating features
2. USE DATA-DRIVEN WEIGHTS: Base weights on target correlation, not fixed values
3. NORMALIZE SCORES: Add normalized versions of composite sc

In [21]:
# top 5
data_enhanced.head()

Unnamed: 0,Year_of_Study,Daily_Usage_Hours,Trust_in_AI_Tools,Impact_on_Grades,Awareness_Level,uses_coding_help,uses_assignments,uses_project_work,uses_mcq_practice,uses_exam_preparation,...,Willing_to_Pay_for_Access_encoded,ai_tool_diversity,use_case_diversity,tech_adoption_score,academic_focus,professional_focus,device_quality,internet_quality,trust_x_usage,year_x_usage
0,4,0.9,2,2,9,1,1,0,0,0,...,1,1,2,12,1,1,1,1,1.8,3.6
1,2,3.4,3,-3,6,0,0,0,0,0,...,0,1,1,10,0,0,3,1,10.2,6.8
2,2,3.6,5,0,1,0,0,1,1,0,...,0,1,2,7,1,0,2,1,18.0,7.2
3,2,2.9,5,2,5,0,0,0,0,0,...,0,1,1,11,0,1,3,3,14.5,5.8
4,1,0.9,1,3,8,0,0,0,0,0,...,1,1,2,10,0,1,3,2,0.9,0.9


In [22]:
# column names
data_enhanced.columns.tolist()

['Year_of_Study',
 'Daily_Usage_Hours',
 'Trust_in_AI_Tools',
 'Impact_on_Grades',
 'Awareness_Level',
 'uses_coding_help',
 'uses_assignments',
 'uses_project_work',
 'uses_mcq_practice',
 'uses_exam_preparation',
 'uses_doubt_solving',
 'uses_resume_writing',
 'uses_content_writing',
 'uses_learning_new_topics',
 'uses_notes',
 'ai_tool_bard',
 'ai_tool_chatgpt',
 'ai_tool_claude',
 'ai_tool_copilot',
 'ai_tool_gemini',
 'ai_tool_midjourney',
 'ai_tool_other',
 'preferred_ai_tool_Bard',
 'preferred_ai_tool_ChatGPT',
 'preferred_ai_tool_Claude',
 'preferred_ai_tool_Copilot',
 'preferred_ai_tool_Gemini',
 'preferred_ai_tool_Other',
 'device_used_Laptop',
 'device_used_Mobile',
 'device_used_Tablet',
 'internet_access_High',
 'internet_access_Medium',
 'internet_access_Poor',
 'Do_Professors_Allow_Use_encoded',
 'Willing_to_Pay_for_Access_encoded',
 'ai_tool_diversity',
 'use_case_diversity',
 'tech_adoption_score',
 'academic_focus',
 'professional_focus',
 'device_quality',
 'internet

In [23]:
# 3. Comprehensive data validation
def validate_data_quality(data):
    """
    Comprehensive data quality check
    """
    print("=== DATA QUALITY VALIDATION ===")
    
    # Check for missing values
    missing_counts = data.isnull().sum()
    if missing_counts.sum() > 0:
        print(f"⚠️ Missing values found:")
        print(missing_counts[missing_counts > 0])
    else:
        print("✅ No missing values")
    
    # Check for constant features
    constant_features = []
    for col in data.columns:
        if data[col].nunique() <= 1:
            constant_features.append(col)
    
    if constant_features:
        print(f"⚠️ Constant features (remove these): {constant_features}")
    else:
        print("✅ No constant features")
    
    # Check feature distributions
    print(f"\nFeature Distribution Summary:")
    print(f"Binary features: {sum(data[col].nunique() == 2 for col in data.columns)}")
    print(f"Categorical features: {sum(2 < data[col].nunique() <= 10 for col in data.columns)}")
    print(f"Continuous features: {sum(data[col].nunique() > 10 for col in data.columns)}")
    
    # Check for multicollinearity
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    corr_matrix = data[numeric_cols].corr()
    
    # Find highly correlated pairs
    high_corr_count = 0
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) > 0.9:
                high_corr_count += 1
    
    if high_corr_count > 0:
        print(f"⚠️ {high_corr_count} highly correlated pairs (>0.9)")
    else:
        print("✅ No extreme multicollinearity")
    
    return constant_features

# Validate data quality
issues = validate_data_quality(data)

=== DATA QUALITY VALIDATION ===
✅ No missing values
✅ No constant features

Feature Distribution Summary:
Binary features: 31
Categorical features: 3
Continuous features: 2
✅ No extreme multicollinearity


In [26]:
# 4. Advanced modeling approach
def advanced_modeling_pipeline(data, target_col='Daily_Usage_Hours'):
    """
    Comprehensive modeling pipeline
    """
    print("=== ADVANCED MODELING PIPELINE ===")
    
    # Prepare data
    X = data.drop(columns=[target_col])
    y = data[target_col]
    
    # 1. Feature Selection
    from sklearn.feature_selection import SelectKBest, f_regression, RFE
    from sklearn.ensemble import RandomForestRegressor
    
    # Statistical feature selection
    selector = SelectKBest(score_func=f_regression, k=20)
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]
    
    print(f"Selected {len(selected_features)} features using statistical selection")
    
    # 2. Model with hyperparameter tuning
    from sklearn.model_selection import GridSearchCV, cross_val_score
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.linear_model import ElasticNet
    
    models = {
        'Random Forest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        'Gradient Boosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.1, 0.05],
                'max_depth': [3, 5]
            }
        },
        'ElasticNet': {
            'model': ElasticNet(random_state=42),
            'params': {
                'alpha': [0.1, 1.0, 10.0],
                'l1_ratio': [0.1, 0.5, 0.9]
            }
        }
    }
    
    # Grid search for best models
    best_models = {}
    X_train, X_test, y_train, y_test = train_test_split(
        X[selected_features], y, test_size=0.2, random_state=42
    )
    
    for name, model_info in models.items():
        print(f"\nOptimizing {name}...")
        
        grid_search = GridSearchCV(
            model_info['model'], 
            model_info['params'],
            cv=5,
            scoring='r2',
            n_jobs=-1
        )
        
        grid_search.fit(X_train, y_train)
        
        # Evaluate best model
        best_model = grid_search.best_estimator_
        cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2')
        
        print(f"  Best parameters: {grid_search.best_params_}")
        print(f"  CV R²: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
        
        best_models[name] = {
            'model': best_model,
            'cv_score': cv_scores.mean(),
            'params': grid_search.best_params_
        }
    
    return best_models, selected_features

# Run advanced modeling
best_models, top_features = advanced_modeling_pipeline(data_enhanced)

=== ADVANCED MODELING PIPELINE ===
Selected 20 features using statistical selection

Optimizing Random Forest...
  Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
  CV R²: 0.8780 (±0.0238)

Optimizing Gradient Boosting...
  Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
  CV R²: 0.8593 (±0.0156)

Optimizing ElasticNet...
  Best parameters: {'alpha': 0.1, 'l1_ratio': 0.1}
  CV R²: 0.6348 (±0.0241)


In [35]:
top_features

Index(['Impact_on_Grades', 'uses_project_work', 'uses_mcq_practice',
       'uses_exam_preparation', 'uses_doubt_solving', 'uses_resume_writing',
       'uses_learning_new_topics', 'ai_tool_chatgpt', 'ai_tool_copilot',
       'ai_tool_other', 'preferred_ai_tool_Bard', 'preferred_ai_tool_ChatGPT',
       'preferred_ai_tool_Claude', 'preferred_ai_tool_Other',
       'Willing_to_Pay_for_Access_encoded', 'ai_tool_diversity',
       'use_case_diversity', 'professional_focus', 'trust_x_usage',
       'year_x_usage'],
      dtype='object')

In [36]:
# Prepare data
X = data_enhanced.drop(columns=["Daily_Usage_Hours"])
y = data_enhanced["Daily_Usage_Hours"]

In [38]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.ensemble import RandomForestRegressor

In [39]:
selector = SelectKBest(score_func=f_regression, k=20)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X[selected_features], y, test_size=0.2, random_state=42
)

In [42]:
from sklearn.metrics import r2_score

In [43]:
# 5. Model interpretability analysis
def model_interpretability(model, feature_names, X_test, y_test):
    """
    Analyze model interpretability
    """
    print("=== MODEL INTERPRETABILITY ===")
    
    # Feature importance
    if hasattr(model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("Top 10 Most Important Features:")
        for i, (_, row) in enumerate(importance_df.head(10).iterrows(), 1):
            print(f"{i:2d}. {row['feature']:<30} {row['importance']:.4f}")
    
    # Model performance analysis
    predictions = model.predict(X_test)
    residuals = y_test - predictions
    
    print(f"\nModel Performance:")
    print(f"Mean Absolute Error: {np.mean(np.abs(residuals)):.4f}")
    print(f"R² Score: {r2_score(y_test, predictions):.4f}")
    
    # Prediction intervals
    within_1_std = np.sum(np.abs(residuals) <= np.std(residuals)) / len(residuals)
    within_2_std = np.sum(np.abs(residuals) <= 2 * np.std(residuals)) / len(residuals)
    
    print(f"Predictions within 1 std: {within_1_std:.1%}")
    print(f"Predictions within 2 std: {within_2_std:.1%}")
    
    return importance_df if hasattr(model, 'feature_importances_') else None

# Apply interpretability analysis
if best_models:
    best_model_name = max(best_models.keys(), key=lambda x: best_models[x]['cv_score'])
    interpretability = model_interpretability(
        best_models[best_model_name]['model'], 
        top_features, 
        X_test, 
        y_test
    )

=== MODEL INTERPRETABILITY ===
Top 10 Most Important Features:
 1. year_x_usage                   0.6162
 2. trust_x_usage                  0.2748
 3. Impact_on_Grades               0.0252
 4. ai_tool_diversity              0.0108
 5. use_case_diversity             0.0103
 6. preferred_ai_tool_ChatGPT      0.0071
 7. professional_focus             0.0061
 8. uses_learning_new_topics       0.0050
 9. uses_mcq_practice              0.0049
10. ai_tool_copilot                0.0048

Model Performance:
Mean Absolute Error: 0.2159
R² Score: 0.8850
Predictions within 1 std: 82.3%
Predictions within 2 std: 93.2%


In [27]:
# def correlation_based_selection(data, target_col='Daily_Usage_Hours', min_target_corr=0.05, max_feature_corr=0.8):
#     """
#     Select features based on correlation with target and between features
#     """
#     print(f"=== CORRELATION-BASED FEATURE SELECTION ===")
#     print(f"Min target correlation: {min_target_corr}")
#     print(f"Max inter-feature correlation: {max_feature_corr}")
    
#     X = data.drop(columns=[target_col])
#     y = data[target_col]
    
#     # Step 1: Filter by correlation with target
#     target_correlations = X.corrwith(y).abs()
#     high_target_corr_features = target_correlations[target_correlations >= min_target_corr].index.tolist()
    
#     print(f"\nStep 1: Features with target correlation >= {min_target_corr}")
#     print(f"Selected {len(high_target_corr_features)} out of {len(X.columns)} features")
    
#     if len(high_target_corr_features) == 0:
#         print("No features meet target correlation threshold!")
#         return []
    
#     # Step 2: Remove highly correlated features among selected ones
#     X_filtered = X[high_target_corr_features]
#     corr_matrix = X_filtered.corr()
    
#     # Find pairs of highly correlated features
#     to_remove = set()
#     for i in range(len(corr_matrix.columns)):
#         for j in range(i+1, len(corr_matrix.columns)):
#             if abs(corr_matrix.iloc[i, j]) > max_feature_corr:
#                 feat1, feat2 = corr_matrix.columns[i], corr_matrix.columns[j]
                
#                 # Keep the one with higher target correlation
#                 if target_correlations[feat1] >= target_correlations[feat2]:
#                     to_remove.add(feat2)
#                 else:
#                     to_remove.add(feat1)
    
#     final_features = [f for f in high_target_corr_features if f not in to_remove]
    
#     print(f"\nStep 2: Remove inter-correlated features (>{max_feature_corr})")
#     print(f"Removed {len(to_remove)} features: {list(to_remove)}")
#     print(f"Final selection: {len(final_features)} features")
    
#     # Show final selected features with their target correlations
#     print(f"\nFinal Selected Features:")
#     final_target_corrs = target_correlations[final_features].sort_values(ascending=False)
#     for i, (feature, corr) in enumerate(final_target_corrs.items(), 1):
#         print(f"{i:2d}. {feature:<35} {corr:.4f}")
    
#     return final_features

# # Apply correlation-based selection
# corr_selected_features = correlation_based_selection(data_reduced)

In [28]:
# def create_final_feature_set(data, statistical_features, correlation_features, target_col='Daily_Usage_Hours'):
#     """
#     Combine different feature selection methods to create final dataset
#     """
#     print("=== CREATING FINAL FEATURE SET ===")
    
#     # Take union of both methods
#     all_selected = list(set(statistical_features + correlation_features))
    
#     # Always include target variable
#     final_features = all_selected + [target_col]
    
#     # Create final dataset
#     data_final = data[final_features].copy()
    
#     print(f"Features from statistical selection: {len(statistical_features)}")
#     print(f"Features from correlation selection: {len(correlation_features)}")
#     print(f"Overlap between methods: {len(set(statistical_features) & set(correlation_features))}")
#     print(f"Final feature set size: {len(all_selected)} features + target")
    
#     print(f"\nFinal Selected Features:")
#     for i, feature in enumerate(sorted(all_selected), 1):
#         in_stat = "S" if feature in statistical_features else " "
#         in_corr = "C" if feature in correlation_features else " "
#         print(f"{i:2d}. [{in_stat}{in_corr}] {feature}")
    
#     print(f"\nLegend: [S] = Statistical selection, [C] = Correlation selection")
    
#     # Final correlation check
#     final_corr_matrix = data_final.drop(columns=[target_col]).corr()
#     high_corr_remaining = 0
#     for i in range(len(final_corr_matrix.columns)):
#         for j in range(i+1, len(final_corr_matrix.columns)):
#             if abs(final_corr_matrix.iloc[i, j]) > 0.8:
#                 high_corr_remaining += 1
    
#     print(f"\nFinal validation:")
#     print(f"Remaining high correlations (>0.8): {high_corr_remaining}")
#     print(f"Original dataset: {data.shape[1]} features")
#     print(f"Final dataset: {data_final.shape[1]} features")
#     print(f"Reduction: {((data.shape[1] - data_final.shape[1]) / data.shape[1] * 100):.1f}%")
    
#     return data_final, all_selected

# # Create final dataset
# data_final_selected, final_feature_list = create_final_feature_set(
#     data_reduced, selected_features, corr_selected_features
# )

In [29]:
# top 5
# data_final_selected.head()

In [30]:
# final_feature_list

In [31]:
# def visualize_feature_selection_results(original_data, final_data, target_col='Daily_Usage_Hours'):
#     """
#     Create visualizations to show the impact of feature selection
#     """
#     fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
#     # 1. Feature count comparison
#     original_features = original_data.shape[1] - 1  # exclude target
#     final_features = final_data.shape[1] - 1  # exclude target
    
#     axes[0,0].bar(['Original', 'Final'], [original_features, final_features], 
#                   color=['lightcoral', 'lightgreen'])
#     axes[0,0].set_title('Feature Count: Before vs After Selection')
#     axes[0,0].set_ylabel('Number of Features')
#     for i, v in enumerate([original_features, final_features]):
#         axes[0,0].text(i, v + 0.5, str(v), ha='center', fontweight='bold')
    
#     # 2. Correlation heatmap of final features
#     final_corr = final_data.drop(columns=[target_col]).corr()
#     im = axes[0,1].imshow(final_corr, cmap='coolwarm', vmin=-1, vmax=1)
#     axes[0,1].set_title('Correlation Matrix: Final Selected Features')
#     axes[0,1].set_xticks(range(len(final_corr.columns)))
#     axes[0,1].set_yticks(range(len(final_corr.columns)))
#     axes[0,1].set_xticklabels([f[:10] + '...' if len(f) > 10 else f 
#                               for f in final_corr.columns], rotation=45, ha='right')
#     axes[0,1].set_yticklabels([f[:10] + '...' if len(f) > 10 else f 
#                               for f in final_corr.columns])
#     plt.colorbar(im, ax=axes[0,1], shrink=0.6)
    
#     # 3. Target correlations comparison
#     original_target_corr = original_data.drop(columns=[target_col]).corrwith(original_data[target_col]).abs()
#     final_target_corr = final_data.drop(columns=[target_col]).corrwith(final_data[target_col]).abs()
    
#     axes[1,0].hist(original_target_corr, bins=20, alpha=0.7, label='Original', color='lightcoral')
#     axes[1,0].hist(final_target_corr, bins=10, alpha=0.7, label='Selected', color='lightgreen')
#     axes[1,0].set_xlabel('Absolute Correlation with Target')
#     axes[1,0].set_ylabel('Number of Features')
#     axes[1,0].set_title('Distribution of Target Correlations')
#     axes[1,0].legend()
    
#     # 4. Top features by target correlation
#     top_features = final_target_corr.sort_values(ascending=False).head(10)
#     y_pos = np.arange(len(top_features))
    
#     axes[1,1].barh(y_pos, top_features.values, color='lightblue')
#     axes[1,1].set_yticks(y_pos)
#     axes[1,1].set_yticklabels([f[:15] + '...' if len(f) > 15 else f 
#                               for f in top_features.index], fontsize=9)
#     axes[1,1].set_xlabel('Absolute Correlation with Target')
#     axes[1,1].set_title('Top 10 Selected Features by Target Correlation')
#     axes[1,1].invert_yaxis()
    
#     plt.tight_layout()
#     plt.show()
    
#     # Summary statistics
#     print(f"\n=== FEATURE SELECTION SUMMARY ===")
#     print(f"Original features: {original_features}")
#     print(f"Selected features: {final_features}")
#     print(f"Reduction: {original_features - final_features} features ({((original_features - final_features) / original_features * 100):.1f}%)")
#     print(f"Mean target correlation (original): {original_target_corr.mean():.4f}")
#     print(f"Mean target correlation (selected): {final_target_corr.mean():.4f}")
#     print(f"Max inter-feature correlation (selected): {final_corr.abs().max().max():.4f}")

# # Visualize results
# visualize_feature_selection_results(data, data_final_selected)

In [32]:
# def validate_feature_selection_with_models(original_data, selected_data, target_col='Daily_Usage_Hours'):
#     """
#     Compare model performance before and after feature selection
#     """
#     print("=== VALIDATING FEATURE SELECTION WITH MODEL PERFORMANCE ===")
    
#     from sklearn.model_selection import train_test_split, cross_val_score
#     from sklearn.ensemble import RandomForestRegressor
#     from sklearn.linear_model import Ridge
#     from sklearn.metrics import r2_score, mean_squared_error
    
#     results = []
    
#     for name, data in [('Original', original_data), ('Selected', selected_data)]:
#         X = data.drop(columns=[target_col])
#         y = data[target_col]
        
#         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
#         # Random Forest
#         rf = RandomForestRegressor(n_estimators=100, random_state=42)
#         rf_cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')
#         rf.fit(X_train, y_train)
#         rf_pred = rf.predict(X_test)
#         rf_r2 = r2_score(y_test, rf_pred)
        
#         # Ridge Regression
#         ridge = Ridge(alpha=1.0)
#         ridge_cv_scores = cross_val_score(ridge, X_train, y_train, cv=5, scoring='r2')
#         ridge.fit(X_train, y_train)
#         ridge_pred = ridge.predict(X_test)
#         ridge_r2 = r2_score(y_test, ridge_pred)
        
#         results.append({
#             'Dataset': name,
#             'Features': X.shape[1],
#             'RF_CV_R2': rf_cv_scores.mean(),
#             'RF_Test_R2': rf_r2,
#             'Ridge_CV_R2': ridge_cv_scores.mean(),
#             'Ridge_Test_R2': ridge_r2
#         })
        
#         print(f"\n{name} Dataset ({X.shape[1]} features):")
#         print(f"  Random Forest - CV R²: {rf_cv_scores.mean():.4f}, Test R²: {rf_r2:.4f}")
#         print(f"  Ridge Regression - CV R²: {ridge_cv_scores.mean():.4f}, Test R²: {ridge_r2:.4f}")
    
#     results_df = pd.DataFrame(results)
#     print(f"\n{results_df}")
    
#     return results_df

# # Validate feature selection
# validation_results = validate_feature_selection_with_models(data, data_final_selected)

In [59]:
# =============================================================================
# DATASET SELECTION GUIDE FOR MODEL TRAINING
# =============================================================================

print("🎯 DATASET SELECTION GUIDE FOR MODEL TRAINING")
print("=" * 60)

# Check available datasets
import os
import pandas as pd

datasets = {
    'Students_Cleaned_Encoded_v1.csv': 'Basic encoded dataset (36 features)',
    'Students_Cleaned_Encoded_full.csv': 'Complete dataset with all features',
    'Students_Cleaned_Encoded_selected_20.csv': 'Top 20 features by importance',
    'Students_Cleaned_Encoded_statistical.csv': 'Statistically selected features',
    'Students_Cleaned_Encoded_pca_ready.csv': 'Numeric features ready for PCA'
}

print("\n📊 AVAILABLE DATASETS:")
for filename, description in datasets.items():
    if os.path.exists(filename):
        df_temp = pd.read_csv(filename)
        print(f"✅ {filename}")
        print(f"   {description}")
        print(f"   Shape: {df_temp.shape}")
        print(f"   Features: {df_temp.shape[1] - 1} (+ target)")
        print()
    else:
        print(f"❌ {filename} - Not found")
        print()

print("🏆 RECOMMENDATIONS BY USE CASE:")
print("-" * 40)

print("\n1. 🚀 INITIAL MODEL DEVELOPMENT:")
print("   Dataset: Students_Cleaned_Encoded_selected_20.csv")
print("   Why: Best performance (R² = 0.5745) with optimal feature count")
print("   Use for: Quick prototyping, baseline models, feature importance analysis")

print("\n2. 🎯 PRODUCTION MODELS:")
print("   Dataset: Students_Cleaned_Encoded_v1.csv")
print("   Why: Balanced performance with well-tested features")
print("   Use for: Final models, deployment, consistent results")

print("\n3. 🔬 COMPREHENSIVE ANALYSIS:")
print("   Dataset: Students_Cleaned_Encoded_full.csv")
print("   Why: All features including advanced engineered features")
print("   Use for: Deep analysis, feature exploration, ensemble methods")

print("\n4. ⚡ FAST TRAINING:")
print("   Dataset: Students_Cleaned_Encoded_statistical.csv")
print("   Why: Statistically selected features for efficiency")
print("   Use for: Large-scale experiments, hyperparameter tuning")

print("\n5. 📈 DIMENSIONALITY REDUCTION:")
print("   Dataset: Students_Cleaned_Encoded_pca_ready.csv")
print("   Why: Prepared for PCA, standardized features")
print("   Use for: PCA, t-SNE, clustering, linear models")

print("\n" + "=" * 60)
print("🎖️  FINAL RECOMMENDATION:")
print("=" * 60)
print("For MOST USERS: Start with 'Students_Cleaned_Encoded_selected_20.csv'")
print("• Best performance-to-complexity ratio")
print("• Fastest training time")
print("• Good interpretability")
print("• Reduced overfitting risk")
print("=" * 60)

🎯 DATASET SELECTION GUIDE FOR MODEL TRAINING

📊 AVAILABLE DATASETS:
✅ Students_Cleaned_Encoded_v1.csv
   Basic encoded dataset (36 features)
   Shape: (3614, 36)
   Features: 35 (+ target)

✅ Students_Cleaned_Encoded_full.csv
   Complete dataset with all features
   Shape: (3614, 36)
   Features: 35 (+ target)

✅ Students_Cleaned_Encoded_selected_20.csv
   Top 20 features by importance
   Shape: (3614, 21)
   Features: 20 (+ target)

✅ Students_Cleaned_Encoded_statistical.csv
   Statistically selected features
   Shape: (3614, 21)
   Features: 20 (+ target)

✅ Students_Cleaned_Encoded_pca_ready.csv
   Numeric features ready for PCA
   Shape: (3614, 36)
   Features: 35 (+ target)

🏆 RECOMMENDATIONS BY USE CASE:
----------------------------------------

1. 🚀 INITIAL MODEL DEVELOPMENT:
   Dataset: Students_Cleaned_Encoded_selected_20.csv
   Why: Best performance (R² = 0.5745) with optimal feature count
   Use for: Quick prototyping, baseline models, feature importance analysis

2. 🎯 PRODU