# In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
from hashlib import md5

# Read the original dataset
df = pd.read_csv('B.Tech Student Profile Survey.csv')

# 1. Display original dataset info
print("Original Dataset Info:")
print("-" * 50)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the output.
df.info()

# 2. Clean and anonymize the dataset
def clean_dataset(df):
    """Return a cleaned, pseudonymized copy of the raw survey DataFrame.

    The input frame is left untouched. The returned copy has:

    * every column whose name contains ``'Unnamed'`` removed;
    * ``Full Name`` / ``Email ID`` replaced by short hash-based pseudonyms and
      ``Mobile Number`` masked down to its last four characters (missing
      values stay missing instead of becoming the text ``"nan"``);
    * the categorical text columns stripped and title-cased;
    * CGPA and weekly study hours coerced to numbers (invalid -> NaN);
    * the free-text challenge / suggestion answers mapped to a fixed category;
    * missing club / certification counts replaced by 0.

    Args:
        df: Raw survey responses as a pandas DataFrame. It must contain every
            column referenced below; a missing one raises ``KeyError``.

    Returns:
        A new, cleaned pandas DataFrame.
    """
    # Work on a copy so the caller's frame is never modified.
    cleaned_df = df.copy()

    # Remove unnamed columns (str() keeps non-string labels from raising).
    unnamed_columns = [col for col in cleaned_df.columns if 'Unnamed' in str(col)]
    cleaned_df = cleaned_df.drop(columns=unnamed_columns)

    # Anonymize personal data.
    cleaned_df['Full Name'] = cleaned_df['Full Name'].apply(
        lambda x: _pseudonymize(x, "Student_{}"))
    cleaned_df['Email ID'] = cleaned_df['Email ID'].apply(
        lambda x: _pseudonymize(x, "email_{}@anonymous.com"))
    cleaned_df['Mobile Number'] = cleaned_df['Mobile Number'].apply(_mask_mobile)

    # Standardize text columns. Only real strings are touched: the .str
    # accessor would raise on a numeric column and would turn non-string cells
    # of a mixed column into NaN.
    # NOTE: str.title() capitalizes after any non-letter ('2nd' -> '2Nd').
    text_columns = ['Gender', 'Branch', 'Year of Study', 'Have you done any internships?']
    for col in text_columns:
        cleaned_df[col] = cleaned_df[col].map(
            lambda value: value.strip().title() if isinstance(value, str) else value)

    # Convert CGPA and Study Hours to numeric, replacing invalid values with NaN
    for col in ('Current CGPA (out of 10)', 'Average Study Hours per Week'):
        cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')

    # Clean and categorize the free-text answers
    cleaned_df['Main Challenges Faced During Study'] = \
        cleaned_df['Main Challenges Faced During Study'].apply(_clean_challenge)
    cleaned_df['Any Suggestions to Improve Academic or Career Support'] = \
        cleaned_df['Any Suggestions to Improve Academic or Career Support'].apply(_clean_suggestion)

    # Replace missing counts with 0. The result is assigned back instead of
    # using fillna(inplace=True) on a column selection: that chained in-place
    # form is deprecated and does not update the frame under Copy-on-Write.
    count_columns = [
        'Number of Technical Clubs or Competitions Participated In',
        'Number of Certifications or Online Courses Completed',
    ]
    for col in count_columns:
        cleaned_df[col] = cleaned_df[col].fillna(0)

    return cleaned_df


# Ordered (category, pattern) rules; the first pattern found in the lower-cased
# answer wins, so the order below is significant.
_CHALLENGE_RULES = (
    ('Time Management', re.compile(r'time|schedule|managing|consistency|regular')),
    ('Concentration', re.compile(r'concentration|focus|distraction|attention')),
    ('Academic Difficulty', re.compile(r'understanding|concepts|difficult|complexity')),
    ('Study-Life Balance', re.compile(r'balance|pressure|stress|workload')),
    ('Technical Issues', re.compile(r'technical|internet|online|system')),
    ('Motivation', re.compile(r'motivation|interest|boring|procrastination')),
)

# 'lab' is anchored to a word start: as a bare substring it also matches
# "syllabus", "available", "collaboration", which sent every syllabus remark to
# Infrastructure before Curriculum was ever tested. The "-y" keywords also
# accept their "-ies" plural.
_SUGGESTION_RULES = (
    ('Practical Learning', re.compile(r'practical|hands-on|project|industr(?:y|ies)')),
    ('Teaching Methods', re.compile(r'teaching|faculty|lecture|class')),
    ('Infrastructure', re.compile(r'infrastructure|facilit(?:y|ies)|equipment|\blab')),
    ('Career Support', re.compile(r'placement|career|internship|guidance')),
    ('Curriculum', re.compile(r'syllabus|course|curriculum|subject')),
    ('Extra Activities', re.compile(r'activit(?:y|ies)|club|competition|workshop')),
)

# Answers (after lower-casing and stripping) that mean "nothing to suggest".
_NO_SUGGESTION_ANSWERS = frozenset({'-', 'no', 'none', 'nil'})


def _pseudonymize(value, template):
    """Fill ``template`` with an 8-hex-digit MD5 prefix of ``value``; keep NaN as is."""
    if pd.isna(value):
        return value
    # NOTE(review): an unsalted, truncated MD5 is a stable pseudonym, not strong
    # anonymization -- a known name or e-mail can be re-hashed and matched.
    digest = md5(str(value).encode()).hexdigest()[:8]
    return template.format(digest)


def _mask_mobile(value):
    """Mask a mobile number down to its last four characters; keep NaN as is."""
    if pd.isna(value):
        return value
    # A column containing blanks is read as float, so 9876543210 arrives as
    # 9876543210.0; drop the ".0" before slicing or the "last four" is "10.0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return f"xxxxxx{str(value).strip()[-4:]}"


def _categorize(text, rules, default):
    """Return the label of the first rule whose pattern occurs in ``text``."""
    for label, pattern in rules:
        if pattern.search(text):
            return label
    return default


def _clean_challenge(value):
    """Map a free-text study challenge to one of the challenge categories."""
    if pd.isna(value):
        return "Not Specified"
    return _categorize(str(value).lower().strip(), _CHALLENGE_RULES, "Other")


def _clean_suggestion(value):
    """Map a free-text suggestion to one of the suggestion categories."""
    if pd.isna(value):
        return "No Suggestion"
    # Normalize before testing for "no suggestion" so that "No", "NONE", " nil "
    # are recognised, and so a non-string cell cannot raise on .strip().
    text = str(value).lower().strip()
    if text in _NO_SUGGESTION_ANSWERS:
        return "No Suggestion"
    return _categorize(text, _SUGGESTION_RULES, "Other Suggestions")

# Apply cleaning
cleaned_df = clean_dataset(df)

# 3. Display cleaned dataset info
print("\nCleaned Dataset Info:")
print("-" * 50)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the output.
cleaned_df.info()

# 4. Show summary statistics of numerical columns
print("\nSummary Statistics of Numerical Columns:")
print("-" * 50)
display(cleaned_df.describe())

# 5. Display sample of cleaned data
print("\nSample of Cleaned Data:")
print("-" * 50)
display(cleaned_df.head())

# 6. Show changes made
print("\nChanges in Dataset:")
print("-" * 50)
print(f"Original shape: {df.shape}")
print(f"Cleaned shape: {cleaned_df.shape}")
# Sorted so the listing is the same on every run (set order is not stable).
removed_columns = sorted(set(df.columns) - set(cleaned_df.columns), key=str)
print("\nColumns removed:", removed_columns)

# Save cleaned dataset (index=False keeps the row index out of the CSV)
cleaned_df.to_csv('cleaned_BTech_Survey.csv', index=False)
print("\nCleaned dataset has been saved as 'cleaned_BTech_Survey.csv'")



