In [5]:
import pandas as pd

# Load dataset
df = pd.read_csv("variant_sample_25k.csv")

# Remove rows with missing alleles or diseases
df = df.dropna(subset=["ReferenceAllele", "AlternateAllele", "PhenotypeList"])

# Function to scan sequence for known mutations
def predict_disease_from_sequence(dna_seq, dataset):
    results = []
    dna_seq = dna_seq.upper()
    
    for _, row in dataset.iterrows():
        ref = row["ReferenceAllele"].upper()
        alt = row["AlternateAllele"].upper()
        disease = row["PhenotypeList"]

        # Check if reference allele is in sequence
        if ref in dna_seq:
            # Simulate mutation
            mutated_seq = dna_seq.replace(ref, alt, 1)
            results.append((ref, alt, disease, mutated_seq))
    
    return results

# Example usage
sample_sequence = "ATCGTACCGGTACAGCTGAACGATCGGCTAGTGGTCG"  # Replace with your DNA input
predictions = predict_disease_from_sequence(sample_sequence, df)

# Display results
if predictions:
    print("ðŸ§¬ Possible Mutations & Associated Diseases:\n")
    for ref, alt, disease, mutated_seq in predictions:
        print(f"Reference: {ref} -> Alternate: {alt}")
        print(f"Disease: {disease}")
        print(f"Mutated Sequence: {mutated_seq}\n")
else:
    print("âœ… No known disease-associated mutations found in the input DNA.")


ðŸ§¬ Possible Mutations & Associated Diseases:

Reference: CTG -> Alternate: NA
Disease: Spinocerebellar ataxia type 8
Mutated Sequence: ATCGTACCGGTACAGNAAACGATCGGCTAGTGGTCG

Reference: CTG -> Alternate: NA
Disease: Spinocerebellar ataxia type 8
Mutated Sequence: ATCGTACCGGTACAGNAAACGATCGGCTAGTGGTCG

Reference: T -> Alternate: C(2_7)
Disease: Aminoglycoside-induced deafness
Mutated Sequence: AC(2_7)CGTACCGGTACAGCTGAACGATCGGCTAGTGGTCG

Reference: T -> Alternate: C(2_7)
Disease: Aminoglycoside-induced deafness
Mutated Sequence: AC(2_7)CGTACCGGTACAGCTGAACGATCGGCTAGTGGTCG



In [2]:
# Preprocessing: One-hot encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['GeneSymbol', 'Type'])
    ],
    remainder='passthrough'
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(RandomForestClassifier()))
])

pipeline.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimator,RandomForestClassifier()
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [3]:
def predict_disease_from_sequence(chrom, start, end, ref_allele, alt_allele, assembly):
    # Calculate variant characteristics
    variant_length = end - start + 1
    if ref_allele == "-":
        var_type = "Insertion"
    elif alt_allele == "-":
        var_type = "Deletion"
    elif len(ref_allele) == len(alt_allele):
        var_type = "SNV"
    else:
        var_type = "Indel"

    # Map to gene symbol (requires annotation database - simplified here)
    gene_symbol = "AP5Z1"  # In reality, use a tool like ANNOVAR or Ensembl API

    # Create input features
    input_data = pd.DataFrame([{
        'GeneSymbol': gene_symbol,
        'Type': var_type,
        'VariantLength': variant_length
    }])
    
    # Predict diseases
    probas = pipeline.predict_proba(input_data)
    predictions = {}
    for i, disease in enumerate(mlb.classes_):
        # Get probability for the positive class
        proba = probas[i][0][1] if hasattr(probas[i], 'ndim') else probas[i][0][0, 1]
        if proba > 0.5:  # Threshold can be adjusted
            predictions[disease] = proba
    
    return predictions

# Example usage:
user_input = {
    'chrom': 7,
    'start': 4820845,
    'end': 4820848,
    'ref_allele': 'ATG',
    'alt_allele': '-',  # Deletion
    'assembly': 'GRCh37'
}

result = predict_disease_from_sequence(**user_input)
print(result)  # Output: {'Hereditary spastic paraplegia 48': 0.85, ...}

IndexError: index 1 is out of bounds for axis 0 with size 1

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
import re
import warnings
warnings.filterwarnings('ignore')

class GeneticDiseasePredictor:
    def __init__(self):
        self.model = None
        self.label_encoders = {}
        self.mlb = MultiLabelBinarizer()
        self.scaler = StandardScaler()
        self.feature_columns = []
        self.training_stats = {}
        
    def clean_and_preprocess_dataset(self, df):
        """
        Comprehensive cleaning and preprocessing of the genetic variant dataset
        """
        print("Starting dataset preprocessing...")
        print(f"Initial dataset shape: {df.shape}")
        
        # Create a copy to avoid modifying original data
        df = df.copy()
        
        # Display initial data info
        print("\nInitial data types:")
        print(df.dtypes)
        print("\nMissing values per column:")
        print(df.isnull().sum())
        
        # 1. Handle missing values in required columns
        required_columns = ['Chromosome', 'Start', 'Stop', 'GeneSymbol', 'ClinicalSignificance', 'PhenotypeList', 'Type', 'Assembly']
        
        # Check if all required columns exist
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")
        
        # 2. Clean numeric columns
        numeric_columns = ['Chromosome', 'Start', 'Stop']
        
        for col in numeric_columns:
            # Convert to string first to handle mixed types, then to numeric
            df[col] = df[col].astype(str).replace(['nan', 'NaN', 'None', ''], np.nan)
            df[col] = pd.to_numeric(df[col], errors='coerce')
        
        # Handle chromosome encoding (X=23, Y=24, MT=25)
        def encode_chromosome(chr_val):
            if pd.isna(chr_val):
                return 1  # Default to chromosome 1
            chr_str = str(chr_val).upper().replace('CHR', '')
            if chr_str == 'X':
                return 23
            elif chr_str == 'Y':
                return 24
            elif chr_str in ['MT', 'M']:
                return 25
            else:
                try:
                    return int(float(chr_str))
                except:
                    return 1  # Default fallback
        
        df['Chromosome'] = df['Chromosome'].apply(encode_chromosome)
        
        # Fill missing numeric values with median
        for col in ['Start', 'Stop']:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
        
        # Ensure Start < Stop
        df['Stop'] = np.maximum(df['Stop'], df['Start'] + 1)
        
        print(f"After numeric cleaning: {df.shape}")
        
        # 3. Clean categorical columns
        categorical_columns = ['GeneSymbol', 'ClinicalSignificance', 'Type', 'Assembly']
        
        for col in categorical_columns:
            # Convert to string and handle various null representations
            df[col] = df[col].astype(str)
            df[col] = df[col].replace(['nan', 'NaN', 'None', 'null', ''], 'unknown')
            df[col] = df[col].str.strip()  # Remove whitespace
        
        # 4. Clean and process PhenotypeList
        df['PhenotypeList'] = df['PhenotypeList'].astype(str)
        df['PhenotypeList'] = df['PhenotypeList'].replace(['nan', 'NaN', 'None', 'null', ''], 'not provided')
        
        # Extract diseases from PhenotypeList
        def extract_diseases(phenotype_str):
            if pd.isna(phenotype_str) or str(phenotype_str).lower() in ['nan', 'not provided', 'unknown', '']:
                return []
            
            # Split by | and clean each disease
            diseases = str(phenotype_str).split('|')
            cleaned_diseases = []
            
            for disease in diseases:
                disease = disease.strip()
                if disease and disease.lower() not in ['nan', 'not provided', 'unknown', 'none', '']:
                    cleaned_diseases.append(disease)
            
            return cleaned_diseases
        
        df['Diseases'] = df['PhenotypeList'].apply(extract_diseases)
        
        # Filter out rows with no valid diseases
        print(f"Rows before disease filtering: {len(df)}")
        df = df[df['Diseases'].apply(len) > 0]
        print(f"Rows after disease filtering: {len(df)}")
        
        if len(df) == 0:
            raise ValueError("No rows with valid disease labels found!")
        
        # 5. Filter by clinical significance (keep only pathogenic variants)
        pathogenic_terms = ['pathogenic', 'likely pathogenic', 'pathogenic/likely pathogenic']
        df = df[df['ClinicalSignificance'].str.lower().isin(pathogenic_terms)]
        print(f"Rows after pathogenic filtering: {len(df)}")
        
        if len(df) == 0:
            raise ValueError("No pathogenic variants found!")
        
        # 6. Encode categorical features
        for feature in categorical_columns:
            if feature not in self.label_encoders:
                self.label_encoders[feature] = LabelEncoder()
                df[f'{feature}_encoded'] = self.label_encoders[feature].fit_transform(df[feature])
            else:
                # Handle unseen categories
                unique_values = set(self.label_encoders[feature].classes_)
                df[feature] = df[feature].apply(lambda x: x if x in unique_values else self.label_encoders[feature].classes_[0])
                df[f'{feature}_encoded'] = self.label_encoders[feature].transform(df[feature])
        
        # 7. Create derived features
        df['Length'] = df['Stop'] - df['Start']
        df['Length'] = df['Length'].clip(lower=1)  # Ensure positive lengths
        
        # Normalize genomic positions
        start_min, start_max = df['Start'].min(), df['Start'].max()
        if start_max == start_min:
            df['Position_normalized'] = 0.5
        else:
            df['Position_normalized'] = (df['Start'] - start_min) / (start_max - start_min)
        
        # Store training statistics
        self.training_stats = {
            'start_min': start_min,
            'start_max': start_max,
            'chromosome_range': [df['Chromosome'].min(), df['Chromosome'].max()],
            'length_stats': [df['Length'].min(), df['Length'].max(), df['Length'].mean()]
        }
        
        # 8. Create feature matrix
        self.feature_columns = [
            'Chromosome', 'Position_normalized', 'Length',
            'GeneSymbol_encoded', 'ClinicalSignificance_encoded',
            'Type_encoded', 'Assembly_encoded'
        ]
        
        X = df[self.feature_columns].copy()
        
        # Ensure all features are numeric
        for col in self.feature_columns:
            X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X = pd.DataFrame(X_scaled, columns=self.feature_columns)
        
        # 9. Prepare multi-label target
        y = self.mlb.fit_transform(df['Diseases'])
        
        print(f"\nFinal preprocessing results:")
        print(f"Feature matrix shape: {X.shape}")
        print(f"Target matrix shape: {y.shape}")
        print(f"Number of unique diseases: {len(self.mlb.classes_)}")
        print(f"Disease classes: {list(self.mlb.classes_)[:10]}...")  # Show first 10
        
        return X, y, df
    
    def simulate_dna_sequence_processing(self, dna_sequence, chromosome=7):
        """
        Simulate processing of DNA sequence to extract variant features
        """
        variants = []
        sequence_length = len(dna_sequence)
        
        # Simulate finding variants based on sequence characteristics
        for i in range(0, sequence_length, 50):  # Check every 50 bases
            if i < sequence_length - 3:
                subsequence = dna_sequence[i:i+4]
                
                # Mock variant based on sequence content
                if any(pattern in subsequence for pattern in ['ATG', 'TAA', 'TGA', 'TAG']):
                    variant = {
                        'Chromosome': float(chromosome),
                        'Start': float(1000000 + i),  # Simulate genomic position
                        'Stop': float(1000000 + i + len(subsequence)),
                        'GeneSymbol': 'SIM_GENE',
                        'ClinicalSignificance': 'Pathogenic',
                        'Type': 'SNP' if len(subsequence) == 1 else 'Indel',
                        'Assembly': 'GRCh38'
                    }
                    variants.append(variant)
        
        return variants
    
    def extract_features_from_sequence(self, dna_sequence):
        """
        Extract features from DNA sequence that match dataset format
        """
        variants = self.simulate_dna_sequence_processing(dna_sequence)
        
        if not variants:
            # Return default features if no variants found
            default_features = [0.0] * len(self.feature_columns)  # All zeros after scaling
            return pd.DataFrame([default_features], columns=self.feature_columns)
        
        features_list = []
        for variant in variants:
            # Safely encode categorical features
            gene_encoded = self._safe_encode('GeneSymbol', variant['GeneSymbol'])
            clinical_encoded = self._safe_encode('ClinicalSignificance', variant['ClinicalSignificance'])
            type_encoded = self._safe_encode('Type', variant['Type'])
            assembly_encoded = self._safe_encode('Assembly', variant['Assembly'])
            
            # Normalize position using training stats
            if self.training_stats['start_max'] == self.training_stats['start_min']:
                position_normalized = 0.5
            else:
                position_normalized = (variant['Start'] - self.training_stats['start_min']) / \
                                    (self.training_stats['start_max'] - self.training_stats['start_min'])
            
            features = [
                float(variant['Chromosome']),
                position_normalized,
                float(variant['Stop'] - variant['Start']),
                float(gene_encoded),
                float(clinical_encoded),
                float(type_encoded),
                float(assembly_encoded)
            ]
            features_list.append(features)
        
        # Create DataFrame and scale features
        features_df = pd.DataFrame(features_list, columns=self.feature_columns)
        features_scaled = self.scaler.transform(features_df)
        return pd.DataFrame(features_scaled, columns=self.feature_columns)
    
    def _safe_encode(self, encoder_name, value):
        """Safely encode categorical values, handling unseen categories"""
        try:
            if encoder_name in self.label_encoders:
                encoder = self.label_encoders[encoder_name]
                if value in encoder.classes_:
                    return encoder.transform([value])[0]
                else:
                    return encoder.transform([encoder.classes_[0]])[0]
            else:
                return 0
        except:
            return 0
    
    def train_model(self, X, y):
        """
        Train the disease prediction model
        """
        print("Training model...")
        
        # Ensure all data is numeric and finite
        X = np.nan_to_num(X.astype(float), nan=0.0, posinf=0.0, neginf=0.0)
        
        if len(X) == 0 or len(y) == 0:
            raise ValueError("No valid training data available")
        
        print(f"Training with {len(X)} samples and {X.shape[1]} features")
        print(f"Target labels shape: {y.shape}")
        
        # Split data for validation
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Use MultiOutputClassifier for multi-label classification
        self.model = MultiOutputClassifier(
            RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        )
        self.model.fit(X_train, y_train)
        
        # Evaluate model
        y_pred = self.model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Model accuracy on test set: {accuracy:.4f}")
        
    def predict_disease(self, dna_sequence):
        """
        Predict diseases from DNA sequence
        """
        if self.model is None:
            return "Model not trained yet!"
        
        try:
            # Extract features from DNA sequence
            features = self.extract_features_from_sequence(dna_sequence)
            features = np.nan_to_num(features.astype(float), nan=0.0, posinf=0.0, neginf=0.0)
            
            # Make prediction
            predictions = self.model.predict(features)
            
            # Convert predictions back to disease names
            predicted_diseases = self.mlb.inverse_transform(predictions)
            
            results = []
            for i, diseases in enumerate(predicted_diseases):
                if diseases:
                    results.append({
                        'variant_index': i,
                        'predicted_diseases': list(diseases),
                        'confidence': 'Medium'
                    })
            
            return results if results else [{'predicted_diseases': ['No pathogenic variants detected'], 'confidence': 'N/A'}]
        
        except Exception as e:
            return f"Error in prediction: {str(e)}"

def main():
    try:
        # Load your dataset
        print("Loading dataset...")
        df = pd.read_csv('variant_sample_25k.csv')
        
        print(f"Loaded dataset with shape: {df.shape}")
        print("\nColumn names:")
        print(list(df.columns))
        
        # Initialize predictor
        predictor = GeneticDiseasePredictor()
        
        # Clean and preprocess dataset
        X, y, processed_df = predictor.clean_and_preprocess_dataset(df)
        
        # Train model
        predictor.train_model(X, y)
        
        print("\nModel training completed successfully!")
        
        # Test with sample DNA sequence
        sample_dna = "ATGCGATCGATGCTAGCTAGCTAGCTAGCATGCGATCGATGCTAATAAATGCGATCGATGCTAGCTTAG"
        
        print(f"\nTesting with sample DNA sequence (length: {len(sample_dna)})...")
        
        results = predictor.predict_disease(sample_dna)
        
        print("\nPrediction Results:")
        if isinstance(results, str):
            print(results)
        else:
            for i, result in enumerate(results):
                print(f"Result {i+1}:")
                print(f"  Predicted diseases: {result['predicted_diseases']}")
                print(f"  Confidence: {result['confidence']}")
        
        # Show some dataset statistics
        print(f"\nDataset Statistics:")
        print(f"Total samples after cleaning: {len(processed_df)}")
        print(f"Unique genes: {processed_df['GeneSymbol'].nunique()}")
        print(f"Unique disease categories: {len(predictor.mlb.classes_)}")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        print("Please check:")
        print("1. File path is correct")
        print("2. CSV file has the required columns")
        print("3. Data quality and format")

if __name__ == "__main__":
    main()


Loading dataset...
Loaded dataset with shape: (25000, 10)

Column names:
['ReferenceAllele', 'AlternateAllele', 'Chromosome', 'Start', 'Stop', 'GeneSymbol', 'ClinicalSignificance', 'PhenotypeList', 'Type', 'Assembly']
Starting dataset preprocessing...
Initial dataset shape: (25000, 10)

Initial data types:
ReferenceAllele         object
AlternateAllele         object
Chromosome              object
Start                    int64
Stop                     int64
GeneSymbol              object
ClinicalSignificance    object
PhenotypeList           object
Type                    object
Assembly                object
dtype: object

Missing values per column:
ReferenceAllele         0
AlternateAllele         0
Chromosome              0
Start                   0
Stop                    0
GeneSymbol              0
ClinicalSignificance    0
PhenotypeList           0
Type                    0
Assembly                0
dtype: int64
After numeric cleaning: (25000, 10)
Rows before disease filtering: 