In [13]:
class Car:
    """A car described by its windows, doors and engine type."""

    def __init__(self, windows, doors, enginetype):
        """Store the three constructor arguments as same-named attributes."""
        self.windows, self.doors, self.enginetype = windows, doors, enginetype

    def drive(self):
        """Print a one-line message that names the car's engine type."""
        message = f"The person will drive the {self.enginetype} car "
        print(message)

In [15]:
# Build a diesel car and print its drive message.
car1 = Car(windows=4, doors=4, enginetype="diesel")
car1.drive()

The person will drive the diesel car 


In [22]:
class Tesla(Car):
    """A Car that additionally records whether it supports self driving."""

    def __init__(self, windows, doors, enginetype, is_selfdriving):
        """Initialise the Car attributes, then store the self-driving flag."""
        super().__init__(windows, doors, enginetype)
        self.is_selfdriving = is_selfdriving

    def selfdriving(self):
        """Print the value of the self-driving flag."""
        status = f"Tesla supports self driving : {self.is_selfdriving}"
        print(status)

In [23]:
# Build an electric Tesla with the self-driving flag set and report it.
tesla1 = Tesla(windows=4, doors=5, enginetype="electric", is_selfdriving=True)
tesla1.selfdriving()

Tesla supports self driving : True


In [None]:
import os
import gc
import torch
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer
# moving files
! cp /kaggle/input/deepseekcodingpatternpipeline/other/default/3/* /kaggle/working
import sys
sys.path.append('/kaggle/working/DeepseekCodingPatternPipeline_v3.py')
from DeepseekCodingPatternPipeline_v3 import DeepseekCodingPatternPipeline_v3
if __name__ == "__main__":
    import torch
    from IPython.display import HTML, Markdown, display

    # Hand cached-but-unused CUDA memory back before the model is loaded.
    torch.cuda.empty_cache()

    # Folder holding the model files; change this path to match your setup.
    model_folder = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-7b/1"

    # Build the specialised pipeline, requesting the CPU so that a GPU
    # out-of-memory error is avoided (relies on the pipeline accepting `device`).
    pipeline = DeepseekCodingPatternPipeline_v3(model_folder, device="cpu")
    
    # The existing code to be optimized (as a multiline string).
    existing_code = """
import h5py
import numpy as np
import cudf
import cupy as cp
import pandas as pd
import matplotlib.pyplot as plt

from cuml.preprocessing import MinMaxScaler
from cuml.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.isotonic import IsotonicRegression

class CellTypeGPUPipeline:
    def __init__(self, h5_file_path):
        self.h5_file_path = h5_file_path
        self.train_spot_tables = {}
        self.cell_type_columns = None

    def load_train_data(self):
        with h5py.File(self.h5_file_path, "r") as f:
            train_spots = f["spots/Train"]
            for slide_name in train_spots.keys():
                spot_array = np.array(train_spots[slide_name])
                df = pd.DataFrame(spot_array)
                self.train_spot_tables[slide_name] = df
        print("Training data loaded (GPU version) successfully.")

    def prepare_training_set(self, slide_id='S_1'):
        if slide_id not in self.train_spot_tables:
            raise ValueError(f"Slide {slide_id} not found in training data.")
        df = self.train_spot_tables[slide_id]
        feature_cols = ['x', 'y']
        target_cols = [col for col in df.columns if col not in feature_cols]
        self.cell_type_columns = target_cols
        X = df[feature_cols].astype('float32')
        y = df[target_cols].astype('float32')
        return X, y

    def perform_eda(self, slide_id='S_1'):
        if slide_id not in self.train_spot_tables:
            raise ValueError(f"Slide {slide_id} not found in training data.")
        df = self.train_spot_tables[slide_id]
        print("Descriptive statistics for slide", slide_id)
        print(df.describe())
        df[['x', 'y']].hist(bins=30, figsize=(12, 5))
        plt.suptitle("Histograms of 'x' and 'y' features")
        plt.show()

def load_test_data(h5_file_path, slide_id):
    with h5py.File(self.h5_file_path, "r") as f:
        test_spots = f["spots/Test"]
        if slide_id not in test_spots:
            raise ValueError(f"Slide {slide_id} not found in test data.")
        spot_array = np.array(test_spots[slide_id])
        df = pd.DataFrame(spot_array)
    print(f"Test data for slide {slide_id} loaded successfully.")
    return df

def create_submission(test_df, predictions, submission_filename, cell_type_columns):
    pred_df = pd.DataFrame(predictions, columns=cell_type_columns, index=test_df.index)
    pred_df.insert(0, 'ID', test_df.index)
    pred_df.to_csv(submission_filename, index=False)
    print(f"Submission file '{submission_filename}' created!")

def calibrate_model(best_model, X_train_np, y_train_np):
    y_train_pred = best_model.predict(X_train_np)
    n_targets = y_train_np.shape[1]
    calibrators = []
    for i in range(n_targets):
        iso = IsotonicRegression(out_of_bounds='clip')
        iso.fit(y_train_pred[:, i], y_train_np[:, i])
        calibrators.append(iso)
    return calibrators

def apply_calibration(predictions, calibrators):
    n_targets = predictions.shape[1]
    calibrated_preds = np.zeros_like(predictions)
    for i in range(n_targets):
        calibrated_preds[:, i] = calibrators[i].predict(predictions[:, i])
    return calibrated_preds

if __name__ == "__main__":
    h5_file_path = "/kaggle/input/el-hackathon-2025/elucidata_ai_challenge_data.h5"
    pipeline_obj = CellTypeGPUPipeline(h5_file_path)
    pipeline_obj.load_train_data()
    pipeline_obj.perform_eda(slide_id='S_1')
    X_train, y_train = pipeline_obj.prepare_training_set(slide_id='S_1')
    if hasattr(X_train, 'to_pandas'):
        X_train_np = X_train.to_pandas().values
    else:
        X_train_np = X_train.values
    if hasattr(y_train, 'to_pandas'):
        y_train_np = y_train.to_pandas().values
    else:
        y_train_np = y_train.values
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', MultiOutputRegressor(RandomForestRegressor(random_state=42)))
    ])
    param_grid = {
        'model__estimator__n_estimators': [50, 100, 200],
        'model__estimator__max_depth': [None, 10, 20],
        'model__estimator__min_samples_split': [2, 5, 10]
    }
    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
    grid_search.fit(X_train_np, y_train_np)
    best_params_grid = grid_search.best_params_
    best_score_grid = -grid_search.best_score_
    print("Best parameters from GridSearchCV:", best_params_grid)
    print("Best CV MSE from GridSearchCV:", best_score_grid)
    random_search = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error',
                                         n_iter=5, random_state=42, verbose=2)
    random_search.fit(X_train_np, y_train_np)
    best_params_random = random_search.best_params_
    best_score_random = -random_search.best_score_
    print("Best parameters from RandomizedSearchCV:", best_params_random)
    print("Best CV MSE from RandomizedSearchCV:", best_score_random)
    best_model = grid_search.best_estimator_
    calibrators = calibrate_model(best_model, X_train_np, y_train_np)
    test_df = load_test_data(h5_file_path, slide_id='S_7')
    X_test = test_df[['x', 'y']].astype('float32')
    X_test_np = X_test.values
    test_predictions = best_model.predict(X_test_np)
    calibrated_predictions = apply_calibration(test_predictions, calibrators)
    create_submission(test_df, calibrated_predictions, submission_filename="submission.csv",
                      cell_type_columns=pipeline_obj.cell_type_columns)
"""
    # Cap generation at 1024 new tokens; pass a smaller value (e.g. 512) to
    # reduce memory usage during generation.
    optimized_code = pipeline.optimize_code(existing_code, max_new_tokens=1024)

    # Show a heading first, then the pipeline's markdown-formatted result.
    heading = Markdown("### Optimized Code (Markdown)")
    display(heading)
    formatted_result = pipeline.format_markdown(optimized_code)
    display(Markdown(formatted_result))

'''<think>
Okay, so I need to optimize the given code for better performance and readability. Let me look through the code step by step to identify areas that can be improved.

First, I notice that the code imports several libraries like h5py, numpy, cudf, cupy, pandas, and matplotlib. But looking at the code, it's using pandas for data loading and preprocessing, which might not be the most efficient for GPU operations. Maybe switching to cudf for data manipulation could improve performance since it's designed for GPU acceleration.

Next, the CellTypeGPUPipeline class has an __init__ method that takes h5_file_path and initializes some attributes. The load_train_data method uses h5py to load training data into DataFrames. However, loading data in a loop can be slow. Perhaps using a generator or loading all data at once could be more efficient. Also, the code uses df = pd.DataFrame(spot_array), which creates a DataFrame for each slide. Maybe using a more memory-efficient structure or reading all data together would help.

In prepare_training_set, the code extracts feature_cols and target_cols. The use of df[feature_cols] and df[target_cols] can be converted to cupy arrays for GPU processing, which would speed things up. Also, the code later uses X_train.values and y_train.values, which copies data to CPU memory. If we keep everything on the GPU, we can avoid these copies.

The perform_eda method uses pandas and matplotlib, which are CPU-bound. If the data is already loaded onto the GPU, maybe we should process it there instead of bringing it back to the CPU for plotting. This would reduce memory transfers and speed up the EDA.

Looking at the load_test_data function, it's similar to load_train_data but doesn't specify the slide_id in the with block. That's a bit odd. It should probably include the slide_id when opening the file. Also, returning the DataFrame without specifying the index might not be optimal, but it's not a critical issue here.

In create_submission, the code creates a DataFrame with predictions and writes it to a CSV. This is CPU operations again. If the predictions are already on the GPU, we should write them directly without transferring data back to the CPU.

The calibrate_model and apply_calibration functions use IsotonicRegression, which is part of scikit-learn. These functions are called after the model is trained on CPU data. If the model training can be done on the GPU, we should consider moving that part as well. However, scikit-learn's MultiOutputRegressor and IsotonicRegression might not support GPU acceleration, so this might not be feasible. Alternatively, we can keep the calibration on the CPU since it's a post-processing step.

The apply_calibration function is written in pure Python with loops, which is not efficient. Using vectorized operations with cupy arrays would be much faster. Instead of looping through each target, we can process all targets at once using cupy's capabilities.

The GridSearchCV and RandomizedSearchCV are used for hyperparameter tuning. These are CPU-bound operations as well. If the model training can be accelerated on the GPU, we should parallelize the search. However, scikit-learn's GridSearchCV doesn't support GPU acceleration, so we might need to implement a custom search or use a different approach.

In the main execution block, the code converts X_train and y_train to NumPy arrays. This is unnecessary if we're keeping everything on the GPU. Instead, we can use cupy arrays throughout. Also, the code uses h5py to load data, which is slow for large datasets. Switching to a library that handles data more efficiently on the GPU would be better.

Another point is that the code uses print statements extensively for output. While this is fine for small datasets, for larger datasets, it can slow down the code. Using context managers or suppressing print statements where not needed could improve performance.

Looking at the variable names and code structure, some are a bit messy. For example, the pipeline_obj is created but not used beyond that. Also, the code has several redundant imports, like matplotlib, which might not be used.

To summarize, the main areas for optimization are:

1. **Data Loading and Conversion**: Use cudf for in-memory data loading and manipulation to leverage GPU acceleration. Convert data to cupy arrays earlier to avoid repeated CPU-GPU transfers.

2. **Preprocessing and Feature Engineering**: Process data on the GPU using cupy functions instead of pandas and matplotlib for EDA.

3. **Model Training and Calibration**: Move as much as possible to the GPU. However, since some parts (like calibration) are CPU-bound, focus on GPU-accelerated parts.

4. **Optimization Techniques**: Use batch processing, vectorized operations, and minimize data transfers between CPU and GPU.

5. **Code Structure and Redundancies**: Remove unnecessary code, imports, and redundant variables to clean up the code.

6. **Performance Impro
 
'''