## **A. Importing and INstalling required libraries**

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import joblib
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_selection import SelectFromModel

## **B. Uploading Datasets**

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_labels_df = pd.read_csv('train_labels.csv')

## **C. Explore Data**

In [None]:
print('Train DataFrame Info:')
train_df.info()
print('\nTest DataFrame Info:')
test_df.info()
print('\nTrain Labels DataFrame Info:')
train_labels_df.info()

print('\nTrain DataFrame Head:')
print(train_df.head())
print('\nTest DataFrame Head:')
print(test_df.head())
print('\nTrain Labels DataFrame Head:')
print(train_labels_df.head())

print('\nMissing values in train_df (Class column):')
print(train_df['Class'].isnull().sum())

print('\nClass distribution in train_labels_df:')
print(train_labels_df['Class'].value_counts())

Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Columns: 14574 entries, Id to Class
dtypes: float64(14573), object(1)
memory usage: 44.5+ MB

Test DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401 entries, 0 to 400
Columns: 14573 entries, Id to gene_20642
dtypes: float64(14572), object(1)
memory usage: 44.6+ MB

Train Labels DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      150 non-null    object
 1   Class   150 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 2.5+ KB

Train DataFrame Head:
           Id    gene_1    gene_3    gene_5    gene_7    gene_8    gene_9  \
0  sample_664  0.160738 -0.327348 -0.144638  0.196493 -1.105093  0.309926   
1  sample_215 -0.771173  0.885819 -0.234209  0.273139  0.132208 -0.249541   
2  sample_343 -0.169258  1.908618  0.16

## **D. Preprocess Data**

In [None]:
# Merge train_df with train_labels_df to get the true labels for the labeled samples
train_df = pd.merge(train_df, train_labels_df, on='Id', how='left', suffixes=('_drop', None))
train_df.drop(columns=['Class_drop'], inplace=True)

# Separate features and target variable
X_train = train_df.drop(columns=['Id', 'Class'])
y_train = train_df['Class']
X_test = test_df.drop(columns=['Id'])

# Impute missing values in gene features using the mean strategy
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

X_train_imputed_df = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_test_imputed_df = pd.DataFrame(X_test_imputed, columns=X_test.columns)

# Save preprocessed data
X_train_imputed_df.to_csv('X_train_preprocessed.csv', index=False)
y_train.to_csv('y_train_preprocessed.csv', index=False)
X_test_imputed_df.to_csv('X_test_preprocessed.csv', index=False)

print('Data preprocessing complete. Saved X_train_preprocessed.csv, y_train_preprocessed.csv, and X_test_preprocessed.csv')

Data preprocessing complete. Saved X_train_preprocessed.csv, y_train_preprocessed.csv, and X_test_preprocessed.csv


## **E. Semi-Supervised Model**

In [None]:
# Combine X_train and y_train for easier splitting into labeled/unlabeled
train_combined = X_train_imputed_df.copy()
train_combined["Class"] = y_train

# Separate labeled and unlabeled data
labeled_data = train_combined[train_combined["Class"].notna()]
unlabeled_data = train_combined[train_combined["Class"].isna()]

X_labeled = labeled_data.drop(columns=["Class"])
y_labeled = labeled_data["Class"]
X_unlabeled = unlabeled_data.drop(columns=["Class"])

# Base estimator for SelfTrainingClassifier
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Self-training classifier
# verbose=True to see the progress of self-training
self_training_model = SelfTrainingClassifier(base_classifier, threshold=0.7, criterion="threshold", verbose=True)

# Fit the model on labeled and unlabeled data
# For SelfTrainingClassifier, X should contain both labeled and unlabeled samples,
# and y should contain labels for labeled samples and -1 for unlabeled samples.

X_train_full = pd.concat([X_labeled, X_unlabeled], ignore_index=True)
y_train_full = pd.concat([y_labeled, pd.Series([-1] * len(X_unlabeled))], ignore_index=True)

self_training_model.fit(X_train_full, y_train_full)

# Save the trained model
joblib.dump(self_training_model,
'/tmp/self_training_model.pkl'
)

print('Semi-supervised model training complete. Model saved as self_training_model.pkl')

End of iteration 1, added 43 new labels.
End of iteration 2, added 33 new labels.
End of iteration 3, added 34 new labels.
End of iteration 4, added 15 new labels.
End of iteration 5, added 6 new labels.
End of iteration 6, added 3 new labels.
End of iteration 7, added 7 new labels.
End of iteration 8, added 4 new labels.
End of iteration 9, added 1 new labels.
Semi-supervised model training complete. Model saved as self_training_model.pkl


## **F. Optimize Model**

In [15]:
# Combine X_train and y_train for easier splitting into labeled/unlabeled
train_combined = X_train_imputed_df.copy()
train_combined["Class"] = y_train

# Separate labeled and unlabeled data
labeled_data = train_combined[train_combined["Class"].notna()]
unlabeled_data = train_combined[train_combined["Class"].isna()]

X_labeled = labeled_data.drop(columns=["Class"])
y_labeled = labeled_data["Class"]
X_unlabeled = unlabeled_data.drop(columns=["Class"])

# Feature Selection using RandomForestClassifier importance
print("Performing feature selection...")
feature_selector = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
feature_selector.fit(X_labeled, y_labeled)

# Select features based on importance (e.g., features with importance greater than the mean)
model_selector = SelectFromModel(feature_selector, prefit=True, threshold="mean")

X_labeled_selected = model_selector.transform(X_labeled)
X_unlabeled_selected = model_selector.transform(X_unlabeled)
X_test_selected = model_selector.transform(X_test_imputed_df)

print(f"Number of features after selection: {X_labeled_selected.shape[1]}")

# Base estimator for SelfTrainingClassifier
base_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)

# Define a more extensive parameter grid for RandomForestClassifier
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use StratifiedKFold for cross-validation to maintain class distribution
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_rf = GridSearchCV(
    base_classifier,
    param_grid_rf,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

print("Starting GridSearchCV for RandomForestClassifier...")
grid_search_rf.fit(X_labeled_selected, y_labeled)

print("Best parameters for RandomForestClassifier:", grid_search_rf.best_params_)
print("Best accuracy for RandomForestClassifier:", grid_search_rf.best_score_)

# Now, train the SelfTrainingClassifier with the best base estimator and selected features
best_rf_estimator = grid_search_rf.best_estimator_

final_self_training_model = SelfTrainingClassifier(best_rf_estimator, threshold=0.7, criterion="threshold", verbose=True)

# Prepare data for final self-training with selected features
X_train_full_selected = pd.concat([pd.DataFrame(X_labeled_selected), pd.DataFrame(X_unlabeled_selected)], ignore_index=True)
y_train_full = pd.concat([y_labeled, pd.Series([-1] * len(X_unlabeled))], ignore_index=True)

# Impute missing values in the combined data before training
imputer_final = SimpleImputer(strategy='mean')
X_train_full_selected_imputed = imputer_final.fit_transform(X_train_full_selected)


print("Starting final SelfTrainingClassifier fit...")
final_self_training_model.fit(X_train_full_selected_imputed, y_train_full)

# Save the final optimized model and the feature selector
joblib.dump(final_self_training_model,
'/tmp/optimized_self_training_model_v2.pkl'
)
joblib.dump(model_selector,
'/tmp/feature_selector.pkl'
)

print(
'Optimized semi-supervised model training complete (v2). Model saved as optimized_self_training_model_v2.pkl and feature_selector.pkl'
)

Performing feature selection...




Number of features after selection: 1372
Starting GridSearchCV for RandomForestClassifier...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters for RandomForestClassifier: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy for RandomForestClassifier: 0.9866666666666667
Starting final SelfTrainingClassifier fit...
End of iteration 1, added 130 new labels.
End of iteration 2, added 44 new labels.
End of iteration 3, added 15 new labels.
End of iteration 4, added 8 new labels.
End of iteration 5, added 4 new labels.
End of iteration 6, added 4 new labels.
End of iteration 7, added 3 new labels.
End of iteration 8, added 4 new labels.
Optimized semi-supervised model training complete (v2). Model saved as optimized_self_training_model_v2.pkl and feature_selector.pkl


## **G. Generate Submission**

In [18]:
# Load the optimized trained model and the feature selector
final_self_training_model = joblib.load('/tmp/optimized_self_training_model_v2.pkl')
model_selector = joblib.load('/tmp/feature_selector.pkl')

# Apply the same feature selection to the test data
X_test_selected = model_selector.transform(X_test_imputed_df)

# Impute missing values in the selected test data if any
imputer_test = SimpleImputer(strategy='mean')
X_test_selected_imputed = imputer_test.fit_transform(X_test_selected)

# Make predictions on the test data
test_predictions = final_self_training_model.predict(X_test_selected_imputed)

# Create a submission DataFrame
submission_df = pd.DataFrame({'Id': test_df['Id'], 'Class': test_predictions})

# Ensure the predicted classes are integers
submission_df['Class'] = submission_df['Class'].astype(int)

# Save the submission file
submission_df.to_csv('submission.csv', index=False)

print("Submission file created successfully with optimized model!")

Submission file created successfully with optimized model!


