In [1]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# ---------------------------------------------------------------------------
# Placement prediction pipeline
#
#   1. Load and clean the training sheet, encode the target and college name.
#   2. Load and clean the test sheet with the same cleaning helper.
#   3. Fit AdaBoost on the full training set and predict the test set.
#   4. Decode the predictions and export a human-readable report.
# ---------------------------------------------------------------------------
FEATURE_COLUMNS = ['College Name', 'CGPA', 'Speaking Skills', 'ML Knowledge']
TARGET_COLUMN = 'Placement Status'
SOURCE_COLUMNS = [
    'How did you come to know about this event?',
    'Specify in "Others" (how did you come to know about this event)',
]
MISSING_COLLEGE = 'Unknown'   # label used for missing college names
UNSEEN_COLLEGE_CODE = -1      # code for test colleges absent from training


def _prepare_attendees(raw, unused_columns):
    """Return a cleaned copy of an attendee sheet.

    Drops `unused_columns`, keeps the first row per 'Email ID', and merges the
    two "how did you come to know about this event" columns into a single
    'Source of Event Information' column (missing parts become empty strings,
    joined by one space). The input frame is not modified.
    """
    deduplicated = (
        raw
        .drop(columns=unused_columns)
        .drop_duplicates(subset='Email ID', keep='first')
    )
    known_from, known_from_other = SOURCE_COLUMNS
    combined_source = (
        deduplicated[known_from].fillna('') + ' ' + deduplicated[known_from_other].fillna('')
    )
    # `assign` returns a fresh frame, so later column writes do not trigger
    # SettingWithCopyWarning on the de-duplicated slice.
    return (
        deduplicated
        .drop(columns=SOURCE_COLUMNS)
        .assign(**{'Source of Event Information': combined_source})
    )


# Step 1: Read and preprocess training data
train_data = _prepare_attendees(
    pd.read_excel('01 Train Data.xlsx'),
    unused_columns=['Price Tier', 'Group'],
)

# Encode categorical variables. Each column gets its own encoder so that both
# can be used later: `placement_encoder` to decode predictions, `encoder`
# (college names) to encode the test set.
placement_encoder = LabelEncoder()
encoder = LabelEncoder()
train_data = train_data.assign(**{
    # Rows without a recorded status are treated as 'Not Placed'.
    TARGET_COLUMN: placement_encoder.fit_transform(train_data[TARGET_COLUMN].fillna('Not Placed')),
    'College Name': encoder.fit_transform(train_data['College Name'].fillna(MISSING_COLLEGE)),
})

# Prepare features and target variable.
# NOTE(review): missing values in the numeric features are not imputed here.
X_train = train_data[FEATURE_COLUMNS]
y_train = train_data[TARGET_COLUMN]

# Step 2: Read and preprocess test data
test_data = _prepare_attendees(
    pd.read_excel('02 Test Data.xlsx'),
    unused_columns=['Price Tier', 'Group', 'Quantity', 'Attendee #', 'Order Type', 'Currency',
                    'Total Paid', 'Fees Paid', 'Eventbrite Fees', 'Eventbrite Payment Processing',
                    'Attendee Status'],
)

# Encode college names with the vocabulary learned on the training set.
# LabelEncoder.transform raises on labels it has never seen, so use a lookup:
# unseen colleges fall back to the 'Unknown' code when training had one,
# otherwise to a sentinel code.
college_codes = {name: code for code, name in enumerate(encoder.classes_)}
fallback_college_code = college_codes.get(MISSING_COLLEGE, UNSEEN_COLLEGE_CODE)
mapped_college_codes = test_data['College Name'].fillna(MISSING_COLLEGE).map(college_codes)
unseen_college_count = int(mapped_college_codes.isna().sum())
if unseen_college_count:
    print(f"{unseen_college_count} test rows have a college not seen in training; "
          f"encoded as {fallback_college_code}.")

# Prepare test features. The codes live only in X_test; test_data keeps the
# readable college names for the exported report.
X_test = test_data[FEATURE_COLUMNS].assign(**{
    'College Name': mapped_college_codes.fillna(fallback_college_code).astype(int),
})

# Step 3: Train the AdaBoost model on the full training set
model = AdaBoostClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# Step 4: Predict on the test data
y_test_pred = model.predict(X_test)

# Step 5: Decode predicted placement status using the labels the encoder
# actually learned, instead of a hard-coded {0, 1} mapping.
pred_placement_status_decode_mapping = dict(enumerate(placement_encoder.classes_))
test_data['Predicted Placement Status'] = (
    pd.Series(y_test_pred, index=test_data.index).map(pred_placement_status_decode_mapping)
)

# Step 6: Save final predicted data
final_predicted_data = test_data[['First Name', 'Email ID', 'Ticket Type', 'College Name', 'Designation',
                                  'Year of Graduation', 'CGPA', 'Speaking Skills', 'ML Knowledge',
                                  'Predicted Placement Status', 'Source of Event Information']]

# Save to Excel and CSV
final_predicted_data.to_excel('Final Placement Predicted Data.xlsx', index=False)
final_predicted_data.to_csv('Final Placement Predicted Data.csv', index=False)

# Optional: estimate accuracy on a held-out validation split.
# Split the training data into training and validation sets.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# Fit a separate estimator with the same hyperparameters as `model`.
# Refitting `model` itself would replace the estimator trained on the full
# training set (the one that produced the exported predictions) with one
# trained on only 80% of the data.
validation_model = AdaBoostClassifier(**model.get_params(deep=False))
validation_model.fit(X_train_split, y_train_split)

# Predict on the validation set
y_val_pred = validation_model.predict(X_val_split)

# Compute accuracy
accuracy = accuracy_score(y_val_split, y_val_pred)
print(f"Validation Accuracy: {accuracy:.2f}")


Validation Accuracy: 0.73
