In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

In [2]:
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

print(training_data.head())

   ID             Candidate        Constituency ∇ Party  Criminal Case  \
0   0            M.K. Mohan            ANNA NAGAR   DMK              4   
1   1  Khatik Ramesh Prasad           KARERA (SC)   BJP              0   
2   2      Dr. Mantar Gowda              MADIKERI   INC              0   
3   3          Kundan Kumar             BEGUSARAI   BJP              0   
4   4       Swapan Majumder  BANGAON DAKSHIN (SC)   BJP              2   

  Total Assets Liabilities           state      Education  
0   211 Crore+    2 Crore+      TAMIL NADU       8th Pass  
1     1 Crore+           0  MADHYA PRADESH      12th Pass  
2     7 Crore+     22 Lac+       KARNATAKA  Post Graduate  
3     9 Crore+     24 Lac+           BIHAR  Post Graduate  
4     2 Crore+     61 Lac+     WEST BENGAL       8th Pass  


In [3]:

def financial_values_to_integers(value):
    units = {'Hund+': 100, 'Thou+': 1000, 'Lac+': 100000, 'Crore+': 1000000}
    if isinstance(value, str):
        for unit, multiplier in units.items():
            if unit in value:
                return int(value.replace(unit, '')) * multiplier
    return int(value)

def preprocess_data(data):
    for column in ['Total Assets', 'Liabilities']:
        data[column] = data[column].apply(financial_values_to_integers)
    data['Candidate'] = data['Candidate'].str.split().str[-1]
    return data

processed_training_data = preprocess_data(training_data)
processed_test_data = preprocess_data(test_data)
print(training_data.head())

   ID Candidate        Constituency ∇ Party  Criminal Case  Total Assets  \
0   0     Mohan            ANNA NAGAR   DMK              4     211000000   
1   1    Prasad           KARERA (SC)   BJP              0       1000000   
2   2     Gowda              MADIKERI   INC              0       7000000   
3   3     Kumar             BEGUSARAI   BJP              0       9000000   
4   4  Majumder  BANGAON DAKSHIN (SC)   BJP              2       2000000   

   Liabilities           state      Education  
0      2000000      TAMIL NADU       8th Pass  
1            0  MADHYA PRADESH      12th Pass  
2      2200000       KARNATAKA  Post Graduate  
3      2400000           BIHAR  Post Graduate  
4      6100000     WEST BENGAL       8th Pass  


In [4]:
imputer = SimpleImputer(strategy='mean')
processed_training_data[['Total Assets', 'Liabilities']] = imputer.fit_transform(processed_training_data[['Total Assets', 'Liabilities']])
processed_test_data[['Total Assets', 'Liabilities']] = imputer.transform(processed_test_data[['Total Assets', 'Liabilities']])

label_encoder = LabelEncoder()

columns_to_encode = ['Party', 'state', 'Candidate']

for column in columns_to_encode:
    processed_training_data[column] = label_encoder.fit_transform(processed_training_data[column])
    processed_test_data[column] = label_encoder.fit_transform(processed_test_data[column])

processed_training_data.reset_index(drop=True, inplace=True)
processed_test_data.reset_index(drop=True, inplace=True)

print(processed_training_data['Education'].unique())
print(np.unique(processed_training_data['Party']))
print(np.unique(processed_training_data['state']))
print(np.unique(processed_training_data['Candidate']))
print(np.unique(processed_training_data['Total Assets']))

['8th Pass' '12th Pass' 'Post Graduate' 'Graduate Professional' 'Graduate'
 '10th Pass' 'Others' 'Doctorate' 'Literate' '5th Pass']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27]
[   0    1    2 ... 1213 1214 1215]
[0.000e+00 1.500e+04 1.800e+04 2.400e+04 3.000e+04 5.100e+04 7.200e+04
 7.300e+04 1.000e+05 2.000e+05 3.000e+05 4.000e+05 5.000e+05 6.000e+05
 8.000e+05 9.000e+05 1.000e+06 1.100e+06 1.200e+06 1.300e+06 1.400e+06
 1.500e+06 1.600e+06 1.700e+06 1.800e+06 1.900e+06 2.000e+06 2.100e+06
 2.200e+06 2.300e+06 2.400e+06 2.500e+06 2.600e+06 2.700e+06 2.800e+06
 2.900e+06 3.000e+06 3.100e+06 3.200e+06 3.300e+06 3.400e+06 3.500e+06
 3.600e+06 3.700e+06 3.800e+06 3.900e+06 4.000e+06 4.100e+06 4.200e+06
 4.300e+06 4.400e+06 4.500e+06 4.600e+06 4.700e+06 4.900e+06 5.000e+06
 5.100e+06 5.200e+06 5.300e+06 5.400e+06 5.500e+06 5.600e+06 5.700e+06
 5.800e+06 5.900e+06 6.000e+06 6.10

In [5]:
model_features = ['Criminal Case', 'Total Assets', 'Liabilities', 'Party', 'state', 'Candidate']

x_output = processed_test_data[model_features]
x = processed_training_data[model_features]
y = processed_training_data['Education']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

rf_classifier = RandomForestClassifier(random_state=0)
param_grid = {
    'bootstrap': [True],
    'max_depth': [20,30,40,50,60,70,80, 90, 100, 110],
    'min_samples_split': [8, 10, 12,15],
    'n_estimators': [20,40,50,60,80,100, 200, 300, 1000]
}
grid_search = GridSearchCV(estimator = rf_classifier, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


In [7]:
y_pred = best_model.predict(x_test)

print(classification_report(y_test, y_pred, zero_division=1))

                       precision    recall  f1-score   support

            10th Pass       0.50      0.03      0.06        63
            12th Pass       0.23      0.13      0.17        86
             5th Pass       1.00      0.00      0.00         1
             8th Pass       0.00      0.00      0.00        24
            Doctorate       1.00      0.00      0.00        14
             Graduate       0.31      0.54      0.39       136
Graduate Professional       0.28      0.17      0.22        86
             Literate       1.00      0.00      0.00         3
               Others       1.00      0.00      0.00         8
        Post Graduate       0.18      0.32      0.23        94

             accuracy                           0.26       515
            macro avg       0.55      0.12      0.11       515
         weighted avg       0.31      0.26      0.22       515



In [8]:
predictions = best_model.predict(x_output)
result = pd.DataFrame(predictions, columns=['Education'])
result.insert(0, 'ID', range(0, len(result)))
result.to_csv('output_final.csv', index=False)