In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# 2009 dataset, part 1: read the candidate-level CSV into df09_1.
df09_1 =pd.read_csv('/content/LS2009Candidate.csv')

In [None]:
# Preview the first three rows of df09_1.
df09_1.head(3)

Unnamed: 0,STATE,YEAR,CONSTITUENCY,NAME,GENDER,CATEGORY,AGE,PARTY,TOTAL_VOTES,WINNER
0,Andhra Pradesh,2009,Adilabad,RATHOD RAMESH,M,ST,43.0,TDP,372268.0,1.0
1,Andhra Pradesh,2009,Adilabad,KOTNAK RAMESH,M,ST,39.0,INC,257181.0,0.0
2,Andhra Pradesh,2009,Adilabad,MESRAM NAGO RAO,M,ST,59.0,PRAP,112930.0,0.0


In [None]:
# Remove the columns that are not used later, then rename the remaining ones
# to the upper-case names used for the 2019 frame as well.
df09_1 = (
    df09_1
    .drop(columns=['ST_CODE', 'Month', 'PC Type', 'PC Number'])
    .rename(columns={
        'State name': 'STATE',
        'Year': 'YEAR',
        'PC name': 'CONSTITUENCY',
        'Candidate Name': 'NAME',
        'Candidate Sex': 'GENDER',
        'Candidate Category': 'CATEGORY',
        'Candidate Age': 'AGE',
        'Party Abbreviation': 'PARTY',
        'Total Votes Polled': 'TOTAL_VOTES',
        'Position': 'WINNER',
    })
)



In [None]:
# Turn the WINNER column (renamed from 'Position') into a binary flag:
# rows whose value is exactly 1 keep it, every other value (including NaN,
# because NaN != 1 is True) becomes 0.
# A single boolean mask on the named column replaces the row-by-row .iat loop
# that addressed the column by the hard-coded position 9.
df09_1.loc[df09_1['WINNER'] != 1, 'WINNER'] = 0


In [None]:
# 2009 dataset, part 2: read the second 2009 CSV into df09_2.
df09_2 =pd.read_csv('/content/LokSabha2009.csv')

In [None]:
# Preview the first three rows of df09_2.
df09_2.head(3)

Unnamed: 0,NAME,CRIMINAL_CASES,ASSETS,LIABILITIES
0,SHRI BISHNU PADA RAY,1,1241034.0,0
1,SHRI P. R. GANESHAN,0,329563.0,0
2,SHRI PRADEEP KUMAR EKKA,0,410000.0,0


In [None]:
# Drop the columns that are not merged into the candidate frame and rename
# the remaining ones to the shared upper-case names.
# NOTE(review): 'Education' is discarded here although later cells clean and
# encode an EDUCATION column - confirm the 2009 values are meant to be dropped.
df09_2 = (
    df09_2
    .drop(columns=['Party', 'Education', 'Age', 'Constituency', 'Winner', 'Gender'])
    .rename(columns={
        'Candidate': 'NAME',
        'Criminal Cases': 'CRIMINAL_CASES',
        'Total Assets': 'ASSETS',
        'Liabilities': 'LIABILITIES',
    })
)

In [None]:

# Upper-case the names in df09_2 before they are used as the join key.
df09_2['NAME'] = df09_2['NAME'].str.upper()
# Left-join df09_2 onto the candidate rows by NAME, then keep only the first
# row for every NAME.
# NOTE(review): NAME is the only key, so different candidates who share a name
# are joined to the same df09_2 rows and all but the first are then dropped -
# confirm this is acceptable.
df1 = (
    df09_1
    .merge(df09_2, on='NAME', how='left')
    .drop_duplicates(subset=['NAME'], keep='first')
)

In [None]:
# Preview the first three rows of the merged 2009 frame.
df1.head(3)


Unnamed: 0,STATE,YEAR,CONSTITUENCY,NAME,GENDER,CATEGORY,AGE,PARTY,TOTAL_VOTES,WINNER,CRIMINAL_CASES,ASSETS,LIABILITIES
0,Andhra Pradesh,2009,Adilabad,RATHOD RAMESH,M,ST,43.0,TDP,372268.0,1.0,,,
1,Andhra Pradesh,2009,Adilabad,KOTNAK RAMESH,M,ST,39.0,INC,257181.0,0.0,0.0,2600000.0,0.0
4,Andhra Pradesh,2009,Adilabad,MESRAM NAGO RAO,M,ST,59.0,PRAP,112930.0,0.0,,,


In [None]:
# Read the 2019 CSV into df4 and work on a copy (df3), so df4 keeps the data
# exactly as it was read.
df4 =pd.read_csv("/content/kaggle_2019.csv")
df3 = df4.copy()
# Preview the first rows of the copy.
df3.head()

In [None]:
# Drop the vote-breakdown columns that are not carried into the combined data.
df3 = df3.drop(columns=['GENERAL\nVOTES', 'POSTAL\nVOTES',
                        'OVER TOTAL ELECTORS \nIN CONSTITUENCY',
                        'OVER TOTAL VOTES POLLED \nIN CONSTITUENCY',
                        'TOTAL ELECTORS'])
# Rename the two remaining headers that contain a line break. Rename entries
# for the columns dropped above would never match anything, so none are listed.
df3 = df3.rename(columns={'CRIMINAL\nCASES': 'CRIMINAL_CASES',
                          'TOTAL\nVOTES': 'TOTAL_VOTES'})
df3['YEAR'] = 2019
# reindex keeps only the listed columns, in this order. EDUCATION is listed so
# the 2019 education values are not silently discarded: later cells normalise
# EDUCATION values (e.g. 'Post Graduate\n') and use the column as a model
# feature. A listed column that is absent from the frame is created as all-NaN.
df3 = df3.reindex(columns=['STATE', 'YEAR', 'CONSTITUENCY', 'NAME', 'GENDER',
                           'CATEGORY', 'AGE', 'PARTY', 'TOTAL_VOTES', 'WINNER',
                           'CRIMINAL_CASES', 'EDUCATION', 'ASSETS', 'LIABILITIES'])


In [None]:
# Preview the first three rows of the reshaped 2019 frame.
df3.head(3)

Unnamed: 0,STATE,YEAR,CONSTITUENCY,NAME,GENDER,CATEGORY,AGE,PARTY,TOTAL_VOTES,WINNER,CRIMINAL_CASES,ASSETS,LIABILITIES
0,Telangana,2019,ADILABAD,SOYAM BAPU RAO,MALE,ST,52.0,BJP,377374,1,52,3099414,231450
1,Telangana,2019,ADILABAD,Godam Nagesh,MALE,ST,54.0,TRS,318814,0,0,18477888,847000
2,Telangana,2019,ADILABAD,RATHOD RAMESH,MALE,ST,52.0,INC,314238,0,3,36491000,15300000


In [None]:
def value_cleaner(x):
    """Extract the amount that follows 'Rs' in a money string, without commas.

    The text between the first 'Rs' marker and the next line break is taken,
    surrounding whitespace is stripped and the comma separators are removed.

    Args:
        x: Raw cell value, for example 'Rs 30,99,414' optionally followed by a
            line break and further text.

    Returns:
        The amount as a string of the remaining characters (for example
        '3099414'), or the integer 0 when ``x`` cannot be split as a string
        or contains no 'Rs' marker.
    """
    try:
        amount = x.split('Rs')[1].split('\n')[0].strip()
    except (AttributeError, IndexError, TypeError):
        # AttributeError / TypeError: x is not a str (NaN, number, None, bytes).
        # IndexError: there is no 'Rs' marker in the text (e.g. 'Nil').
        # Only these are caught so KeyboardInterrupt and real bugs propagate.
        return 0
    return amount.replace(',', '')

# Run value_cleaner over both monetary columns of the 2019 frame.
for money_col in ('ASSETS', 'LIABILITIES'):
    df3[money_col] = df3[money_col].apply(value_cleaner)


In [None]:
# Stack the per-election frames row-wise into one dataset. ignore_index is not
# set, so every frame keeps its own row labels in the result.
# NOTE(review): df2 is not created in any cell shown here - presumably another
# election year prepared elsewhere; confirm it exists before running this cell.
dataset = pd.concat([df1,df2,df3])

In [None]:
# Summarise the columns, non-null counts and dtypes of the combined frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17716 entries, 0 to 17715
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   STATE           17716 non-null  int64
 1   YEAR            17716 non-null  int64
 2   CONSTITUENCY    17716 non-null  int64
 3   NAME            17716 non-null  int64
 4   GENDER          17716 non-null  int64
 5   CATEGORY        17716 non-null  int64
 6   AGE             17716 non-null  int64
 7   PARTY           17716 non-null  int64
 8   WINNER          17716 non-null  int64
 9   CRIMINAL_CASES  17716 non-null  int64
 10  EDUCATION       17716 non-null  int64
 11  ASSETS          17716 non-null  int64
 12  LIABILITIES     17716 non-null  int64
dtypes: int64(13)
memory usage: 1.8 MB


In [None]:
# Normalise spelling variants in the categorical columns.
# Each result is assigned back to its column: calling replace(..., inplace=True)
# on a Series obtained via attribute access is chained assignment, which pandas
# deprecates and which leaves `dataset` unchanged under Copy-on-Write.
dataset['EDUCATION'] = dataset['EDUCATION'].replace(
    {'Post Graduate\n': 'Post Graduate', 'Not Given': 'Not Available'})
print(dataset['EDUCATION'].value_counts())
dataset['GENDER'] = dataset['GENDER'].replace({'MALE': 'M', 'FEMALE': 'F'})
dataset['CATEGORY'] = dataset['CATEGORY'].replace({'GENERAL': 'GEN', 'Gen': 'GEN'})
print(dataset['GENDER'].value_counts())
print(dataset['CATEGORY'].value_counts())


Graduate                 2588
Post Graduate            2276
10th Pass                1857
12th Pass                1832
Graduate Professional    1417
8th Pass                  990
Not Available             590
5th Pass                  555
Literate                  544
Doctorate                 322
Others                    299
Illiterate                133
Name: EDUCATION, dtype: int64
M    16248
F     1462
O        6
Name: GENDER, dtype: int64
GEN    12094
SC      4200
ST      1422
Name: CATEGORY, dtype: int64


In [None]:
# Fill missing education, criminal cases, assets and liabilities with the
# defaults chosen by the original author (described there as modal values).
# The numeric columns receive the string '0'; they are converted to numbers in
# a later cell.
# Each column is reassigned instead of using fillna(..., inplace=True) on a
# column selection, which is chained assignment and does not update `dataset`
# under pandas Copy-on-Write.
fill_values = {
    'EDUCATION': 'Graduate',
    'CRIMINAL_CASES': '0',
    'ASSETS': '0',
    'LIABILITIES': '0',
}
for column, fill_value in fill_values.items():
    dataset[column] = dataset[column].fillna(fill_value)

In [None]:
# Placeholder strings that are rewritten to '0', listed per column.
# NOTE(review): ASSETS lists 'Nil' while LIABILITIES lists 'NIL' - confirm the
# different spelling is intentional.
placeholders = {
    'ASSETS': ['Nil', '`', 'Not Available'],
    'LIABILITIES': ['NIL', '`', 'Not Available'],
    'CRIMINAL_CASES': ['Not Available', 'NaN', ''],
}
for column, tokens in placeholders.items():
    dataset[column] = dataset[column].replace(tokens, '0')

# Monetary columns: go through str first, then parse as float.
for column in ('ASSETS', 'LIABILITIES'):
    dataset[column] = dataset[column].astype(str).astype(float)

# CRIMINAL_CASES: force integer values, then round-trip through str to int.
criminal_cases = np.array(dataset['CRIMINAL_CASES'], dtype=int)
dataset['CRIMINAL_CASES'] = criminal_cases
dataset['CRIMINAL_CASES'] = dataset['CRIMINAL_CASES'].astype(str).astype(int)

# YEAR is stored as text.
dataset['YEAR'] = dataset['YEAR'].astype(str)

In [None]:
# Count the missing values that remain in each column.
dataset.isna().sum()

STATE               0
YEAR                0
CONSTITUENCY        0
NAME                0
GENDER            289
CATEGORY          289
AGE               289
PARTY              43
TOTAL_VOTES        43
POSITION            0
CRIMINAL_CASES      0
EDUCATION           0
ASSETS              0
LIABILITIES         0
dtype: int64

In [None]:
# Discard every row that still has a missing value in any column.
dataset = dataset.dropna()

In [None]:
# Display the cleaned frame as the cell output.
dataset

Unnamed: 0,STATE,YEAR,CONSTITUENCY,NAME,GENDER,CATEGORY,AGE,PARTY,TOTAL_VOTES,POSITION,CRIMINAL_CASES,EDUCATION,ASSETS,LIABILITIES
0,Andhra Pradesh,2009,Adilabad,RATHOD RAMESH,M,ST,43,TDP,372268,1,0,Graduate,0,0
1,Andhra Pradesh,2009,Adilabad,KOTNAK RAMESH,M,ST,39,INC,257181,0,0,12th Pass,2600000,0
2,Andhra Pradesh,2009,Adilabad,MESRAM NAGO RAO,M,ST,59,PRAP,112930,0,0,Graduate,0,0
3,Andhra Pradesh,2009,Adilabad,ADE TUKARAM,M,ST,55,BJP,57931,0,0,10th Pass,865000,370000
4,Andhra Pradesh,2009,Adilabad,RATHOD SADASHIV NAIK,M,ST,50,BSP,16471,0,0,Graduate,1150000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2257,Maharashtra,2019,YAVATMAL-WASHIM,Vaishali Sudhakar Yede,F,GEN,28,PHJSP,20620,0,0,10th Pass,1168500,9000
2258,Maharashtra,2019,YAVATMAL-WASHIM,Anil Jayram Rathod,M,GEN,43,IND,14686,0,0,Post Graduate,4890000,1020000
2259,Telangana,2019,ZAHIRABAD,B.B.PATIL,M,GEN,63,TRS,434244,1,18,Graduate,1287851556,11535000
2260,Telangana,2019,ZAHIRABAD,MADAN MOHAN RAO,M,GEN,49,INC,428015,0,0,Post Graduate,903663001,0


In [None]:
# Save the cleaned dataset to an Excel workbook.
# index=False: the frame was built with pd.concat without ignore_index, so its
# row labels are just the source frames' own labels. Writing them would add an
# extra 'Unnamed: 0' column when the workbook is loaded again with
# pd.read_excel, and that column would then be treated as a model feature.
dataset.to_excel("LS_3yr_data.xlsx", index=False)
# Colab-only helper that triggers a browser download of the workbook.
from google.colab import files
files.download("LS_3yr_data.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Building the Prediction Models


In [None]:
# Load the combined dataset from the Excel workbook (presumably the file
# exported by the earlier to_excel cell - confirm the path matches).
dataset = pd.read_excel('/content/LS_3yr_data.xlsx')

In [None]:
# For every row, the sum of TOTAL_VOTES over all rows that share the row's
# CONSTITUENCY value. transform('sum') returns that total aligned to the
# original rows, which avoids summing every column of the frame and looking
# the value up again row by row.
# NOTE(review): rows are grouped by CONSTITUENCY alone, so rows from different
# years with the same constituency label are pooled, and the column is named
# TOTAL_ELECTORS although it adds up votes - confirm this is intended.
votes_col = dataset.groupby('CONSTITUENCY')['TOTAL_VOTES'].transform('sum')
print(len(votes_col))
dataset['TOTAL_ELECTORS'] = votes_col
  

17716


In [None]:
# Display the frame with the new TOTAL_ELECTORS column as the cell output.
dataset

Unnamed: 0,STATE,YEAR,CONSTITUENCY,NAME,GENDER,CATEGORY,AGE,PARTY,TOTAL_VOTES,WINNER,CRIMINAL_CASES,EDUCATION,ASSETS,LIABILITIES
0,Andhra Pradesh,2009,Adilabad,RATHOD RAMESH,M,ST,43,TDP,372268,1,0,Graduate,0,0
1,Andhra Pradesh,2009,Adilabad,KOTNAK RAMESH,M,ST,39,INC,257181,0,0,12th Pass,2600000,0
2,Andhra Pradesh,2009,Adilabad,MESRAM NAGO RAO,M,ST,59,PRAP,112930,0,0,Graduate,0,0
3,Andhra Pradesh,2009,Adilabad,ADE TUKARAM,M,ST,55,BJP,57931,0,0,10th Pass,865000,370000
4,Andhra Pradesh,2009,Adilabad,RATHOD SADASHIV NAIK,M,ST,50,BSP,16471,0,0,Graduate,1150000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17711,Maharashtra,2019,YAVATMAL-WASHIM,Vaishali Sudhakar Yede,F,GEN,28,PHJSP,20620,0,0,10th Pass,1168500,9000
17712,Maharashtra,2019,YAVATMAL-WASHIM,Anil Jayram Rathod,M,GEN,43,IND,14686,0,0,Post Graduate,4890000,1020000
17713,Telangana,2019,ZAHIRABAD,B.B.PATIL,M,GEN,63,TRS,434244,1,18,Graduate,1287851556,11535000
17714,Telangana,2019,ZAHIRABAD,MADAN MOHAN RAO,M,GEN,49,INC,428015,0,0,Post Graduate,903663001,0


In [None]:
# Take three independent copies of the current frame; the cells below
# label-encode `dataset` itself, while these copies keep the current values.
dataset1, dataset2, dataset3 = (dataset.copy() for _ in range(3))

In [None]:
# Summarise the columns, non-null counts and dtypes before encoding.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17716 entries, 0 to 17715
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   STATE           17716 non-null  object
 1   YEAR            17716 non-null  object
 2   CONSTITUENCY    17716 non-null  object
 3   NAME            17716 non-null  object
 4   GENDER          17716 non-null  object
 5   CATEGORY        17716 non-null  object
 6   AGE             17716 non-null  int64 
 7   PARTY           17716 non-null  object
 8   TOTAL_VOTES     17716 non-null  int64 
 9   WINNER          17716 non-null  int64 
 10  CRIMINAL_CASES  17716 non-null  int64 
 11  EDUCATION       17716 non-null  object
 12  ASSETS          17716 non-null  int64 
 13  LIABILITIES     17716 non-null  int64 
 14  TOTAL_ELECTORS  17716 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 2.0+ MB


In [None]:
# Label-encode the categorical columns of `dataset` in place.
# Every column gets its own encoder object, kept under its own name so the
# integer codes can be mapped back to labels later. The YEAR encoder is stored
# as lblEncoder_year; it used to be stored in lblEncoder_cons and was then
# overwritten by the CONSTITUENCY encoder.

lblEncoder_state = LabelEncoder()
dataset['STATE'] = lblEncoder_state.fit_transform(dataset['STATE'])

lblEncoder_year = LabelEncoder()
dataset['YEAR'] = lblEncoder_year.fit_transform(dataset['YEAR'])

lblEncoder_cons = LabelEncoder()
dataset['CONSTITUENCY'] = lblEncoder_cons.fit_transform(dataset['CONSTITUENCY'])

lblEncoder_name = LabelEncoder()
dataset['NAME'] = lblEncoder_name.fit_transform(dataset['NAME'])

lblEncoder_party = LabelEncoder()
dataset['PARTY'] = lblEncoder_party.fit_transform(dataset['PARTY'])

lblEncoder_gender = LabelEncoder()
dataset['GENDER'] = lblEncoder_gender.fit_transform(dataset['GENDER'])

lblEncoder_category = LabelEncoder()
dataset['CATEGORY'] = lblEncoder_category.fit_transform(dataset['CATEGORY'])

lblEncoder_edu = LabelEncoder()
dataset['EDUCATION'] = lblEncoder_edu.fit_transform(dataset['EDUCATION'])



## First Model: Using KNeighborsClassifier

In [None]:
# The label is the WINNER flag; every other column of `dataset` is a feature.
# NOTE(review): the features include TOTAL_VOTES and the encoded NAME - confirm
# these are meant to be available to the model.
X = dataset.drop(columns=["WINNER"])
y = dataset["WINNER"]
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Fit a KNN classifier with default parameters and report test accuracy.
knn = KNeighborsClassifier().fit(X_train, y_train)
pred = knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
print("Testing Accuracy is: ", accuracy*100, "%")

Testing Accuracy is:  91.19638826185101 %


## Second Model: Using MinMaxScaler

In [None]:
# Columns that are rescaled into the 0-1 range.
features = [
    'STATE', 'CONSTITUENCY', 'NAME', 'PARTY', 'GENDER', 'CRIMINAL_CASES',
    'AGE', 'CATEGORY', 'EDUCATION', 'ASSETS', 'LIABILITIES', 'TOTAL_VOTES',
    'TOTAL_ELECTORS',
]
scaler = MinMaxScaler(feature_range=(0, 1))
# The scaled values are computed from `dataset` (label-encoded in an earlier
# cell) and written into the matching columns of `dataset2`.
# NOTE(review): YEAR is not in `features`, so it stays unscaled in dataset2,
# and the scaler is fitted on all rows before the train/test split - confirm
# both are intended.
scaled_values = scaler.fit_transform(dataset[features])
dataset2[features] = scaled_values
# Label and feature matrix.
X = dataset2.drop(columns=["WINNER"])
y = dataset2["WINNER"]
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Fit a default KNN classifier and report test accuracy.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
print("Testing Accuracy is: ", accuracy*100, "%")

Testing Accuracy is:  93.62302483069978 %


## Third Model: Ordinal Education Encoding and Grouping of Rare Parties

In [None]:
# Ordinal code for each known education level. Any value that is not listed
# here is coded as 5; the codes 5 and 6 are not used by any listed level.
education_rank = {
    "Illiterate": 0,
    "Literate": 1,
    "5th Pass": 2,
    "8th Pass": 3,
    "10th Pass": 4,
    "12th Pass": 7,
    "Graduate": 8,
    "Post Graduate": 9,
    "Graduate Professional": 10,
    "Doctorate": 11,
}
# Look up the code of every row's education value, in row order.
encoded_edu = [education_rank.get(level, 5) for level in dataset1['EDUCATION']]
dataset1['EDUCATION'] = encoded_edu
dataset1['EDUCATION']

0        8
1        7
2        8
3        4
4        8
        ..
17711    4
17712    9
17713    8
17714    9
17715    7
Name: EDUCATION, Length: 17716, dtype: int64

In [None]:
# Number of rows per party, and the mean of those per-party counts.
temp = dataset1.value_counts('PARTY')
avg = temp.mean()
print(avg)

# Parties whose row count is below the mean.
less_freq = temp[temp < avg].index.tolist()

# Relabel all rows of those parties as "Other", then show the new counts.
rare_party_rows = dataset1["PARTY"].isin(less_freq)
dataset1.loc[rare_party_rows, "PARTY"] = "Other"
dataset1['PARTY'].value_counts()


25.63820549927641


IND      6946
Other    2449
INC      1309
BJP      1268
BSP      1141
         ... 
RSP        27
AAP        27
JaSPa      27
WPOI       26
LD         26
Name: PARTY, Length: 64, dtype: int64

In [None]:

# Label-encode the categorical columns of `dataset1` in place.
# Every column gets its own freshly created encoder. This fixes two defects in
# the previous version of this cell: a statement that was not valid Python
# ("LabelEncoder() LabelEncoder()"), and the STATE and EDUCATION encoders
# being bound to one shared object, so fitting EDUCATION overwrote the STATE
# mapping.

lblEncoder_category = LabelEncoder()
dataset1['CATEGORY'] = lblEncoder_category.fit_transform(dataset1['CATEGORY'])

lblEncoder_state = LabelEncoder()
dataset1['STATE'] = lblEncoder_state.fit_transform(dataset1['STATE'])

lblEncoder_cons = LabelEncoder()
dataset1['CONSTITUENCY'] = lblEncoder_cons.fit_transform(dataset1['CONSTITUENCY'])

lblEncoder_name = LabelEncoder()
dataset1['NAME'] = lblEncoder_name.fit_transform(dataset1['NAME'])

lblEncoder_party = LabelEncoder()
dataset1['PARTY'] = lblEncoder_party.fit_transform(dataset1['PARTY'])

lblEncoder_gender = LabelEncoder()
dataset1['GENDER'] = lblEncoder_gender.fit_transform(dataset1['GENDER'])

# EDUCATION already holds the ordinal codes assigned in an earlier cell; the
# encoder re-numbers them as consecutive integers in the same order.
lblEncoder_edu = LabelEncoder()
dataset1['EDUCATION'] = lblEncoder_edu.fit_transform(dataset1['EDUCATION'])

In [None]:
# Columns that are rescaled into the 0-1 range.
features = [
    'STATE', 'CONSTITUENCY', 'NAME', 'PARTY', 'GENDER', 'CRIMINAL_CASES',
    'AGE', 'CATEGORY', 'EDUCATION', 'ASSETS', 'LIABILITIES', 'TOTAL_VOTES',
    'TOTAL_ELECTORS',
]
scaler = MinMaxScaler(feature_range=(0, 1))
# NOTE(review): YEAR is not in `features`, so it stays unscaled, and the scaler
# is fitted on all rows before the train/test split - confirm both are intended.
scaled_values = scaler.fit_transform(dataset1[features])
dataset1[features] = scaled_values
# Label and feature matrix.
X = dataset1.drop(columns=["WINNER"])
y = dataset1["WINNER"]
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Fit a default KNN classifier and report test accuracy.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
print("Testing Accuracy is: ", accuracy*100, "%")

Testing Accuracy is:  93.98984198645599 %


## Fourth Model: Adding New Features

In [None]:
# Preparing feature values
# Aggregate statistics per state, per party and per (party, constituency) pair
# are computed with groupby and kept in lookup dicts; each new column is then
# filled by mapping every row's key through the matching lookup.
# NOTE(review): cons_per_state holds the number of dataset rows per state,
# not the number of distinct constituencies - confirm which one is wanted.
# NOTE(review): the WINNER-based counts are computed over all rows, before any
# train/test split - confirm this is acceptable for evaluation.

# per state: row count and sum of TOTAL_ELECTORS
state_groups = dataset.groupby('STATE')
cons_per_state = state_groups.size().to_dict()
voters_per_state = state_groups['TOTAL_ELECTORS'].sum().to_dict()

# per party: winning rows, summed criminal cases, summed education codes
is_winner = (dataset['WINNER'] == 1.0).astype(int)
party_groups = dataset.groupby('PARTY')
party_winningSeats = is_winner.groupby(dataset['PARTY']).sum().to_dict()
party_criminal = party_groups['CRIMINAL_CASES'].sum().to_dict()
party_education = party_groups['EDUCATION'].sum().to_dict()


def _by_cons_party(per_pair):
    """Re-key a (PARTY, CONSTITUENCY)-indexed Series as 'constituency_party'."""
    return {
        str(cons) + '_' + str(party): value
        for (party, cons), value in per_pair.items()
    }


# per (party, constituency): same statistics, keyed by 'constituency_party'
pair_groups = dataset.groupby(['PARTY', 'CONSTITUENCY'])
party_totalCandidates_per_cons = _by_cons_party(pair_groups.size())
party_winningSeats_per_cons = _by_cons_party(
    is_winner.groupby([dataset['PARTY'], dataset['CONSTITUENCY']]).sum()
)
party_criminal_per_cons = _by_cons_party(pair_groups['CRIMINAL_CASES'].sum())
party_education_per_cons = _by_cons_party(pair_groups['EDUCATION'].sum())

# Applying feature values
# 'constituency_party' key of every row, matching the keys built above
row_key = dataset['CONSTITUENCY'].astype(str) + '_' + dataset['PARTY'].astype(str)

# append columns to dataset
dataset['total_cons_per_state'] = dataset['STATE'].map(cons_per_state)
dataset['total_voters_per_state'] = dataset['STATE'].map(voters_per_state)
dataset['winning_seats_by_party'] = dataset['PARTY'].map(party_winningSeats)
dataset['criminal_by_party'] = dataset['PARTY'].map(party_criminal)
dataset['education_by_party'] = dataset['PARTY'].map(party_education)
dataset['total_candidates_by_party_per_cons'] = row_key.map(party_totalCandidates_per_cons)
dataset['winning_seats_by_party_per_cons'] = row_key.map(party_winningSeats_per_cons)
dataset['criminal_by_party_per_cons'] = row_key.map(party_criminal_per_cons)
dataset['education_by_party_per_cons'] = row_key.map(party_education_per_cons)
dataset

Unnamed: 0,STATE,YEAR,CONSTITUENCY,NAME,GENDER,CATEGORY,AGE,PARTY,TOTAL_VOTES,WINNER,CRIMINAL_CASES,EDUCATION,ASSETS,LIABILITIES,TOTAL_ELECTORS,total_cons_per_state,total_voters_per_state,winning_seats_by_party,criminal_by_party,education_by_party,total_candidates_by_party_per_cons,winning_seats_by_party_per_cons,criminal_by_party_per_cons,education_by_party_per_cons
0,1,0,44,12316,1,2,43,621,372268,1,0,5,0,0,863581,1261,2232097337,25,25,439,1,1,0,5
1,1,0,44,7151,1,2,39,250,257181,0,0,1,2600000,0,863581,1261,2232097337,301,1110,7710,1,0,0,1
2,1,0,44,8627,1,2,59,452,112930,0,0,5,0,0,863581,1261,2232097337,0,5,218,1,0,0,5
3,1,0,44,268,1,2,55,128,57931,0,0,0,865000,370000,863581,1261,2232097337,691,1384,7212,1,0,0,0
4,1,0,44,12322,1,2,50,174,16471,0,0,5,1150000,0,863581,1261,2232097337,30,560,5735,1,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17711,20,2,1088,16283,0,0,28,439,20620,0,0,0,1168500,9000,1120290,1886,3025394518,0,0,0,1,0,0,0
17712,20,2,1088,1556,1,0,43,251,14686,0,0,11,4890000,1020000,1120290,1886,3025394518,16,1122,32255,2,0,0,22
17713,33,2,1090,1631,1,0,63,631,434244,1,18,5,1287851556,11535000,1001206,74,77400108,22,51,252,1,1,18,5
17714,33,2,1090,7834,1,0,49,250,428015,0,0,11,903663001,0,1001206,74,77400108,301,1110,7710,1,0,0,11


In [None]:
# Rescale every column of `dataset` (the WINNER label included) into 0-1.
# NOTE(review): the scaler is fitted on all rows before the train/test split -
# confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
features = dataset.columns
scaled_values = scaler.fit_transform(dataset[features])
dataset[features] = scaled_values
# Label and feature matrix.
X = dataset.drop(columns=["WINNER"])
y = dataset["WINNER"]
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Fit a default KNN classifier and report test accuracy.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
print("Testing Accuracy is: ", accuracy*100, "%")

Testing Accuracy is:  97.17832957110609 %


## Fifth Model: Keeping Only the Higher-Scoring Features

In [None]:
# Score every feature in X against y with the chi-squared test; k="all" keeps
# all of them, so this cell ranks the features rather than selecting any.
bestfeatures = SelectKBest(score_func=chi2, k="all")
fit = bestfeatures.fit(X, y)
# One row per feature: its name and its score, printed from highest to lowest.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.sort_values('Score', ascending=False))

                                 Specs        Score
20     winning_seats_by_party_per_cons  5298.122154
8                          TOTAL_VOTES  2834.760957
16              winning_seats_by_party  2438.211037
18                  education_by_party   319.128383
19  total_candidates_by_party_per_cons   159.789413
1                                 YEAR   139.068185
22         education_by_party_per_cons   103.018210
17                   criminal_by_party    51.446672
6                                  AGE    45.173023
12                         LIABILITIES    27.935248
7                                PARTY    20.571177
10                           EDUCATION    15.606309
9                       CRIMINAL_CASES     4.400821
13                      TOTAL_ELECTORS     4.320566
14                total_cons_per_state     4.027907
11                              ASSETS     3.859139
15              total_voters_per_state     3.302984
5                             CATEGORY     3.237467
4           

In [None]:
# Remove ten feature columns from X before the final model (presumably the
# ones judged least useful from the chi-squared ranking - confirm).
dropped_features = [
    "CRIMINAL_CASES", "TOTAL_ELECTORS", "total_cons_per_state", "ASSETS",
    "total_voters_per_state", "CATEGORY", "GENDER", "NAME", "CONSTITUENCY",
    "criminal_by_party_per_cons",
]
X = X.drop(columns=dropped_features)

In [None]:
# The label comes from `dataset`; X is the reduced feature matrix left by the
# previous cell.
y = dataset["WINNER"]
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Fit a default KNN classifier and report test accuracy.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
print("Testing Accuracy is: ", accuracy*100, "%")

Testing Accuracy is:  98.9841986455982 %
