# Cleaning Trial

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
constituency = '229'
folder = '/home/hennes/Internship/constituencies_edit/'
df = pd.read_csv(f'{folder}AC{constituency}.csv')

In [219]:
# make F nan

repl_dict = {'F': np.NaN} 

df = df.replace(repl_dict, regex=True)

In [220]:
# delete columns that have only have a few values in them. They are most likely useless.
df.dropna(thresh=len(df) - (len(df)/2), axis=1, inplace=True)
# delete rows that have more than 5 missing values
df.dropna(thresh = (len(df.columns)/1.4), axis = 0, inplace = True)
df.reset_index(drop=True, inplace=True)

Rectifying systematic errors

In [221]:
# transform systematic errors
repl_dict = {'\$':'5',
             'S':'5',
            '\(4\)':'(A)',
            '4\)':'(A)',
            '(\(A\))|(A\))|(\(A)|A':'A',
            '(\.0)$':'',
            'v':'0',
            '_':'',
            '\]':'',
            '\[':'',
            '\|':'',
            '\.':'',
            '[\(\)]':'',
            ' ': '',
            '(?!A)\D':'',
            '^\s*$':np.NaN} 

df = df.replace(repl_dict, regex=True)

In [222]:
# replace values with 4 or more digits with NAN

repl_dict = {'\d{4,}': np.NaN} 

df = df.replace(repl_dict, regex=True)
df = df.replace(r'\s+( +\.)|#',np.nan,regex=True).replace('',np.nan)

In [223]:
# delete rows that have more than 5 missing values
df.dropna(thresh = (len(df.columns)-3), axis = 0, inplace = True)
df.reset_index(drop=True, inplace=True)

Naming columns

In [224]:
# The two columns with the highest numbers should be total valid votes and total votes.
# Total valid votes is to the left of total votes.

# first need to convert columns to int. Should only do that with the non-serial number columns.
# This mask selects all columns that do not have 'A' in them.
mask = df[[e for e in df.columns]].apply(lambda x:
                                         x.astype(str).str.contains(r'A', regex=True)).any(axis='index')

# The first is all columns except serial number, second on is only serial number
serial = [df.iloc[:,2].name]
not_serial = df.loc[:,df.columns != serial[0]].columns.tolist()

# then convert all remaining characters to numeric or nan
for col in not_serial:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].astype(float)

# name column with highest median value 'total votes'    
df.rename(columns = {df.median().idxmax(axis=1):'total'}, inplace=True)

# exclude it and name the one with second highest median 'total_valid'
columns = [col for col in df.columns if not col.startswith('total')]
df.rename(columns = {df[columns].median().idxmax(axis=1):'total_valid'}, inplace=True)

# sometimes total and total valid will be switched.
# Check if column three places to left of total_valid is called total
# If so, switch their names
if df.columns.tolist()[df.columns.tolist().index('total_valid')-3] == 'total':
    df.rename(columns = {'total': 'total_valid', 'total_valid':'total'}, inplace=True)

In [225]:
# get names of columns one and two places to right of 'total valid'
sublist = ['total_valid']
rejected = df.columns.tolist()[(df.columns.get_indexer(sublist)+1)[0]]
nota = df.columns.tolist()[(df.columns.get_indexer(sublist)+2)[0]]
first = df.columns.tolist()[0]
second = df.columns.tolist()[1]

# rename first column 'page_idx' and other columns
df.rename(columns={rejected:'rejected',
                   nota:'nota',
                   first:'page_idx',
                   second:'serial_1',
                  serial[0]:'serial'}, inplace = True)

In [226]:
# delete all rows in which no cell has more than two digits (also accounting for .0)

mask = df.apply(lambda x: x.astype(str).str.contains(r'^\d{,2}(\.0)?$', regex=True)).all(axis=1)
df = df[~mask]
df.reset_index(drop=True, inplace=True)

  return func(self, *args, **kwargs)


In [227]:
# correcting columns wrongly shifted

# Identify all rows that have NAN in the column furthest right
# and which do not have an 'A' in the serial column. Those should not be moved
rowlist = df[(df.iloc[:,-1].isna()) & (~df['serial'].str.contains('A', na=False))].index.tolist()

# Calculate how many standard deviations all values of each row are away from the average of the respective columns 
for row in rowlist:
    collist = df.dtypes[df.dtypes == float].index.tolist()
    sdlist_old = []
    for col in collist:
        sdlist_old.append((((df.loc[row, col] - df[col].mean())**2)**0.5) / df[col].std())
    
# Compute the average standard deviation for each of these rows 
    rowsd_old = np.nanmedian(sdlist_old)

# Shift the values of the row to the right and report the new average standard deviation 
    df1 = df.copy(deep=True)
    df1.loc[row, :] = df1.loc[row, :].shift(1, axis=0)
    collist = df1.dtypes[df1.dtypes == float].index.tolist()
    sdlist_new = []
    for col in collist:
        sdlist_new.append((((df1.loc[row, col] - df1[col].mean())**2)**0.5) / df1[col].std())
    
    rowsd_new = np.nanmedian(sdlist_new)
    
# Take over the shift if the new SD is smaller than the old SD

    if rowsd_old > rowsd_new:
        df.loc[row] = df1.loc[row]
        print(f'Shifted row {row} to the right.')
        
        serial = [df.iloc[:,2].name]
        not_serial = df.loc[:,df.columns != serial[0]].columns.tolist()

        # then convert all remaining characters to numeric or nan
        for col in not_serial:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].astype(float)

Shifted row 188 to the right.
Shifted row 356 to the right.
Shifted row 357 to the right.
Shifted row 358 to the right.


## Correcting First Serial Numbers

In [228]:
# For errors without a gap (no row is missing)

# Idea is: if (n)+1 does not equal (n+1), then see if (n-1)+2 equals (n+1)
# This logic is extended for up to 5 numbers ahead of n. In this way, gaps 
# of up to 4 numbers will be bridged. At the same time, there will be no
# interpolation if there is no clean continuation of integers.

for n in df.index.tolist()[1:]:
    try:
        if df.iloc[n,1] != df.iloc[n-1,1]+1:
            if df.iloc[n-1,1]+2 == df.iloc[n+1,1]:
                df.iloc[n,1] = df.iloc[n-1,1]+1
            if df.iloc[n-1,1]+3 == df.iloc[n+2,1]:
                df.iloc[n,1] = df.iloc[n-1,1]+1
                df.iloc[n+1,1] = df.iloc[n-1,1]+2
            if df.iloc[n-1,1]+4 == df.iloc[n+3,1]:
                df.iloc[n,1] = df.iloc[n-1,1]+1
                df.iloc[n+1,1] = df.iloc[n-1,1]+2
                df.iloc[n+2,1] = df.iloc[n-1,1]+3
            if df.iloc[n-1,1]+5 == df.iloc[n+4,1]:
                df.iloc[n,1] = df.iloc[n-1,1]+1
                df.iloc[n+1,1] = df.iloc[n-1,1]+2
                df.iloc[n+2,1] = df.iloc[n-1,1]+3
                df.iloc[n+3,1] = df.iloc[n-1,1]+4
            if df.iloc[n-1,1]+6 == df.iloc[n+5,1]:
                df.iloc[n,1] = df.iloc[n-1,1]+1
                df.iloc[n+1,1] = df.iloc[n-1,1]+2
                df.iloc[n+2,1] = df.iloc[n-1,1]+3
                df.iloc[n+3,1] = df.iloc[n-1,1]+4
                df.iloc[n+4,1] = df.iloc[n-1,1]+5
    except:
        None
        
# For errors with a gap (occur often between last row of one page and first row of next)
# define index of last row of each page
last = [x-1 for x in df[df['page_idx']==0].index.tolist()[1:]]

# if (n) does not equal (n-1)+1, then make it so
for n in last:
    try:
        if df.iloc[n,1] != df.iloc[n-1,1]+1:
            df.iloc[n,1] = df.iloc[n-1,1]+1
    except:
        None

# for errors at beginning of pages

for n in df[df['page_idx']==0].index.tolist()[1:]:
    if df.iloc[n,1] != df.iloc[n+1,1]-1:
        df.iloc[n,1] = df.iloc[n+1,1]-1


before: 359.0, nan
after: 359.0, 360.0


In [229]:
# for errors at beginning of pages

for n in df[df['page_idx']==0].index.tolist()[1:]:
    if df.iloc[n,1] != df.iloc[n+1,1]-1:
        df.iloc[n,1] = df.iloc[n+1,1]-1


before: nan, 1.0
after: 0.0, 1.0


In [231]:
# trying out rule to take care of numbers missing digits

for n in [e for e in df.index.tolist()[1:] if e not in df[df['page_idx']==0].index.tolist()[1:]]:
    try:
        if df.iloc[n,1] != df.iloc[n-1,1]+1:
            print(f'before: {df.iloc[n-2,1]} first: {df.iloc[n-1,1]} second:{df.iloc[n,1]}')
    except:
        None

In [124]:
d = {'col1': [None, '1A', '', '', '', '', '334', '34', '66', '6', '68', '77A', '7', '79', '88A', '8', '89A'],
     'col2': ["", '1A', '', '', '', '', '334', '34', '66', '6', '68', '77A', '7', '79', '88A', '8', '89A']}
df = pd.DataFrame(data=d)

In [116]:
# for all rows except last row None
for n in df.index.tolist()[:-1]:
    # if the next row ends with an A
    if str(df.iloc[n+1,1]).endswith("A"):
        # then just make n = next row
        df.iloc[n,1] = int(''.join(c for c in df.iloc[n+1,1] if c.isdigit()))

# for all except first row
for n in df.index.tolist()[1:]:
    # if the former row ends with an A
    if str(df.iloc[n-1,1]).endswith("A"):
        # then just make n = former row + 1
        df.iloc[n,1] = int(''.join(c for c in df.iloc[n-1,1] if c.isdigit()))+1

# for all except first and last row
for n in df.index.tolist()[1:-1]:
    try:
        if (int(''.join(c for c in str(df.iloc[n-1,1]) if c.isdigit())) ==
            int(''.join(c for c in str(df.iloc[n+1,1]) if c.isdigit()))-1):
            df.iloc[n,1] = ''.join([str(int(''.join(c for c in str(df.iloc[n-1,1]) if c.isdigit()))), 'A'])
    except:
        print("this is n-1:" , df.iloc[n-1,2], '\n this is n: ',
              df.iloc[n,2], '\n this is n+1: ', df.iloc[n+1,2])
        
# in cases where n-1 = n+1 -2 (then n should not have an A and simply be n-1 + 1)
for n in df.index.tolist()[1:-1]:
    if str(df.iloc[n-1,1]) != '' and str(df.iloc[n+1,1]) != '':
        if (int(''.join(c for c in str(df.iloc[n-1,1]) if c.isdigit()))
        == int(''.join(c for c in str(df.iloc[n+1,1]) if c.isdigit()))-2):
            df.iloc[n,1] = int(''.join(c for c in df.iloc[n-1,1] if c.isdigit()))+1

IndexError: single positional indexer is out-of-bounds

In [126]:
df = df.fillna('nan')
df.iloc[0,0]

'nan'

## Renaming Columns according to Candidates

### Getting rank-party pairs

In [130]:
constituency = '025'

In [133]:
# import excel on candidate names
d = pd.read_excel('/home/hennes/Internship/Party_Data_2021.xlsx')
# define df excluding NOTA, only current constituency
dat = d[(d['PARTY']!= 'NOTA')
        & (d['AC NO.']== float(re.findall(r'[1-9][0-9]*',f'AC{constituency}.csv')[0]))][['AC NO.', 'PARTY', 'TOTAL']]
# get number of candidates
n_candidates = len(dat)

In [134]:
# create column with rank of party per constituency
dat['rank'] = dat.groupby('AC NO.').rank(ascending=False)
# create dict with party value pair
rank_party = pd.Series(dat.PARTY.values,index=dat['rank']).to_dict()
rank_party

{1.0: 'BJP', 2.0: 'AITC', 3.0: 'INC', 4.0: 'IND', 5.0: 'BSP', 6.0: 'SUCI'}

### Getting column-rank pairs

In [127]:
df = pd.read_csv(folder+'AC025.csv')

In [142]:
# create dictionary with key = column name, value = rank
serial = df.columns.get_indexer(['serial'])[0]
column_rank = df.iloc[:,serial+1:serial+(n_candidates+1)]\
    .agg(func=np.sum)\
    .rank(ascending=False)\
    .to_dict()

In [144]:
# Renaming column according to rank
rename_dict={}
for col, rank in column_rank.items():
    rename_dict.update({col:rank_party.get(rank)}) 
df.rename(columns=rename_dict, inplace=True)

In [145]:
df

Unnamed: 0,page_idx,serial_1,serial,BJP,AITC,INC,BSP,SUCI,IND,total_valid,rejected,nota,total,12,ac
0,0.0,2.0,2,203.0,273.0,8.0,3.0,2.0,6.0,495.0,0.0,15.0,510.0,0.0,25
1,1.0,3.0,3,300.0,513.0,20.0,3.0,4.0,10.0,450.0,4.0,11.0,461.0,0.0,25
2,2.0,4.0,4,380.0,159.0,17.0,4.0,2.0,7.0,569.0,0.0,17.0,586.0,0.0,25
3,3.0,5.0,5,311.0,258.0,19.0,8.0,6.0,8.0,610.0,0.0,23.0,633.0,0.0,25
4,4.0,6.0,6,295.0,192.0,63.0,3.0,3.0,11.0,567.0,0.0,20.0,587.0,0.0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,0.0,386.0,304,409.0,242.0,85.0,6.0,6.0,2.0,750.0,0.0,14.0,764.0,0.0,25
371,1.0,387.0,305,381.0,198.0,5.0,2.0,9.0,5.0,670.0,0.0,10.0,680.0,0.0,25
372,2.0,388.0,305A,304.0,174.0,48.0,3.0,2.0,6.0,539.0,0.0,5.0,544.0,0.0,25
373,3.0,389.0,306,247.0,189.0,65.0,2.0,2.0,2.0,507.0,0.0,4.0,511.0,0.0,25


### Renaming column according to rank

In [1]:
# Importing Packages

import pandas as pd
import numpy as np
import re
import glob
import os
import concurrent # for parallel instances
import warnings

# Defining Selection of Constituencies

df = pd.read_excel('/home/hennes/Downloads/Book.xlsx')

# get relevant pdf numbers

worklist = df[df['Ready for Cleaning and Merging?'] == 'y']['Constituency number'].tolist()

# give appropriate filename endings to items

for idx, item in enumerate(worklist):
    if len(str(item)) == 1:
        worklist[idx] = f'AC00{item}.csv'
    if len(str(item)) == 2:
        worklist[idx] = f'AC0{item}.csv'
    if len(str(item)) == 3:
        worklist[idx] = f'AC{item}.csv'
        
worklist = tuple(worklist)

# Defining Folders
folder = '/home/hennes/Internship/constituencies/'
save_folder = '/home/hennes/Internship/constituencies_edit/'
old = '/home/hennes/Internship/old_files/'
candidates = pd.read_csv('/home/hennes/Internship/Party_Data_2019.csv')
PC_AC = set(sorted([folder.split('-')[0]+'-'+ folder.split('-')[1] for folder in next(os.walk(old))[1]]))
PC_AC_dict = {e.split('-')[1]: e.split('-')[0] for e in PC_AC}
constituencies = sorted([os.path.split(file)[-1] for file in glob.glob(folder+'*') if file.endswith(".csv")]) # list with all files
constituencies = [file for file in constituencies if file.endswith(worklist)]

In [6]:
constituency = 'AC005.csv'
df = pd.read_csv('/home/hennes/Internship/constituencies_edit/AC005.csv')
candidate_df = pd.read_csv('/home/hennes/Internship/Party_Data_2019.csv')

In [25]:
# get appropriate constituency number
con_n = PC_AC_dict[constituency.split('.')[0]].split('C')[-1].replace('0', '')

# define df excluding NOTA, only current constituency
dat = candidate_df[(candidate_df['Party']!= 'NOTA') & (candidate_df['Constituency_No'] == int(con_n))]\
[['Constituency_No', 'Party', 'Votes', 'Constituency_Name']]

In [27]:
dat['Votes'] = pd.to_numeric(dat['Votes'], errors='coerce')
# create column with rank of party per constituency
dat['rank'] = dat.groupby('Constituency_No').rank(ascending=False)
# get number of candidates
n_candidates = len(dat)
# create dict with party value pair
rank_party = pd.Series(dat.Party.values,index=dat['rank']).to_dict()

# create dictionary with key = column name, value = rank
serial = df.columns.get_indexer(['serial'])[0]
column_rank = df.iloc[:,serial+1:serial+(n_candidates+1)]\
    .agg(func=np.sum)\
    .rank(ascending=False)\
    .to_dict()

df['pc'] = con_n

# Renaming column according to rank
rename_dict={}
for col, rank in column_rank.items():
    rename_dict.update({col:rank_party.get(rank)}) 
df.rename(columns=rename_dict, inplace=True)