-
Notifications
You must be signed in to change notification settings - Fork 0
/
impute.py
124 lines (110 loc) · 3.5 KB
/
impute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
'''Replaces empty values with imputed values'''
from collections import OrderedDict
from copy import deepcopy
from datetime import datetime
from manifest import *
from statistics import mean, stdev
from windowspath import *
class Impute:
    '''Replaces missing measure values in the merged file with imputed values.

    Constructing an instance reads the merged file once and derives one
    replacement value per measure column: the mean plus one sample standard
    deviation of the non-negative integer values found in that column.
    imputeRecords() then rewrites every record, substituting that value
    wherever a measure cell is blank or "-1", and appends one
    "<column>_imputed" flag ("0"/"1") per measure column.

    NOTE(review): measureColumns, setOutfile, getMergedFile, getDelim,
    setHeader and fileTotals are not defined in this file (presumably they
    come from the star imports); what is said about them below is inferred
    from how they are used here and should be confirmed.
    '''

    def __init__(self):
        # Number of output records by case status (see __replaceValues__)
        self.count = {"case": 0, "control": 0}
        # Filled from setHeader(); used as column name -> index into a row
        self.header = {}
        self.columns = measureColumns()
        self.outfile = setOutfile("imputedUCRrecords")
        self.totalimputed = setOutfile("imputedTotals")
        # column -> list of observed values; each list is replaced by the
        # imputed value (as a string) once __imputeMeasures__ has run
        self.measures = {}
        # column -> "0"; template for the per-row imputation flags
        self.imputed = OrderedDict()
        # column -> [records imputed, records seen]
        self.totals = OrderedDict()
        self.__setMeasures__()
        self.__calculateMeasures__()

    def __setMeasures__(self):
        '''Initializes the measures, totals and flag-template dicts.'''
        for i in self.columns:
            self.measures[i] = []
            self.totals[i] = [0, 0]
            self.imputed[i] = "0"

    def __imputeMeasures__(self):
        '''Replaces each column's value list with its imputed value string.

        The imputed value is str(mean + sample standard deviation).

        Raises:
            StatisticsError: if a column has fewer than two usable values
                (the standard deviation is undefined); the message names
                the offending column.
        '''
        # Local import: the module only imports mean and stdev at file level
        from statistics import StatisticsError
        for i in self.columns:
            values = self.measures[i]
            if len(values) < 2:
                raise StatisticsError(
                    "Cannot impute column '{}': at least two valid values "
                    "are required, found {}".format(i, len(values)))
            self.measures[i] = str(mean(values) + stdev(values))

    def __getMeasures__(self, row):
        '''Appends each measure cell of row that is a non-negative integer.'''
        for i in self.columns:
            try:
                val = int(row[self.header[i]])
            except ValueError:
                # Blank or non-integer cell: nothing to learn from it
                continue
            if val >= 0:
                self.measures[i].append(val)

    def __calculateMeasures__(self):
        '''Reads the merged file and computes the imputed value per column.'''
        print("\tCalculating imputed values...")
        with open(getMergedFile(), "r") as f:
            header_line = next(f, None)
            if header_line is not None:
                header_line = header_line.strip()
                d = getDelim(header_line)
                self.header = setHeader(header_line.split(d))
                for line in f:
                    # Only drop the line ending: a full strip() would also
                    # eat trailing empty cells of a tab-delimited row
                    line = line.rstrip("\r\n")
                    if line:
                        self.__getMeasures__(line.split(d))
        self.__imputeMeasures__()

    #-----------------------------------------------------------------------------

    def __writeTotals__(self):
        '''Writes the per-column imputation counts to self.totalimputed.'''
        print("\tWriting total imputed records to file...")
        with open(self.totalimputed, "w") as out:
            out.write("Column,ImputedValue,#Imputed,Total,%\n")
            for k, (imputed, total) in self.totals.items():
                # No records processed: report 0% rather than dividing by 0
                p = imputed / total if total else 0.0
                out.write("{},{},{},{},{:.2%}\n".format(
                    k, self.measures[k], imputed, total, p))

    def __replaceValues__(self, row):
        '''Inserts imputed values where needed and appends the flag columns.

        A measure cell is treated as missing when it is blank or "-1"
        (surrounding whitespace ignored). row is modified in place and
        returned; self.totals and self.count are updated as a side effect.
        '''
        # Values are plain strings, so a shallow copy is sufficient
        imp = self.imputed.copy()
        for i in self.columns:
            idx = self.header[i]
            self.totals[i][1] += 1
            cell = row[idx].strip()
            if not cell or cell == "-1":
                row[idx] = self.measures[i]
                imp[i] = "1"
                self.totals[i][0] += 1
        row.extend(imp.values())
        if row[self.header["Case"]].strip() == "1":
            self.count["case"] += 1
        else:
            self.count["control"] += 1
        return row

    def __outputHeader__(self, line, d):
        '''Returns the header line with one "<column>_imputed" name appended
        per measure column, joined with delimiter d and newline-terminated.'''
        row = line.strip().split(d)
        row.extend("{}_imputed".format(i) for i in self.columns)
        return d.join(row) + "\n"

    def imputeRecords(self):
        '''Writes the merged records to self.outfile with missing data imputed.

        Afterwards writes the imputation totals and reports the case/control
        counts through fileTotals().
        NOTE(review): the meaning of the True passed to getMergedFile is not
        visible in this file.
        '''
        print("\tReplacing missing data with imputed values...")
        with open(self.outfile, "w") as out, open(getMergedFile(True), "r") as f:
            header_line = next(f, None)
            if header_line is not None:
                d = getDelim(header_line)
                out.write(self.__outputHeader__(header_line, d))
                for line in f:
                    # Keep trailing empty cells; skip genuinely empty lines
                    line = line.rstrip("\r\n")
                    if not line:
                        continue
                    row = self.__replaceValues__(line.split(d))
                    # Same delimiter as the header so the output stays parseable
                    out.write(d.join(row) + "\n")
        self.__writeTotals__()
        fileTotals("Imputed records", self.count["case"], self.count["control"])
def main():
    '''Runs the full imputation and prints the elapsed wall-clock time.'''
    started = datetime.now()
    print("\n\tImputing missing data...")
    # Building the instance computes the imputed values; imputeRecords()
    # then writes the output files.
    imputer = Impute()
    imputer.imputeRecords()
    elapsed = datetime.now() - started
    print("\tFinished. Run time: {}\n".format(elapsed))


if __name__ == "__main__":
    main()