-
Notifications
You must be signed in to change notification settings - Fork 0
/
manifest.py
128 lines (112 loc) · 4.45 KB
/
manifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
'''Input file manifests for elcs analyses'''
import os
from glob import glob
from datetime import datetime
from windowspath import checkFile
class Columns:
    '''Named groups of column labels used by the manifest helper functions.

    Every attribute is a list of column-name strings created fresh for each
    instance, so a caller that mutates a returned list does not affect lists
    obtained from another instance.
    '''

    def __init__(self):
        # target, ucr, newcol, adversity and scores are concatenated (in that
        # order) by allColumns().
        self.target = [
            "personid", "byr", "MaByr", "PaByr", "MaAgeBr", "PaAgeBr",
            "MaDyr", "PaDyr", "MalastLivingDate", "PalastLivingDate",
            "NumSibs", "NumSibsDieChildhood",
        ]
        # Returned by reproductionColumns(); prepended by newColumns() when
        # scores are requested.
        self.repro = ["BirthKnown", "AgeFirstBirthBin", "MaxParityBin"]
        # Disabled column group, kept for reference:
        #self.income = ["HomeValue_Head1940", "RENT_ToHEAD", "EgoCenIncome", "MaCenIncome_New", "PaCenIncome_New"]
        self.ucr = [
            "DistId", "CTC_TUMOR_MARKER1", "CTC_CS_SITE_SPECIFIC_FACTOR1",
            "DATE_OF_DIAGNOSIS_YYYY", "ER",
        ]
        # Returned by measureColumns().
        self.measures = [
            "MaAgeBr", "AgeMaD", "AgePaD", "NumSibs", "SibsDieKnown",
            "MergedSEI", "MergedNP",
        ]
        self.newcol = [
            "byrBin", "AgeAtDiagnosis", "AgeMaD", "MaAgeBr", "AgePaD",
            "PaAgeBr", "SibsDieKnown", "MergedSEI", "MergedNP",
        ]
        self.adversity = [
            "Under10", "MAliveDiag", "MAlive18", "MaD<10", "PAliveDiag",
            "TeenMa", "PaD<10", "PAlive18", "SibDeath", "LowSES", ">5Sibs",
        ]
        self.scores = [
            "AdversityScore", "%Score", "Complete", "AllMeasures",
            "CenterScore",
        ]
        # events and plot are not read by any helper visible in this file.
        self.events = ["Case", "Event", "Duration", "DiagnosisFrom1990"]
        self.plot = [
            "AgeMaD", "MaAgeBr", "AgePaD", "PaAgeBr", "NumSibs",
            "SibsDieKnown", "MergedSEI", "MergedNP", "byrBin", "Complete",
            "TeenMa", "AgeFirstBirth", "MaxParity",
        ]
def measureColumns():
    '''Return the ``measures`` column list of a freshly built Columns object.'''
    return Columns().measures
def allColumns():
    '''Return one flat list of column names.

    The list is the concatenation, in order, of the target, ucr, newcol,
    adversity and scores groups of a fresh Columns object. The repro,
    measures, events and plot groups are not included.
    '''
    cols = Columns()
    return cols.target + cols.ucr + cols.newcol + cols.adversity + cols.scores
def reproductionColumns():
    '''Return the ``repro`` column list of a freshly built Columns object.'''
    return Columns().repro
def newColumns(scores=True):
    '''Return the newcol and adversity column names as one flat list.

    When ``scores`` is truthy the repro group is placed in front and the
    scores group is appended at the end; otherwise only newcol followed by
    adversity is returned.
    '''
    cols = Columns()
    groups = [cols.newcol, cols.adversity]
    if scores:
        # Same flag controls both the leading and the trailing group.
        groups = [cols.repro] + groups + [cols.scores]
    return [name for group in groups for name in group]
#-----------------------------------------------------------------------------
def fileTotals(step, case, control, new=False):
    '''Record case/control totals for one step in the filteringTotals CSV.

    The file lives at ``setPath() + "filteringTotals.csv"``. One row is
    written: step, case, control and case + control. When ``new`` is truthy
    the file is started over with a header line before the row is written;
    otherwise the row is appended to the existing content.
    '''
    target = "{}{}.csv".format(setPath(), "filteringTotals")
    with open(target, "w" if new else "a") as handle:
        if new:
            handle.write("File Name,Case,Control,Total\n")
        fields = [step, str(case), str(control), str(case + control)]
        handle.write(",".join(fields) + "\n")
#-----------------------------------------------------------------------------
def __getTime__(f):
    '''Return the date stamp embedded in a file name as a datetime.

    The stamp is the text between the first and the last dot of the file
    name (for example ``name.2021-03-04.csv``) and must be in ``%Y-%m-%d``
    form.

    f: path or bare file name; "/" and "\\" are both treated as separators.
    Raises ValueError when the extracted text is not a valid date.
    '''
    # Look only at the file-name component so that a dot in a directory name
    # cannot be mistaken for the start of the stamp.
    name = f.replace("\\", "/").rpartition("/")[2]
    stamp = name[name.find(".") + 1:name.rfind(".")]
    return datetime.strptime(stamp, "%Y-%m-%d")
def __getNewest__(path):
    '''Return the matching file whose name carries the newest date stamp.

    path: glob pattern. If exactly one file matches it is returned as is,
    without inspecting its name. Otherwise matches containing "_summary" or
    "Mahima" are ignored and the remaining file with the latest
    __getTime__ stamp is returned.

    Raises ValueError when no candidate file remains, or when a candidate's
    name does not contain a valid date stamp.
    '''
    matches = glob(path)
    if len(matches) == 1:
        return matches[0]
    # Keyed by timestamp; a later match with the same stamp replaces an
    # earlier one.
    dated = {
        __getTime__(name): name
        for name in matches
        if "_summary" not in name and "Mahima" not in name
    }
    if not dated:
        # Name the pattern instead of surfacing max()'s "empty sequence" error.
        raise ValueError("No dated input file matches pattern: {}".format(path))
    return dated[max(dated)]
def getMergedFile(subset=False, imputed=False):
    '''Return the path of the most recent UCR records file.

    ``subset`` selects the subsetUCRrecords files and takes precedence over
    ``imputed``, which selects the imputedUCRrecords files; with neither flag
    set the mergedUCRrecords files are used. The chosen path is passed to
    checkFile() before being returned.
    '''
    pattern = "Z:/ELCS/mergedUCRrecords.*.csv"
    if subset:
        pattern = "Z:/ELCS/subsetUCRrecords.*.csv"
    elif imputed:
        pattern = "Z:/ELCS/imputedUCRrecords.*.csv"
    newest = __getNewest__(pattern)
    # checkFile comes from windowspath; presumably it validates that the file
    # exists - confirm against that module.
    checkFile(newest)
    return newest
def getCensusFiles():
    '''Return a dict with the fixed "case" and "control" census 1940 file paths.'''
    return {
        "case": "Z:/NewDataFromDavid/20220224_Census1940_Variables/David_Amy_BreastCancer_Main_20220224.csv",
        "control": "Z:/NewDataFromDavid/20220224_Census1940_Variables/David_Amy_BreastCancer_Main_Ctrl_20220224.csv",
    }
def getInfiles(orig=True):
    '''Return a dict of input file paths keyed casecontrol, ucr, case, control.

    With ``orig`` equal to True the fixed original source files are used and
    every path in the dict is passed to checkFile(). Otherwise the ucr, case
    and control entries are the newest dated files under Z:/ELCS/ and
    checkFile() is not called.
    '''
    infiles = {
        "casecontrol": "Z:/NewDataFromDavid/David_Ken_BreastCancer_CaseControl_New.csv",
    }
    # Equality with True (not plain truthiness) decides the branch.
    if orig == True:
        # Original source data
        infiles.update({
            "ucr": "Z:/u0918416/Amycasedat_051916.csv",
            "case": "Z:/NewDataFromDavid/20191121/David_Amy_BreastCancer_Main_20191121.csv",
            "control": "Z:/NewDataFromDavid/20191121/David_Amy_BreastCancer_Main_Ctrl_20191121.csv",
        })
        for location in infiles.values():
            checkFile(location)
    else:
        # Updated files: newest date-stamped version of each
        infiles.update({
            "ucr": __getNewest__("Z:/ELCS/ucr.*.csv"),
            "case": __getNewest__("Z:/ELCS/updbCases.*.csv"),
            "control": __getNewest__("Z:/ELCS/updbControl.*.csv"),
        })
    return infiles
def setOutfile(name):
    '''Return an output path of the form ``<setPath()><name>.<YYYY-MM-DD>.csv``.

    The date is today's local date taken from datetime.now().
    '''
    outdir = setPath()
    stamp = datetime.now().strftime("%Y-%m-%d")
    return "{}{}.{}.csv".format(outdir, name, stamp)
def setPath():
    '''Return the parent of the current working directory as the output dir.

    The returned string always ends with a path separator so that a file
    name can be appended directly. When the working directory is a
    filesystem root, the root itself is returned.
    '''
    parent = os.path.dirname(os.path.normpath(os.getcwd()))
    # Joining with "" appends a single trailing separator, and adds none when
    # the parent already ends with one (e.g. a root directory).
    return os.path.join(parent, "")