# main.py
# Load the ticket CSV exports found in the src/ directory under the current
# working directory, clean each record, and bulk-load the result into the
# Postgres "tickets" table.
import general_utils as Ugen
import pandas as pd
import glob, os
import logging as log
import database_operations as dbo
import config
import datetime
import DataCleaning
# Configure the root logger once at start-up.  Each line shows the timestamp,
# the emitting module, the severity, and the message.
log.basicConfig(
    level=log.DEBUG,  # Change debug level to choose how verbose you want logging to be
    format='%(asctime)s - %(module)s - %(levelname)s - %(message)s',
)
## IDEAS
#1) Location based profitability
#2) most profitable ticket types
#3) most common time of infraction
## GET DATA
# GET FILES
main_dir = os.getcwd()
# Sorted so files are always read (and their rows concatenated) in the same
# order; glob returns matches in arbitrary order.
allFiles = sorted(glob.glob(os.path.join(main_dir, 'src', '*.csv')))


def _bad_line_kwargs():
    """Return the read_csv keyword that skips malformed lines on this pandas.

    pandas 1.3 added ``on_bad_lines`` and deprecated ``error_bad_lines``;
    pandas 2.0 removed ``error_bad_lines`` entirely, so passing it there makes
    every read_csv call fail with a TypeError.  ``on_bad_lines='warn'`` matches
    the old ``error_bad_lines=False`` behaviour (skip the line, emit a warning).
    """
    try:
        major, minor = (int(part) for part in pd.__version__.split('.')[:2])
    except ValueError:
        # Unparseable version string: assume a current pandas release.
        return {'on_bad_lines': 'warn'}
    if (major, minor) >= (1, 3):
        return {'on_bad_lines': 'warn'}
    return {'error_bad_lines': False}


#LOAD TO DATAFRAME
print ('Attempting to load files')
dfs = []
for file_path in allFiles:
    try:
        # quoting=2 is csv.QUOTE_NONNUMERIC.
        df = pd.read_csv(file_path, index_col=None, quoting=2,
                         **_bad_line_kwargs())
    except Exception as err:
        if 'EOF following escape character' not in str(err):
            print ('NOT loaded: '+ file_path)
            print (str(err))
            continue
        # NOTE(review): this parser error is presumably what a UTF-16 export
        # produces when read with the default encoding -- retry as UTF-16.
        try:
            df = pd.read_csv(file_path, index_col=None, encoding='utf-16',
                             **_bad_line_kwargs())
        except Exception as retry_err:
            print ('NOT loaded: '+ file_path)
            print (str(retry_err))
            continue
    dfs.append(df)
    print ('successfully loaded: '+ file_path)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# stop with a message that says what actually went wrong.
if not dfs:
    raise SystemExit('No CSV files could be loaded from '
                     + os.path.join(main_dir, 'src'))
tpt_df = pd.concat(dfs)
## CLEAN DF
# get rid of unnecessary columns
tpt_df = tpt_df.drop(['tag_number_masked', 'location3', 'location4'], axis=1)

print ('Creating date column')
# date_of_infraction is parsed as a YYYYMMDD value (e.g. 20160131).
tpt_df['date'] = tpt_df['date_of_infraction'].apply(
    lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'))

# Replace every missing value with None so downstream code sees a real null.
# The cast to object is required: on pandas >= 1.3, where(..., None) on a
# numeric column keeps the numeric dtype and leaves NaN in place, and NaN is
# truthy, so it would be handed to the cleaning functions as if it were data.
tpt_df = tpt_df.astype(object).where(pd.notnull(tpt_df), None)
def cleanRecord(record):
    """Build the list of cleaned column values for one ticket record.

    Walks ``config.FIELD_MAP`` in order.  For each entry, the value stored in
    ``record`` under ``entry['name']`` is passed to the ``DataCleaning``
    function named by ``entry['func']`` as ``val=``, together with
    ``length=entry.get('length', 0)``.  A missing or falsy value yields
    ``None`` without calling the cleaner.

    Returns a list with one element per FIELD_MAP entry, in FIELD_MAP order.
    """
    cleaned = []
    for spec in config.FIELD_MAP:
        raw = record.get(spec['name'])
        # NOTE(review): truthiness test -- 0 and '' are also turned into None,
        # while NaN is truthy and is passed to the cleaner.  Confirm intended.
        if not raw:
            cleaned.append(None)
            continue
        cleaner = getattr(DataCleaning, spec['func'])
        cleaned.append(cleaner(val=raw, length=spec.get('length', 0)))
    return cleaned
print ('Converting to dict')
records = tpt_df.to_dict('records')

print ('Creating records list')
# One cleaned row (list of column values) per DataFrame record.
data = [cleanRecord(record) for record in records]

print ('Loading %s rows to Postgres' % len(data))
conn = dbo.getConnection()
try:
    dbo.postgres_load(conn, 'tickets', data)
finally:
    # Always release the connection, even when the load fails.
    # NOTE(review): assumes dbo.closeConnection only closes the connection --
    # confirm it does not commit a partially loaded batch.
    dbo.closeConnection(conn)