-
Notifications
You must be signed in to change notification settings - Fork 19
/
Data_Collector.py
208 lines (176 loc) · 7.16 KB
/
Data_Collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 25 11:50:05 2018
@author: kennedy
"""
__author__ = "kennedy Czar"
__email__ = "kennedyczar@gmail.com"
__version__ = '1.0'
import os
import time
from datetime import datetime
from os import chdir
from selenium import webdriver
from TICKERS import TICKER
class Data_collector(object):
    """Create the project folder layout and download stock price history.

    Instantiating the class makes sure the project sub-folders exist under
    ``path``; ``STOCK_EXTRACTOR`` then downloads daily price data for every
    ticker returned by ``TICKER(url).parse()``.
    """

    def __init__(self, path):
        '''
        :Argument:
          :path:
            Existing working directory in which the project folders
            (DATASET, TICKERS, PREDICTED, MODEL) are created.

        :example:
            Data_collector('/your/directory')
            creates /your/directory/DATASET, /your/directory/TICKERS, ...

        :Raises:
            OSError when ``path`` does not exist.
        '''
        self.path = path
        try:
            if not os.path.exists(self.path):
                raise OSError('File path: {} does not exist\n\t\tPlease check the path again'.format(self.path))

            # Kept with their historical leading backslash so that any code
            # reading ``self.FOLDERS`` sees the same values as before.
            self.FOLDERS = ['\\DATASET',
                            '\\TICKERS',
                            '\\PREDICTED',
                            '\\MODEL']
            try:
                for folder in self.FOLDERS:
                    # Join with the platform separator instead of gluing a
                    # literal backslash onto the path: outside Windows the
                    # concatenation produced a sibling directory whose name
                    # contained a backslash rather than a sub-folder.
                    target = os.path.join(self.path, folder.lstrip('\\/'))
                    if os.path.exists(target):
                        print('File Already Existing : {}'.format(target))
                    else:
                        # Create the folder if it was never created or was deleted.
                        os.makedirs(target)
                        print('====== 100% Completed ==== : {}'.format(target))
            except OSError as e:
                # Best effort: report the failure and stop creating folders.
                print('File Already existing: {}'.format(e))
        finally:
            print('Process completed...Exiting')

    def STOCK_EXTRACTOR(self, url, start, end):
        '''
        :Functionality:
            Reads the ticker symbols from ``TICKER(url).parse()['IB_Symbol']``,
            appends the '.MX' suffix to each and downloads its price history
            through ``fix_yahoo_finance``, saving one CSV per ticker.

        :Argument:
          :url:   page handed to ``TICKER`` to obtain the symbol table.
          :start: first date of the download window.
          :end:   last date of the download window. Only used for tickers
                  that have no CSV yet; tickers whose CSV already exists are
                  re-downloaded up to ``datetime.now()``.

        :Raises:
            OSError when writing a dataset file fails.

        NOTE: the output locations ('../TICKERS', '../DATASET') are relative
        to the current working directory, not to ``self.path``.
        '''
        import fix_yahoo_finance as yahoo
        import pandas as pd

        self.url = url
        self.START = start
        self.END = end

        # Build the list of stocks to download.
        self.SYMBOLS = TICKER(self.url).parse()
        self.TICKERS = [symbol + '.MX' for symbol in self.SYMBOLS['IB_Symbol'].values]

        # Persist the ticker list once per extension.
        # NOTE(review): every file is written with ``to_csv``, so the
        # '.xlsx' and '.json' files contain CSV text - confirm whether real
        # Excel/JSON output is wanted before changing it.
        STOCK_TICKER_ = pd.DataFrame(self.TICKERS)
        self.FORMAT = ['.csv', '.xlsx', '.json']
        for extension in self.FORMAT:
            STOCK_TICKER_.to_csv('../TICKERS/STOCK_TICKER{}'.format(extension))

        print('======= Begin downloading stock dataset ======')
        self.unavailable_ticks = []
        try:
            for ticker in self.TICKERS:
                # Kept as an attribute because the original loop exposed the
                # ticker currently being processed this way.
                self.TICK_SYMBOLS = ticker
                destination = '../DATASET/{}.csv'.format(ticker)
                # A ticker that already has a CSV is refreshed up to "now";
                # a new ticker uses the requested end date.
                if os.path.exists(destination):
                    end_date = datetime.now()
                else:
                    end_date = self.END
                self._download_ticker(yahoo, ticker, destination, end_date)
        except OSError as e:
            raise OSError('Something wrong with destination path: {}'.format(e))
        finally:
            print('API Download Completed..!')
            print('*'*40)
            print('External Download Begin..!')
            print('Unavailable tickers are \n{}\n'.format(self.unavailable_ticks))
            print('*'*60)
            print('Downloading Unavailable data from YAHOO..')
            print('*'*40)
            print('External Download Completed..!')
            print('*'*40)
            print('Process Completed..!')
            print('A total of {} unavailable tickers'.format(len(self.unavailable_ticks)))

    def _download_ticker(self, yahoo, ticker, destination, end):
        """Download one ticker and save it to ``destination`` as CSV.

        A ``ValueError`` raised while downloading or saving marks the ticker
        as unavailable (appended to ``self.unavailable_ticks``).
        """
        import sys

        try:
            df = yahoo.download(ticker, start=self.START, end=end)
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            # Frames of at most 1024 bytes are not written - presumably a
            # guard against empty downloads; threshold kept as-is.
            if sys.getsizeof(df) > 1024:
                df.to_csv(destination)
        except ValueError:
            print('{} is unavailable'.format(ticker))
            self.unavailable_ticks.append(ticker)
def YAHOO_(path, start, end, stock_, driver_path="C:/chromedriver.exe"):
    """Download price history from the Yahoo Finance web page with Chrome.

    For every ticker in ``stock_`` a Chrome window is opened on the ticker's
    history page for the ``start``..``end`` window, two page elements are
    clicked, and the browser is given ten seconds before it is shut down.
    A failure on one ticker is reported and the loop moves on to the next.

    :Argument:
      :path:        directory set as Chrome's default download directory.
      :start:       first date of the window (anything ``pd.DatetimeIndex`` accepts).
      :end:         last date of the window.
      :stock_:      iterable of ticker symbols.
      :driver_path: location of the chromedriver executable; defaults to
                    the previously hard-coded 'C:/chromedriver.exe'.
    """
    import pandas as pd
    import numpy as np

    # The URL takes the window as epoch seconds; the index holds nanoseconds.
    date_epoch = pd.DatetimeIndex([start, end])
    date_epoch = date_epoch.astype(np.int64) // 10**9

    chrom_options_ = webdriver.ChromeOptions()
    prefer_ = {'download.default_directory': path,
               'profile.default_content_settings.popups': 0,
               'directory_upgrade': True}
    chrom_options_.add_experimental_option('prefs', prefer_)

    for ii in stock_:
        yahoo_page_ = 'https://finance.yahoo.com/quote/{}/history?period1={}&period2={}&interval=1d&filter=history&frequency=1d'.format(ii, date_epoch[0], date_epoch[1])
        driver = None
        try:
            driver = webdriver.Chrome(driver_path, chrome_options=chrom_options_)
            driver.minimize_window()
            driver.get(yahoo_page_)
            time.sleep(2)
            driver.find_element_by_css_selector('.btn.primary').click()
            time.sleep(2)
            # Raw string: the backslashes are CSS escapes for the parentheses
            # in the class names, not Python escape sequences.
            driver.find_element_by_css_selector(r'.Fl\(end\).Mt\(3px\).Cur\(p\)').click()
            # Fixed wait before the browser is shut down - presumably to let
            # the download finish.
            time.sleep(10)
        except Exception as e:
            # Best effort per ticker: report and continue with the next one.
            print('Download failed for {}: {}'.format(ii, e))
        finally:
            # Always release the browser, even when a step above failed;
            # previously a failure left the Chrome session running.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    print('Could not close browser for {}: {}'.format(ii, e))
if __name__ == '__main__':
    import pandas as pd

    # Location on your drive of the project folder.
    path = 'D:\\BITBUCKET_PROJECTS\\Forecasting 1.0'
    # Page passed to STOCK_EXTRACTOR as the source of ticker symbols.
    base_url = "https://www.interactivebrokers.com/en/index.php?f=2222&exch=mexi&showcategories=STK&p=&cc=&limit=100"

    # Download window: 1 January 2010 up to today.
    start = pd.Timestamp(2010, 1, 1)
    end = datetime.today().date()

    collector = Data_collector(path)
    collector.STOCK_EXTRACTOR(base_url, start, end)