In [1]:
import os
import pandas as pd
import numpy as np
import datetime as dt

# Combine Files

## Iterate over files

In [54]:
# Collect the per-ticker CSV exports and derive the ticker symbol from each
# file name. Match on the real extension (not a substring) so names such as
# 'X.csv.bak' are skipped, and strip only the trailing extension so the
# symbol itself is left untouched.
files = sorted(
    x for x in os.listdir('/scratch/ou/hohn/popularity_export/')
    if x.endswith('.csv')
)
ticker = [os.path.splitext(x)[0] for x in files]

## Split tickers into chunks

In [55]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Split the tickers into roughly nine equally sized jobs; when the count is
# not divisible by 9 the remainder forms one extra, shorter chunk.
# Floor division plus max(1, ...) keeps the chunk size a positive int even
# with fewer than nine tickers (a size of 0 would make range() raise).
chunk_size = max(1, len(ticker) // 9)
jobs = list(chunks(ticker, chunk_size))

# The task id selects this job's chunk and is treated as 1-based (num-1
# below). Fail loudly when it is missing or out of range instead of crashing
# in int(None) or silently wrapping around to the last chunk for id 0.
task_id = os.environ.get('SGE_TASK_ID')
if task_id is None:
    raise RuntimeError('SGE_TASK_ID is not set; run this as an SGE array task')
num = int(task_id)
if not 1 <= num <= len(jobs):
    raise IndexError(f'SGE_TASK_ID={num} is outside 1..{len(jobs)}')
sub_ticker = jobs[num-1]

## Append data

In [64]:
# Accumulate every CSV of this job's chunk into three parallel lists:
# first CSV column -> time, second CSV column -> users, plus the ticker
# symbol repeated once per row.
time = []
users = []
tic = []
filepath = '/scratch/ou/hohn/popularity_export'
# The loop variable is deliberately not called `ticker`: that name holds the
# full ticker list built earlier, and overwriting it with a single symbol
# would break a re-run of the chunking cell.
for symbol in sub_ticker:
    filename = symbol + '.csv'
    pos = os.path.join(filepath, filename)
    read = pd.read_csv(pos, sep=',', header=0)
    time.extend(read.iloc[:, 0])
    users.extend(read.iloc[:, 1])
    tic.extend([symbol] * len(read))

## Create DataFrame

In [65]:
# Assemble the long-format frame from the three parallel lists filled above.
columns = {
    'tic': tic,
    'timestamp': time,
    'users_holding': users,
}
data = pd.DataFrame(columns)

## Define datetime variable and timezone

In [66]:
# Parse the raw timestamps into timezone-aware UTC datetimes.
timestamp_utc = pd.to_datetime(data['timestamp'], utc=True)
data['timestamp'] = timestamp_utc

## Assign each observation to a date based on market close (observations after 16:30 US/Eastern count toward the next day)

In [67]:
# Same instants as `timestamp`, expressed in US/Eastern wall-clock time.
eastern = data['timestamp'].dt.tz_convert('US/Eastern')
data['est'] = eastern

In [68]:
# Cut-off time of day (US/Eastern wall clock) after which an observation is
# attributed to the following calendar day.
close_stamp = dt.datetime.strptime('16:30', '%H:%M')
market_close = close_stamp.time()

est_date = data['est'].dt.date
after_close = data['est'].dt.time > market_close
next_day = est_date + dt.timedelta(days=1)
data['date'] = np.where(after_close, next_day, est_date)

In [69]:
# Group the holder counts by ticker and assigned date.
group_keys = ['tic', 'date']
g = data[group_keys + ['users_holding']].groupby(group_keys)

In [70]:
# Per (tic, date) group: first and last reading in row order, plus max and min.
daily_stats = ['first', 'last', 'max', 'min']
daily = g['users_holding'].agg(daily_stats)

In [None]:
# Write this job's daily aggregate to its own file, numbered by task id.
# `num` is an int, so it must be formatted into the name; plain `+`
# concatenation of str and int raises TypeError.
# NOTE(review): later cells read 'rh_pop<N>.pkl' via pd.read_pickle, while
# this cell writes JSON — confirm which format the combine step should use.
filepath = '/scratch/ou/hohn'
filename = f'rh_pop{num}.json'
daily.to_json(os.path.join(filepath, filename), orient='index')

In [17]:
# Load one per-job result (task 2) to inspect it.
chunk_path = '/scratch/ou/hohn/rh_pop2.pkl'
data = pd.read_pickle(chunk_path)

In [18]:
# Preview the first five rows of the loaded chunk.
data.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,first,last,min,max
tic,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BPOP,2018-05-02,119,119,119,119
BPOP,2018-05-03,119,120,119,120
BPOP,2018-05-04,120,120,120,120
BPOP,2018-05-05,121,121,121,121
BPOP,2018-05-06,121,121,121,121


In [25]:
# Stack the ten per-job results (rh_pop1.pkl .. rh_pop10.pkl) into one frame.
# Read every part first and concatenate once: growing a frame with
# pd.concat inside the loop re-copies all earlier rows on each iteration,
# and seeding the result with an empty DataFrame is unnecessary.
filepath = '/scratch/ou/hohn'
parts = [
    pd.read_pickle(os.path.join(filepath, 'rh_pop' + str(i) + '.pkl'))
    for i in range(1, 11)
]
comb = pd.concat(parts)

In [27]:
# Remove the 'first' column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
comb = comb.drop(columns='first', errors='ignore')

In [36]:
# Persist the combined frame.
combined_path = '/scratch/ou/hohn/rh_pop.pkl'
comb.to_pickle(combined_path)

In [2]:
# Shell escape: list the working directory to check which result files exist.
!ls /scratch/ou/hohn

pandoc-scholar	   rh_pop1.pkl	rh_pop4.pkl  rh_pop7.pkl  rh_pop.pkl
popularity_export  rh_pop2.pkl	rh_pop5.pkl  rh_pop8.pkl
rh_pop10.pkl	   rh_pop3.pkl	rh_pop6.pkl  rh_pop9.pkl


In [3]:
# Reload the combined frame from disk.
combined_path = '/scratch/ou/hohn/rh_pop.pkl'
comb = pd.read_pickle(combined_path)

In [5]:
# Spot-check: show all rows stored under the index label 'A'.
comb.loc['A']

Unnamed: 0_level_0,last,min,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-05-02,590.0,590.0,590.0
2018-05-03,587.0,586.0,587.0
2018-05-04,587.0,587.0,587.0
2018-05-05,588.0,588.0,588.0
2018-05-06,588.0,588.0,588.0
...,...,...,...
2020-08-10,1625.0,1616.0,1625.0
2020-08-11,1625.0,1625.0,1630.0
2020-08-12,1625.0,1620.0,1625.0
2020-08-13,1634.0,1628.0,1634.0


In [6]:
import wrds

In [7]:
# Open a WRDS connection; `db` is used by the query cells below.
# NOTE(review): how credentials are supplied is not visible here — confirm
# they are never hardcoded in the notebook.
db = wrds.Connection()

Loading library list...
Done


In [8]:
# NOTE(review): leftover interactive help lookup (IPython `?`); remove before sharing.
wrds?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'wrds' from '/home/ou/hohn/virtualenv/lib/python3.9/site-packages/wrds/__init__.py'>
[0;31mFile:[0m        ~/virtualenv/lib/python3.9/site-packages/wrds/__init__.py
[0;31mDocstring:[0m  
WRDS Python Data Access Library

WRDS-Py is a library for extracting data from WRDS data sources and getting it into Pandas.

    >>> import wrds
    >>> db = wrds.Connection()
    >>> db.list_libraries()
    ['aha', 'aha_sample', 'ahasamp', 'audit', 'audit_audit_comp', ...]
    >>> db.list_tables(library='crsp')
    ['acti', 'asia', 'asib', 'asic', 'asio', 'asix', 'bmdebt', 'bmheader', ...]
    >>> data = db.raw_sql('SELECT * FROM crsp.stocknames', index_col='permno')
    >>> data.head()
             permco      namedt   nameenddt     cusip    ncusip ticker      permno
    10000.0  7952.0  1986-01-07  1987-06-11  68391610  68391610  OMFGA
    10001.0  7953.0  1986-01-09  1993-11-21  36720410  39040610   GFGC
    10001.0  7953.0  1993-1

In [44]:
# Pull the whole CRSP `stocknames` table into a DataFrame.
crsp_name = db.get_table(library='crsp', table='stocknames')

In [45]:
# Preview the first five rows of the CRSP names table.
crsp_name.head(5)

Unnamed: 0,permno,permco,namedt,nameenddt,cusip,ncusip,ticker,comnam,hexcd,exchcd,siccd,shrcd,shrcls,st_date,end_date,namedum
0,10000.0,7952.0,1986-01-07,1987-06-11,68391610,68391610,OMFGA,OPTIMUM MANUFACTURING INC,3.0,3.0,3990.0,10.0,A,1986-01-31,1987-06-30,2.0
1,10001.0,7953.0,1986-01-09,1993-11-21,36720410,39040610,GFGC,GREAT FALLS GAS CO,2.0,3.0,4920.0,11.0,,1986-01-31,2017-08-31,2.0
2,10001.0,7953.0,1993-11-22,2008-02-04,36720410,29274A10,EWST,ENERGY WEST INC,2.0,3.0,4920.0,11.0,,1986-01-31,2017-08-31,2.0
3,10001.0,7953.0,2008-02-05,2009-08-03,36720410,29274A20,EWST,ENERGY WEST INC,2.0,3.0,4920.0,11.0,,1986-01-31,2017-08-31,2.0
4,10001.0,7953.0,2009-08-04,2009-12-17,36720410,29269V10,EGAS,ENERGY INC,2.0,3.0,4920.0,11.0,,1986-01-31,2017-08-31,2.0


In [2]:
import pandasql

In [3]:
# NOTE(review): leftover interactive source lookup (IPython `??`); remove before sharing.
pandasql??

[0;31mType:[0m        module
[0;31mString form:[0m <module 'pandasql' from '/home/ou/hohn/virtualenv/lzho/lib/python3.9/site-packages/pandasql/__init__.py'>
[0;31mFile:[0m        ~/virtualenv/lzho/lib/python3.9/site-packages/pandasql/__init__.py
[0;31mSource:[0m     
[0;32mfrom[0m [0;34m.[0m[0msqldf[0m [0;32mimport[0m [0;34m*[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mos[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mpandas[0m [0;32mas[0m [0mpd[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m[0m_ROOT[0m [0;34m=[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mabspath[0m[0;34m([0m[0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mdirname[0m[0;34m([0m[0m__file__[0m[0;34m)[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m[0;32mdef[0m [0mget_data[0m[0;34m([0m[0mpath[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mos[0m[0;34m.[0m[0mpath[0m[0;34m.[0m[0mjoin[0m[0;34m([0

In [4]:
# NOTE(review): the output below lists every installed package, so the
# `sqlite3` argument does not appear to filter anything — confirm intent
# and remove this environment check before sharing.
pip list sqlite3

Package               Version
--------------------- -----------
argon2-cffi           21.1.0
async-generator       1.10
attrs                 21.2.0
backcall              0.2.0
bleach                4.1.0
certifi               2021.10.8
cffi                  1.15.0
chardet               4.0.0
charset-normalizer    2.0.7
click                 8.0.3
cramjam               2.5.0
DateTime              4.3
debugpy               1.5.1
decorator             5.1.0
defusedxml            0.7.1
entrypoints           0.3
fastparquet           0.7.1
fsspec                2021.11.0
greenlet              1.1.2
idna                  3.3
iniconfig             1.1.1
ipykernel             6.5.0
ipython               7.29.0
ipython-genutils      0.2.0
ipywidgets            7.6.5
jedi                  0.18.0
Jinja2                3.0.3
jsonschema            4.2.1
jupyter               1.0.0
jupyter-client        7.0.6
jupyter-console       6.4.0
jupyter-core          4.9.1
jupyterlab-pygments   0.1.2
jupyte