# Processing Speed

This notebook focuses solely on speeding up algorithms and reducing memory requirements.






Check out `concurrent.futures`:
https://towardsdatascience.com/heres-how-you-can-get-a-2-6x-speed-up-on-your-data-pre-processing-with-python-847887e63be5  

Big data  
https://www.dataquest.io/blog/pandas-big-data/  


In [184]:
# Directory holding the project's helper modules; appended to sys.path below so they can be imported.
# NOTE(review): this is an absolute, machine-specific path — the notebook will not run
# on another machine as-is; consider a path relative to the notebook or a config value.
path = '/Users/jacob/Desktop/studies/misc/kaggle/two_sigma_news/scripts'
import resource
import sys
import pandas_datareader as web
 
sys.path.append(path)

# NOTE(review): later cells use `pd`, `np` and `tqdm` without importing them here, so they
# presumably arrive through these star imports — confirm, and prefer explicit imports.
from sigma_libs import *
from sigma_1 import *
from assetExpand import *
# Presumably enables tqdm's progress-bar hooks on pandas objects — confirm against tqdm docs.
tqdm.pandas()

In [185]:
import concurrent.futures


In [186]:

# Demo data: sample CSVs for the market and news tables, read from ../data
# relative to the notebook's working directory.
market_train = pd.read_csv('../data/marketdata_sample.csv') 
news_train = pd.read_csv('../data/news_sample.csv')


## Data types

Some of the high-cardinality categorical columns are consuming a lot of memory.

In [187]:
# Swap each listed text column of news_train for the integer codes produced by pd.factorize.
for col in ('headlineTag', 'provider', 'sourceId'):
    # Only the codes (first element of the result) are kept; the array of
    # unique values is never bound to a name, so there is nothing to delete.
    news_train[col] = pd.factorize(news_train[col])[0]

__Functions to use:__


```
mem_usage(pandas_obj)  
d_type_usage(dframe) 
```


    

In [189]:
import os, psutil  

def cpu_stats():
    """Return a short label with the current process's memory use.

    Despite the name, this reports memory rather than CPU figures. The value
    is the first field of psutil's ``memory_info()`` for this process
    (presumably the resident set size — confirm against psutil docs),
    divided by 2**30 and rounded to two decimals.

    Returns:
        str: text of the form ``'memory GB:<value>'``.
    """
    this_process = psutil.Process(os.getpid())
    first_field_bytes = this_process.memory_info()[0]
    scaled = first_field_bytes / 2. ** 30
    return 'memory GB:' + str(np.round(scaled, 2))

In [None]:
# Inspect the column dtypes: object columns with many distinct values are the
# ones that should be factorized.
news_train.dtypes

## Practice Dataset

In [191]:
# Practice dataset used by the memory-optimisation steps in the cells below.
gl = pd.read_csv('../data/game_logs.csv')

In [192]:
# NOTE(review): if these lines form a single cell, `gl.shape` is not the last
# expression and its value is not displayed.
gl.shape

# Summary of the frame; memory_usage='deep' requests the deep memory figure
# (the captured output reports the total in MB).
gl.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 738.1 MB


In [193]:
# Print the average deep memory usage, in MB, of the columns of each dtype family in gl.
BYTES_PER_MB = 1024 ** 2
report_line = "Average memory usage for {} columns: {:03.2f} MB"

for dtype in ('float', 'int', 'object'):
    usage_bytes = gl.select_dtypes(include=[dtype]).memory_usage(deep=True)
    print(report_line.format(dtype, usage_bytes.mean() / BYTES_PER_MB))

Average memory usage for float columns: 1.29 MB
Average memory usage for int columns: 1.12 MB
Average memory usage for object columns: 7.96 MB


We can see here the difference between __uint (unsigned integers)__ and __int (signed integers)__. Both types have the same capacity for storage, but by only storing non-negative values, unsigned integers allow us to be more efficient with our storage of columns that never contain negative values.

We will need to __downcast__ the numeric columns to the smallest type that fits their values.

In [194]:
# Show the representable range of a few small integer types, one report per type.
int_types = ["uint8", "int8", "int16"]
print(*map(np.iinfo, int_types), sep="\n")

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------



In [195]:
# We're going to be calculating memory usage a lot,
# so we'll create a function to save us some time!

def mem_usage(pandas_obj):
    """Return the deep memory usage of a pandas object as an ``"x.xx MB"`` string.

    Args:
        pandas_obj: a DataFrame, or any other object exposing
            ``memory_usage(deep=True)`` (a Series is the expected alternative).

    Returns:
        str: the usage in megabytes (bytes / 1024**2), two decimals, followed by " MB".
    """
    usage = pandas_obj.memory_usage(deep=True)
    # For a DataFrame the result is summed into one total; anything else is
    # assumed to be a Series, whose result is already a single number.
    total_bytes = usage.sum() if isinstance(pandas_obj, pd.DataFrame) else usage
    return "{:03.2f} MB".format(total_bytes / 1024 ** 2)

In [196]:
# --- Integer columns: ask to_numeric to downcast each one to an unsigned type ---
gl_int = gl.select_dtypes(include=['int'])
converted_int = gl_int.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))

print(mem_usage(gl_int))
print(mem_usage(converted_int))

# Dtype of every integer column before and after, then a count per dtype.
compare_ints = pd.concat([gl_int.dtypes, converted_int.dtypes],
                         axis=1, keys=['before', 'after'])
compare_ints.apply(pd.Series.value_counts)

# --- Float columns: same procedure with downcast='float' ---
gl_float = gl.select_dtypes(include=['float'])
converted_float = gl_float.apply(lambda column: pd.to_numeric(column, downcast='float'))

print(mem_usage(gl_float))
print(mem_usage(converted_float))

compare_floats = pd.concat([gl_float.dtypes, converted_float.dtypes],
                           axis=1, keys=['before', 'after'])
compare_floats.apply(pd.Series.value_counts)

# --- Combine: start from a copy of gl and overwrite the numeric columns ---
optimized_gl = gl.copy()
optimized_gl[converted_int.columns] = converted_int
optimized_gl[converted_float.columns] = converted_float

print(mem_usage(gl))
print(mem_usage(optimized_gl))

7.00 MB
1.00 MB
100.00 MB
50.00 MB
738.00 MB
681.00 MB


While we've dramatically reduced the memory usage of our numeric columns, overall we've __only__ reduced the memory usage of our dataframe by 7%. Most of our gains are going to come from __optimizing the object types__.

In [197]:
# Why string storage eats memory: print the size in bytes that getsizeof
# reports for a few Python strings of different lengths.
from sys import getsizeof

s1, s2 = 'working out', 'memory usage for'
# s3 and s4 carry identical text, so they report identical sizes.
s3 = s4 = 'strings in python is fun!'

print(*map(getsizeof, (s1, s2, s3, s4)), sep='\n')

48
53
62
62


In [198]:
# Four strings stored in a pandas Series; apply(getsizeof) measures each
# element individually.
# NOTE(review): if this is a single cell, the result below is not displayed
# because it is not the cell's last expression.
obj_series = pd.Series(['working out',
                          'memory usage for',
                          'strings in python is fun!',
                          'strings in python is fun!'])
obj_series.apply(getsizeof)

# Take an independent copy of just the object-dtype columns of gl and summarise them.
gl_obj = gl.select_dtypes(include=['object']).copy()
gl_obj.describe()

Unnamed: 0,day_of_week,v_name,v_league,h_name,h_league,day_night,completion,forefeit,protest,park_id,v_line_score,h_line_score,hp_umpire_id,hp_umpire_name,1b_umpire_id,1b_umpire_name,2b_umpire_id,2b_umpire_name,3b_umpire_id,3b_umpire_name,lf_umpire_id,lf_umpire_name,rf_umpire_id,rf_umpire_name,v_manager_id,v_manager_name,h_manager_id,h_manager_name,winning_pitcher_id,winning_pitcher_name,losing_pitcher_id,losing_pitcher_name,saving_pitcher_id,saving_pitcher_name,winning_rbi_batter_id,winning_rbi_batter_id_name,v_starting_pitcher_id,v_starting_pitcher_name,h_starting_pitcher_id,h_starting_pitcher_name,v_player_1_id,v_player_1_name,v_player_2_id,v_player_2_name,v_player_3_id,v_player_3_name,v_player_4_id,v_player_4_name,v_player_5_id,v_player_5_name,v_player_6_id,v_player_6_name,v_player_7_id,v_player_7_name,v_player_8_id,v_player_8_name,v_player_9_id,v_player_9_name,h_player_1_id,h_player_1_name,h_player_2_id,h_player_2_name,h_player_3_id,h_player_3_name,h_player_4_id,h_player_4_name,h_player_5_id,h_player_5_name,h_player_6_id,h_player_6_name,h_player_7_id,h_player_7_name,h_player_8_id,h_player_8_name,h_player_9_id,h_player_9_name,additional_info,acquisition_info
count,171907,171907,171907,171907,171907,140150,116,145,180,171907,147271,147271,171888,171891,147040,171891,88540,171127,116723,171135,203,171902,9,171902,171907,171907,171907,171907,140229,140229,140229,140229,48018,140838,105699,140838,171863,171863,171863,171863,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140835,140835,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,140838,1456,140841
unique,7,148,7,148,7,2,116,3,5,245,36367,37859,1149,1146,678,678,324,325,362,363,31,32,8,9,648,648,659,659,5123,5084,5653,5606,3133,3117,5739,5674,5193,5129,5170,5125,2870,2847,3709,3673,2989,2964,2581,2563,3757,3722,4794,4736,5301,5241,4812,4763,5643,5585,2802,2782,3648,3614,2881,2858,2533,2517,3696,3660,4774,4720,5253,5197,4760,4710,5193,5142,332,1
top,Sat,CHN,NL,CHN,NL,D,"19820711,CHI11,5,5,54",H,V,STL07,0,0,klemb901,Bill Klem,connt901,(none),westj901,(none),mcgob901,(none),sudoe901,(none),gormt101,(none),mackc101,Connie Mack,mackc101,Connie Mack,johnw102,Walter Johnson,rixee101,Dutch Leonard,rivem002,(none),pujoa001,(none),younc102,Young,younc102,Young,suzui001,Ichiro Suzuki,fox-n101,Nellie Fox,speat101,Tris Speaker,bottj101,Jim Bottomley,heilh101,Harry Heilmann,grimc101,Charlie Grimm,grimc101,Charlie Grimm,lopea102,Al Lopez,grifa001,Alfredo Griffin,suzui001,Ichiro Suzuki,fox-n101,Nellie Fox,speat101,Tris Speaker,gehrl101,Lou Gehrig,heilh101,Harry Heilmann,grimc101,Charlie Grimm,grimc101,Charlie Grimm,lopea102,Al Lopez,spahw101,Warren Spahn,HTBF,Y
freq,28891,8870,88866,9024,88867,82724,1,69,90,7022,10102,8028,3545,3545,2029,24851,815,82587,1129,54412,30,171699,2,171893,3901,3901,3848,3848,385,385,251,295,523,92820,288,35139,403,441,412,451,893,893,852,852,1224,1224,816,816,663,663,465,465,485,485,687,687,333,333,927,927,859,859,1165,1165,752,752,612,612,427,427,491,491,676,676,339,339,1112,140841


In [199]:
# Convert a single object column (day_of_week) to the category dtype and
# compare it with the original. Apart from the dtype, the printed values look
# exactly the same; the rest of the cell looks under the hood at what changed.

dow = gl_obj.day_of_week
print(dow.head())

dow_cat = dow.astype('category')
print(dow_cat.head())

# Integer codes behind the first five categorical values.
# NOTE(review): not the last expression of the cell, so this is not displayed.
dow_cat.head().cat.codes

# Memory used by the column before and after the conversion.
print(mem_usage(dow))
print(mem_usage(dow_cat))

0    Thu
1    Fri
2    Sat
3    Mon
4    Tue
Name: day_of_week, dtype: object
0    Thu
1    Fri
2    Sat
3    Mon
4    Tue
Name: day_of_week, dtype: category
Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]
7.00 MB
0.00 MB


In [200]:
# Convert object columns to the category dtype when fewer than half of their
# values are distinct; columns that are mostly unique stay as object.
# The distinct count includes NaN (dropna=False), matching Series.unique().
num_total_values = len(gl_obj)
category_cols = [
    col
    for col in gl_obj.columns
    # Guard the ratio so an empty frame does not raise ZeroDivisionError.
    if num_total_values
    and gl_obj[col].nunique(dropna=False) / num_total_values < 0.5
]

# A single astype call with a column -> dtype mapping builds the result in one
# step, instead of growing an empty DataFrame one column at a time via .loc.
# Columns not named in the mapping keep their original dtype.
converted_obj = gl_obj.astype({col: 'category' for col in category_cols})

print(mem_usage(gl_obj))
print(mem_usage(converted_obj))

629.00 MB
48.00 MB


In [201]:
# Dtype of every object column before and after the category conversion, side by side.
compare_obj = pd.concat([gl_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
# Optional: count how many columns ended up with each dtype.
# compare_obj.apply(pd.Series.value_counts)

# Overwrite the object columns of optimized_gl with their converted versions.
optimized_gl[converted_obj.columns] = converted_obj

mem_usage(optimized_gl)

# Look at the date column on its own: its memory usage and first values
# (the captured output shows it stored as uint32 at this point).
date = optimized_gl.date
print(mem_usage(date))
date.head()

0.00 MB


0    18710504
1    18710505
2    18710506
3    18710508
4    18710509
Name: date, dtype: uint32

In [None]:
# Replace the integer date column with parsed datetimes, using the explicit
# YYYYMMDD format; `date` is the Series captured from optimized_gl earlier.
optimized_gl['date'] = pd.to_datetime(date,format='%Y%m%d')

# Report the frame's memory usage after the conversion and preview the new column.
print(mem_usage(optimized_gl))
optimized_gl.date.head()

In [None]:
import pprint

# Collect the dtype name of every optimized column except `date`, which is
# passed to read_csv through parse_dates instead of the dtype mapping.
dtypes = optimized_gl.drop('date', axis=1).dtypes
dtypes_col = dtypes.index
dtypes_type = [column_dtype.name for column_dtype in dtypes.values]
column_types = {name: type_name for name, type_name in zip(dtypes_col, dtypes_type)}

# Rather than print all 161 items, pretty-print only the first 10
# column -> dtype-name pairs of the mapping.
preview = first2pairs = dict(list(column_types.items())[:10])
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

# Re-read the CSV with the recorded dtypes applied at load time.
# NOTE(review): `infer_datetime_format` is reported as deprecated in recent
# pandas releases — confirm against the installed version.
read_and_optimized = pd.read_csv(
    '../data/game_logs.csv',
    dtype=column_types,
    parse_dates=['date'],
    infer_datetime_format=True,
)

print(mem_usage(read_and_optimized))
read_and_optimized.head()