# Uvoz podatkov

## Text encoding: ASCII, Unicode, and others

In [1]:
# Six raw bytes: ASCII "ABC" followed by 0xFF, 0x0C and 0xC1.
raw_bytes = bytes([65, 66, 67, 255, 12, 193])
with open('data/out2.txt', mode='wb') as f:
    f.write(raw_bytes)

In [2]:
!cat data/out2.txt

ABC��

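Decoded as ISO-8859-1 (Latin-1), these six bytes are `ABCÿ`, a form-feed control character (byte 12), and `Á`; with byte 192 in place of 12 they would read `ABCÿÀÁ`.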

In [3]:
# Decode explicitly as UTF-8 so the demonstration does not depend on the
# platform's default encoding. Byte 0xFF cannot start a UTF-8 sequence, so the
# read fails; the error is caught and shown instead of aborting the notebook.
try:
    with open('data/out2.txt', encoding='utf-8') as f:
        f.read()
except UnicodeDecodeError as err:
    print(f'{type(err).__name__}: {err}')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 3: invalid start byte

In [5]:
# Binary mode performs no decoding, so the raw bytes come back untouched.
with open('data/out2.txt', mode='rb') as f:
    raw = f.read()
print(raw)

b'ABC\xff\x0c\xc1'


In [7]:
# Decode as UTF-8 explicitly (the platform default may differ);
# errors='ignore' silently drops every byte that cannot be decoded.
with open('data/out2.txt', encoding='utf-8', errors='ignore') as f:
    print(f.read())

ABC


In [8]:
# Decode as UTF-8 explicitly (the platform default may differ);
# errors='replace' substitutes U+FFFD for every undecodable byte.
with open('data/out2.txt', encoding='utf-8', errors='replace') as f:
    print(f.read())

ABC��


In [9]:
# Decode as UTF-8 explicitly (the platform default may differ);
# errors='backslashreplace' shows each undecodable byte as a \xNN escape.
with open('data/out2.txt', encoding='utf-8', errors='backslashreplace') as f:
    print(f.read())

ABC\xff\xc1


## Reading and Writing Data with pandas

In [10]:
import pandas as pd
import numpy as np

[IO tools (text, CSV, HDF5, …)](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html)

<table class="colwidths-given table">
<colgroup>
<col style="width: 12%">
<col style="width: 40%">
<col style="width: 24%">
<col style="width: 24%">
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Format Type</p></th>
<th class="head"><p>Data Description</p></th>
<th class="head"><p>Reader</p></th>
<th class="head"><p>Writer</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>text</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/Comma-separated_values">CSV</a></p></td>
<td><p><a class="reference internal" href="#io-read-csv-table"><span class="std std-ref">read_csv</span></a></p></td>
<td><p><a class="reference internal" href="#io-store-in-csv"><span class="std std-ref">to_csv</span></a></p></td>
</tr>
<tr class="row-odd"><td><p>text</p></td>
<td><p>Fixed-Width Text File</p></td>
<td><p><a class="reference internal" href="#io-fwf-reader"><span class="std std-ref">read_fwf</span></a></p></td>
<td></td>
</tr>
<tr class="row-even"><td><p>text</p></td>
<td><p><a class="reference external" href="https://www.json.org/">JSON</a></p></td>
<td><p><a class="reference internal" href="#io-json-reader"><span class="std std-ref">read_json</span></a></p></td>
<td><p><a class="reference internal" href="#io-json-writer"><span class="std std-ref">to_json</span></a></p></td>
</tr>
<tr class="row-odd"><td><p>text</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/HTML">HTML</a></p></td>
<td><p><a class="reference internal" href="#io-read-html"><span class="std std-ref">read_html</span></a></p></td>
<td><p><a class="reference internal" href="#io-html"><span class="std std-ref">to_html</span></a></p></td>
</tr>
<tr class="row-even"><td><p>text</p></td>
<td><p>Local clipboard</p></td>
<td><p><a class="reference internal" href="#io-clipboard"><span class="std std-ref">read_clipboard</span></a></p></td>
<td><p><a class="reference internal" href="#io-clipboard"><span class="std std-ref">to_clipboard</span></a></p></td>
</tr>
<tr class="row-odd"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/Microsoft_Excel">MS Excel</a></p></td>
<td><p><a class="reference internal" href="#io-excel-reader"><span class="std std-ref">read_excel</span></a></p></td>
<td><p><a class="reference internal" href="#io-excel-writer"><span class="std std-ref">to_excel</span></a></p></td>
</tr>
<tr class="row-even"><td><p>binary</p></td>
<td><p><a class="reference external" href="http://www.opendocumentformat.org">OpenDocument</a></p></td>
<td><p><a class="reference internal" href="#io-ods"><span class="std std-ref">read_excel</span></a></p></td>
<td></td>
</tr>
<tr class="row-odd"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://support.hdfgroup.org/HDF5/whatishdf5.html">HDF5 Format</a></p></td>
<td><p><a class="reference internal" href="#io-hdf5"><span class="std std-ref">read_hdf</span></a></p></td>
<td><p><a class="reference internal" href="#io-hdf5"><span class="std std-ref">to_hdf</span></a></p></td>
</tr>
<tr class="row-even"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://github.com/wesm/feather">Feather Format</a></p></td>
<td><p><a class="reference internal" href="#io-feather"><span class="std std-ref">read_feather</span></a></p></td>
<td><p><a class="reference internal" href="#io-feather"><span class="std std-ref">to_feather</span></a></p></td>
</tr>
<tr class="row-odd"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://parquet.apache.org/">Parquet Format</a></p></td>
<td><p><a class="reference internal" href="#io-parquet"><span class="std std-ref">read_parquet</span></a></p></td>
<td><p><a class="reference internal" href="#io-parquet"><span class="std std-ref">to_parquet</span></a></p></td>
</tr>
<tr class="row-even"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://orc.apache.org/">ORC Format</a></p></td>
<td><p><a class="reference internal" href="#io-orc"><span class="std std-ref">read_orc</span></a></p></td>
<td></td>
</tr>
<tr class="row-odd"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://msgpack.org/index.html">Msgpack</a></p></td>
<td><p><a class="reference internal" href="#io-msgpack"><span class="std std-ref">read_msgpack</span></a></p></td>
<td><p><a class="reference internal" href="#io-msgpack"><span class="std std-ref">to_msgpack</span></a></p></td>
</tr>
<tr class="row-even"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/Stata">Stata</a></p></td>
<td><p><a class="reference internal" href="#io-stata-reader"><span class="std std-ref">read_stata</span></a></p></td>
<td><p><a class="reference internal" href="#io-stata-writer"><span class="std std-ref">to_stata</span></a></p></td>
</tr>
<tr class="row-odd"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/SAS_(software)">SAS</a></p></td>
<td><p><a class="reference internal" href="#io-sas-reader"><span class="std std-ref">read_sas</span></a></p></td>
<td></td>
</tr>
<tr class="row-even"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/SPSS">SPSS</a></p></td>
<td><p><a class="reference internal" href="#io-spss-reader"><span class="std std-ref">read_spss</span></a></p></td>
<td></td>
</tr>
<tr class="row-odd"><td><p>binary</p></td>
<td><p><a class="reference external" href="https://docs.python.org/3/library/pickle.html">Python Pickle Format</a></p></td>
<td><p><a class="reference internal" href="#io-pickle"><span class="std std-ref">read_pickle</span></a></p></td>
<td><p><a class="reference internal" href="#io-pickle"><span class="std std-ref">to_pickle</span></a></p></td>
</tr>
<tr class="row-even"><td><p>SQL</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/SQL">SQL</a></p></td>
<td><p><a class="reference internal" href="#io-sql"><span class="std std-ref">read_sql</span></a></p></td>
<td><p><a class="reference internal" href="#io-sql"><span class="std std-ref">to_sql</span></a></p></td>
</tr>
<tr class="row-odd"><td><p>SQL</p></td>
<td><p><a class="reference external" href="https://en.wikipedia.org/wiki/BigQuery">Google BigQuery</a></p></td>
<td><p><a class="reference internal" href="#io-bigquery"><span class="std std-ref">read_gbq</span></a></p></td>
<td><p><a class="reference internal" href="#io-bigquery"><span class="std std-ref">to_gbq</span></a></p></td>
</tr>
</tbody>
</table>

### CSV files

#### Primer 1: seaslug.txt

In [11]:
!head -n 5 data/seaslug.txt

Time	Percent
99	0.067
99	0.133
99	0.067
99	0


In [14]:
# Tab-separated file; `delimiter` is the documented alias of `sep`.
seaslug = pd.read_csv('data/seaslug.txt', delimiter='\t')
seaslug.head()

Unnamed: 0,Time,Percent
0,99,0.067
1,99,0.133
2,99,0.067
3,99,0.0
4,99,0.0


- `sep: str, defaults to ',' for read_csv(), \t for read_table()`: Delimiter to use. If sep is None, the C engine cannot automatically detect the separator, but the Python parsing engine can, meaning the latter will be used and automatically detect the separator by Python’s builtin sniffer tool, csv.Sniffer. In addition, separators longer than 1 character and different from '\s+' will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex delimiters are prone to ignoring quoted data. Regex example: '\\r\\t'.

- `delimiter: str, default None`: Alias for sep.

#### Primer 2: FOOD_DES.txt

Encoding: `iso-8859-1`

In [15]:
!head -n 5 data/FOOD_DES.txt

~01001~^~0100~^~Butter, salted~^~BUTTER,WITH SALT~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87
~01002~^~0100~^~Butter, whipped, with salt~^~BUTTER,WHIPPED,W/ SALT~^~~^~~^~Y~^~~^0^~~^6.38^^^
~01003~^~0100~^~Butter oil, anhydrous~^~BUTTER OIL,ANHYDROUS~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87
~01004~^~0100~^~Cheese, blue~^~CHEESE,BLUE~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87
~01005~^~0100~^~Cheese, brick~^~CHEESE,BRICK~^~~^~~^~Y~^~~^0^~~^6.38^4.27^8.79^3.87


In [25]:
# Fields are separated by '^' and text values are wrapped in '~';
# the file has no header row, so columns are numbered automatically.
pd.read_csv(
    'data/FOOD_DES.txt',
    sep='^',
    quotechar='~',
    header=None,
    nrows=10,
    encoding='iso-8859-1',
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1001,100,"Butter, salted","BUTTER,WITH SALT",,,Y,,0,,6.38,4.27,8.79,3.87
1,1002,100,"Butter, whipped, with salt","BUTTER,WHIPPED,W/ SALT",,,Y,,0,,6.38,,,
2,1003,100,"Butter oil, anhydrous","BUTTER OIL,ANHYDROUS",,,Y,,0,,6.38,4.27,8.79,3.87
3,1004,100,"Cheese, blue","CHEESE,BLUE",,,Y,,0,,6.38,4.27,8.79,3.87
4,1005,100,"Cheese, brick","CHEESE,BRICK",,,Y,,0,,6.38,4.27,8.79,3.87
5,1006,100,"Cheese, brie","CHEESE,BRIE",,,Y,,0,,6.38,4.27,8.79,3.87
6,1007,100,"Cheese, camembert","CHEESE,CAMEMBERT",,,Y,,0,,6.38,4.27,8.79,3.87
7,1008,100,"Cheese, caraway","CHEESE,CARAWAY",,,,,0,,6.38,4.27,8.79,3.87
8,1009,100,"Cheese, cheddar","CHEESE,CHEDDAR",,,Y,,0,,,,,
9,1010,100,"Cheese, cheshire","CHEESE,CHESHIRE",,,,,0,,6.38,4.27,8.79,3.87


- `nrows: int, default None` Number of rows of file to read. Useful for reading pieces of large files.

- `header: int or list of ints, default 'infer'` Row number(s) to use as the column names, and the start of the data. Default behavior is to infer the column names: if no names are passed the behavior is identical to header=0 and column names are inferred from the first line of the file, if column names are passed explicitly then the behavior is identical to header=None. Explicitly pass header=0 to be able to replace existing names.

- `encoding: str, default None` Encoding to use for UTF when reading/writing (e.g. 'utf-8'). [List of Python standard encodings](https://docs.python.org/3/library/codecs.html#standard-encodings).

- `quotechar: str (length 1)`: The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored.

#### Primer 3: MplsStops.csv

In [26]:
! head -n 3 ./data/mpls_stops.csv

Unnamed: 0,id Num,date,problem,MDC,citation Issued,person Search,vehicle Search,pre Race,race,gender,lat,long,police Precinct,neighborhood
,idNum,date,problem,MDC,citationIssued,personSearch,vehicleSearch,preRace,race,gender,lat,long,policePrecinct,neighborhood
6823.0,17-000003,2017-01-01 00:00:42,suspicious,MDC,,NO,NO,Unknown,Unknown,Unknown,44.96661711,-93.24645826,1,Cedar Riverside


In [27]:
# Preview only the first three rows. The file's second line repeats the column
# names in camelCase, so with default settings it is read as a data row.
mpls = pd.read_csv('data/mpls_stops.csv', nrows=3)
mpls

Unnamed: 0.1,Unnamed: 0,id Num,date,problem,MDC,citation Issued,person Search,vehicle Search,pre Race,race,gender,lat,long,police Precinct,neighborhood
0,,idNum,date,problem,MDC,citationIssued,personSearch,vehicleSearch,preRace,race,gender,lat,long,policePrecinct,neighborhood
1,6823.0,17-000003,2017-01-01 00:00:42,suspicious,MDC,,NO,NO,Unknown,Unknown,Unknown,44.96661711,-93.24645826,1,Cedar Riverside
2,6824.0,17-000007,2017-01-01 00:03:07,suspicious,MDC,,NO,NO,Unknown,Unknown,Male,44.98045,-93.27134,1,Downtown West


In [28]:
mpls.columns

Index(['Unnamed: 0', 'id Num', 'date', 'problem', 'MDC', 'citation Issued',
       'person Search', 'vehicle Search', 'pre Race', 'race', 'gender', 'lat',
       'long', 'police Precinct', 'neighborhood'],
      dtype='object')

In [29]:
# Column labels exactly as pandas inferred them from the first line of the file.
new_columns_names = [
    'Unnamed: 0',
    'id Num',
    'date',
    'problem',
    'MDC',
    'citation Issued',
    'person Search',
    'vehicle Search',
    'pre Race',
    'race',
    'gender',
    'lat',
    'long',
    'police Precinct',
    'neighborhood',
]

In [33]:
# Convert every label to snake_case, then give the unnamed first column a real name.
normalized = []
for raw_name in new_columns_names:
    normalized.append(raw_name.lower().replace(' ', '_'))
normalized[0] = 'case_number_id'
new_columns_names = normalized
print(new_columns_names)

['case_number_id', 'id_num', 'date', 'problem', 'mdc', 'citation_issued', 'person_search', 'vehicle_search', 'pre_race', 'race', 'gender', 'lat', 'long', 'police_precinct', 'neighborhood']


In [54]:
# Per-column types: low-cardinality text as category, YES/NO flags as float
# so that missing values can still be represented.
column_dtypes = {
    'mdc': 'category',
    'problem': 'category',
    'citation_issued': 'float',
    'person_search': 'float',
    'vehicle_search': 'float',
    'pre_race': 'category',
}

mpls = pd.read_csv(
    'data/mpls_stops.csv',
    engine='c',
    skiprows=2,                   # skip both header lines; names are supplied below
    names=new_columns_names,
    index_col='case_number_id',
    nrows=5,
    dtype=column_dtypes,
    true_values=['YES'],
    false_values=['NO'],
    na_values=['Unknown'],
    parse_dates=['date'],
)
mpls

Unnamed: 0_level_0,id_num,date,problem,mdc,citation_issued,person_search,vehicle_search,pre_race,race,gender,lat,long,police_precinct,neighborhood
case_number_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
6823.0,17-000003,2017-01-01 00:00:42,suspicious,MDC,,0.0,0.0,,,,44.966617,-93.246458,1,Cedar Riverside
6824.0,17-000007,2017-01-01 00:03:07,suspicious,MDC,,0.0,0.0,,,Male,44.98045,-93.27134,1,Downtown West
6825.0,17-000073,2017-01-01 00:23:15,traffic,MDC,,0.0,0.0,,White,Female,44.94835,-93.27538,5,Whittier
6826.0,17-000092,2017-01-01 00:33:48,suspicious,MDC,,0.0,0.0,,East African,Male,44.94836,-93.28135,5,Whittier
6827.0,17-000098,2017-01-01 00:37:58,traffic,MDC,,0.0,0.0,,White,Female,44.979078,-93.262076,1,Downtown West


In [55]:
mpls.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 5 entries, 6823.0 to 6827.0
Data columns (total 14 columns):
id_num             5 non-null object
date               5 non-null datetime64[ns]
problem            5 non-null category
mdc                5 non-null category
citation_issued    0 non-null float64
person_search      5 non-null float64
vehicle_search     5 non-null float64
pre_race           0 non-null category
race               3 non-null object
gender             4 non-null object
lat                5 non-null float64
long               5 non-null float64
police_precinct    5 non-null int64
neighborhood       5 non-null object
dtypes: category(3), datetime64[ns](1), float64(5), int64(1), object(4)
memory usage: 519.0+ bytes


- `names: array-like, default None` List of column names to use. If file contains no header row, then you should explicitly pass header=None. Duplicates in this list are not allowed.


- `skiprows: list-like or integer, default None` Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file.

- `engine: {'c', 'python'}` Parser engine to use. The C engine is faster while the Python engine is currently more feature-complete.

In [38]:
%timeit mpls = pd.read_csv('data/mpls_stops.csv', names=new_columns_names, skiprows=2, engine='python')

1.23 s ± 36.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%timeit mpls = pd.read_csv('data/mpls_stops.csv', names=new_columns_names, skiprows=2, engine='c')

341 ms ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


- `true_values: list, default None` Values to consider as True.
- `false_values: list, default None` Values to consider as False.

- `index_col: int, str, sequence of int / str, or False, default None` Column(s) to use as the row labels of the DataFrame, either given as string name or column index. If a sequence of int / str is given, a MultiIndex is used.

- `dtype: Type name or dict of column -> type, default None` Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (unsupported with engine='python'). Use str or object together with suitable na_values settings to preserve and not interpret dtype.

- `parse_dates: boolean or list of ints or names or list of lists or dict, default False.` 
    - If True -> try parsing the index.
    - If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.
    - If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column.
    - If {'foo': [1, 3]} -> parse columns 1, 3 as date and call result ‘foo’. A fast-path exists for iso8601-formatted dates.

- `date_parser: function, default None` Function to use for converting a sequence of string columns to an array of datetime instances. The default uses dateutil.parser.parser to do the conversion. pandas will try to call date_parser in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string values from the columns defined by parse_dates into a single array and pass that; and 3) call date_parser once for each row using one or more strings (corresponding to the columns defined by parse_dates) as arguments.

- `na_values: scalar, str, list-like, or dict, default None` Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. See na values const below for a list of the values interpreted as NaN by default.

#### Primer 4: iperf.txt

In [63]:
!head -n 20 data/iperf.txt

Wed Aug 15 19:35:11 CEST 2018
Connecting to host x.x.x.x, port 5201
[  4] local x.x.x.x port 48944 connected to x.x.x.x port 5201
[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
[  4]   0.00-1.00   sec   375 MBytes  3.14 Gbits/sec  273    471 KBytes
[  4]   1.00-2.00   sec   428 MBytes  3.59 Gbits/sec  145    376 KBytes
[  4]   2.00-3.00   sec   360 MBytes  3.02 Gbits/sec  148    454 KBytes
[  4]   3.00-4.00   sec   339 MBytes  2.84 Gbits/sec   83    407 KBytes
[  4]   4.00-5.00   sec   305 MBytes  2.56 Gbits/sec  104    414 KBytes
[  4]   5.00-6.00   sec   301 MBytes  2.53 Gbits/sec  186    440 KBytes
[  4]   6.00-7.00   sec   325 MBytes  2.73 Gbits/sec  174    485 KBytes
[  4]   7.00-8.00   sec   434 MBytes  3.64 Gbits/sec   81    677 KBytes
[  4]   8.00-9.00   sec   412 MBytes  3.46 Gbits/sec  226    537 KBytes
[  4]   9.00-10.00  sec   409 MBytes  3.43 Gbits/sec   47    372 KBytes
[  4]   10.00-11.00  sec   523 MBytes  3.81 Gbits/sec   96    422 KByte

Target columns for the cleaned data: `timestamp`, `transfer_mbytesec`, `bandwidth_gbitsec`, `retr`, `cwnd_kbytes`.

In [65]:
# Abandoned attempt, kept for reference. The separator was presumably meant to
# be the whitespace regex r'\s+' (plain 's+' splits on the letter "s" instead);
# the file is parsed by hand in the cells below.
#pd.read_csv('data/iperf.txt', sep=r'\s+', skiprows=4)

In [69]:
# Read the log line by line, trimming surrounding whitespace and the newline.
with open('data/iperf.txt', 'r') as f:
    data = [raw_line.strip() for raw_line in f]

In [70]:
print(data)

['Wed Aug 15 19:35:11 CEST 2018', 'Connecting to host x.x.x.x, port 5201', '[  4] local x.x.x.x port 48944 connected to x.x.x.x port 5201', '[ ID] Interval           Transfer     Bandwidth       Retr  Cwnd', '[  4]   0.00-1.00   sec   375 MBytes  3.14 Gbits/sec  273    471 KBytes', '[  4]   1.00-2.00   sec   428 MBytes  3.59 Gbits/sec  145    376 KBytes', '[  4]   2.00-3.00   sec   360 MBytes  3.02 Gbits/sec  148    454 KBytes', '[  4]   3.00-4.00   sec   339 MBytes  2.84 Gbits/sec   83    407 KBytes', '[  4]   4.00-5.00   sec   305 MBytes  2.56 Gbits/sec  104    414 KBytes', '[  4]   5.00-6.00   sec   301 MBytes  2.53 Gbits/sec  186    440 KBytes', '[  4]   6.00-7.00   sec   325 MBytes  2.73 Gbits/sec  174    485 KBytes', '[  4]   7.00-8.00   sec   434 MBytes  3.64 Gbits/sec   81    677 KBytes', '[  4]   8.00-9.00   sec   412 MBytes  3.46 Gbits/sec  226    537 KBytes', '[  4]   9.00-10.00  sec   409 MBytes  3.43 Gbits/sec   47    372 KBytes', '[  4]   10.00-11.00  sec   523 MBytes  3.

In [74]:
import datetime

# The first log line holds the wall-clock start of the run. "CEST" is matched
# as literal text, so the parsed value carries no timezone information.
START_FORMAT = '%a %b %d %H:%M:%S CEST %Y'
start_time = datetime.datetime.strptime(data[0], START_FORMAT)
print(start_time, type(start_time))

2018-08-15 19:35:11 <class 'datetime.datetime'>


In [100]:
def _parse_iperf_line(line, start_time):
    """Turn one iperf interval line into a row tuple.

    Parameters
    ----------
    line : str
        A line such as
        '[  4]   0.00-1.00   sec   375 MBytes  3.14 Gbits/sec  273    471 KBytes'.
    start_time : datetime.datetime
        Wall-clock time at which the measurement started.

    Returns
    -------
    tuple
        (timestamp, transfer_mbytesec, bandwidth_gbitsec, retr, cwnd_kbytes)

    Raises
    ------
    IndexError, ValueError
        If the line does not have the interval-line layout.
    """
    # Cut off the "[ ID]" prefix before tokenising. Splitting the whole line on
    # whitespace yields a different number of leading tokens for "[  4]" and
    # "[100]", which would shift every field index.
    fields = line.split(']', 1)[1].split()
    # fields: interval, 'sec', transfer, unit, bandwidth, unit, retr, cwnd, unit
    interval_start = int(fields[0].split('.')[0])  # whole seconds since start
    return (
        start_time + datetime.timedelta(seconds=interval_start),
        int(fields[2]),
        float(fields[4]),
        int(fields[6]),
        int(fields[7]),
    )


# The first four lines are the date, connection info and the column header.
rows = [_parse_iperf_line(line, start_time) for line in data[4:]]

print(rows)

[(datetime.datetime(2018, 8, 15, 19, 35, 11), 375, 3.14, 273, 471), (datetime.datetime(2018, 8, 15, 19, 35, 12), 428, 3.59, 145, 376), (datetime.datetime(2018, 8, 15, 19, 35, 13), 360, 3.02, 148, 454), (datetime.datetime(2018, 8, 15, 19, 35, 14), 339, 2.84, 83, 407), (datetime.datetime(2018, 8, 15, 19, 35, 15), 305, 2.56, 104, 414), (datetime.datetime(2018, 8, 15, 19, 35, 16), 301, 2.53, 186, 440), (datetime.datetime(2018, 8, 15, 19, 35, 17), 325, 2.73, 174, 485), (datetime.datetime(2018, 8, 15, 19, 35, 18), 434, 3.64, 81, 677), (datetime.datetime(2018, 8, 15, 19, 35, 19), 412, 3.46, 226, 537), (datetime.datetime(2018, 8, 15, 19, 35, 20), 409, 3.43, 47, 372), (datetime.datetime(2018, 8, 15, 19, 35, 21), 523, 3.81, 96, 422)]


In [85]:
import csv

In [101]:
headers = ['timestamp', 'transfer_mbytesec', 'bandwidth_gbitsec', 'retr', 'cwnd_kbytes']

# newline='' hands line-ending control to the csv module; without it the
# writer's '\r\n' is translated again on Windows and every row is followed by
# a blank line.
with open('data/iperf_clean.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)

In [103]:
!head -n 20 data/iperf_clean.csv

timestamp,transfer_mbytesec,bandwidth_gbitsec,retr,cwnd_kbytes
2018-08-15 19:35:11,375,3.14,273,471
2018-08-15 19:35:12,428,3.59,145,376
2018-08-15 19:35:13,360,3.02,148,454
2018-08-15 19:35:14,339,2.84,83,407
2018-08-15 19:35:15,305,2.56,104,414
2018-08-15 19:35:16,301,2.53,186,440
2018-08-15 19:35:17,325,2.73,174,485
2018-08-15 19:35:18,434,3.64,81,677
2018-08-15 19:35:19,412,3.46,226,537
2018-08-15 19:35:20,409,3.43,47,372
2018-08-15 19:35:21,523,3.81,96,422


In [105]:
# Load the cleaned file back, using the parsed timestamps as the index.
iperf_data = pd.read_csv(
    'data/iperf_clean.csv',
    index_col=['timestamp'],
    parse_dates=['timestamp'],
)
iperf_data

Unnamed: 0_level_0,transfer_mbytesec,bandwidth_gbitsec,retr,cwnd_kbytes
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08-15 19:35:11,375,3.14,273,471
2018-08-15 19:35:12,428,3.59,145,376
2018-08-15 19:35:13,360,3.02,148,454
2018-08-15 19:35:14,339,2.84,83,407
2018-08-15 19:35:15,305,2.56,104,414
2018-08-15 19:35:16,301,2.53,186,440
2018-08-15 19:35:17,325,2.73,174,485
2018-08-15 19:35:18,434,3.64,81,677
2018-08-15 19:35:19,412,3.46,226,537
2018-08-15 19:35:20,409,3.43,47,372


### Reading JSON files

In [13]:
# Sample object covering the value kinds JSON can represent:
# number, sequence, string, boolean, null and a nested object.
a = {
    "a": 25,
    "vrednosti": (1, 2, 5, 6, 9),
    "polje": "to je string",
    "resnica": True,
    "nivrednosti": None,
    "nek_ict": {'a': 1, 'd': 5},
}

In [14]:
a

{'a': 25,
 'vrednosti': (1, 2, 5, 6, 9),
 'polje': 'to je string',
 'resnica': True,
 'nivrednosti': None,
 'nek_ict': {'a': 1, 'd': 5}}

In [15]:
import json

In [18]:
json_str = json.dumps(a)

In [19]:
type(json_str)

str

In [20]:
json_str

'{"a": 25, "vrednosti": [1, 2, 5, 6, 9], "polje": "to je string", "resnica": true, "nivrednosti": null, "nek_ict": {"a": 1, "d": 5}}'

In [22]:
json.loads(json_str)

{'a': 25,
 'vrednosti': [1, 2, 5, 6, 9],
 'polje': 'to je string',
 'resnica': True,
 'nivrednosti': None,
 'nek_ict': {'a': 1, 'd': 5}}

In [1]:
import pandas as pd
import numpy as np

#### Orient options

In [23]:
# Small 3x3 frame with labelled rows, used to compare the to_json orient options.
dfjo = pd.DataFrame(
    {'A': range(1, 4), 'B': range(4, 7), 'C': range(7, 10)},
    columns=['A', 'B', 'C'],
    index=['x', 'y', 'z'],
)

In [24]:
dfjo

Unnamed: 0,A,B,C
x,1,4,7
y,2,5,8
z,3,6,9


<table class="colwidths-given table">
<colgroup>
<col style="width: 12%">
<col style="width: 88%">
</colgroup>
<tbody>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">split</span></code></p></td>
<td><p>dict like {index -&gt; [index], columns -&gt; [columns], data -&gt; [values]}</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">records</span></code></p></td>
<td><p>list like [{column -&gt; value}, … , {column -&gt; value}]</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">index</span></code></p></td>
<td><p>dict like {index -&gt; {column -&gt; value}}</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">columns</span></code></p></td>
<td><p>dict like {column -&gt; {index -&gt; value}}</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">values</span></code></p></td>
<td><p>just the values array</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">table</span></code></p></td>
<td><p>adhering to the JSON <a class="reference external" href="https://specs.frictionlessdata.io/json-table-schema/">Table Schema</a></p></td>
</tr>
</tbody>
</table>

In [28]:
# dict like {column -> {index -> value}}
dfjo.to_json(orient="columns")

'{"A":{"x":1,"y":2,"z":3},"B":{"x":4,"y":5,"z":6},"C":{"x":7,"y":8,"z":9}}'

In [27]:
#dict like {index -> {column -> value}}
dfjo.to_json(orient="index")

'{"x":{"A":1,"B":4,"C":7},"y":{"A":2,"B":5,"C":8},"z":{"A":3,"B":6,"C":9}}'

In [26]:
# list like [{column -> value}, … , {column -> value}]
dfjo.to_json(orient="records")

'[{"A":1,"B":4,"C":7},{"A":2,"B":5,"C":8},{"A":3,"B":6,"C":9}]'

In [29]:
# just the values array
dfjo.to_json(orient="values")

'[[1,4,7],[2,5,8],[3,6,9]]'

In [25]:
# dict like {index -> [index], columns -> [columns], data -> [values]}
dfjo.to_json(orient="split")

'{"columns":["A","B","C"],"index":["x","y","z"],"data":[[1,4,7],[2,5,8],[3,6,9]]}'

In [69]:
# adhering to the JSON Table Schema: a "schema" describing the fields plus the "data" records
dfjo.to_json(orient="table")

'{"schema": {"fields":[{"name":"index","type":"string"},{"name":"A","type":"integer"},{"name":"B","type":"integer"},{"name":"C","type":"integer"}],"primaryKey":["index"],"pandas_version":"0.20.0"}, "data": [{"index":"x","A":1,"B":4,"C":7},{"index":"y","A":2,"B":5,"C":8},{"index":"z","A":3,"B":6,"C":9}]}'

#### Primer: ocenas.json

In [30]:
! head -n 10 data/ocenas.json

{"description":{"title":"Global Land and Ocean Temperature Anomalies, January-December","units":"Degrees Celsius","base_period":"1901-2000","missing":-999},"data":{"1880":"-0.12","1881":"-0.09","1882":"-0.10","1883":"-0.18","1884":"-0.27","1885":"-0.25","1886":"-0.25","1887":"-0.29","1888":"-0.13","1889":"-0.09","1890":"-0.35","1891":"-0.26","1892":"-0.31","1893":"-0.33","1894":"-0.31","1895":"-0.24","1896":"-0.09","1897":"-0.10","1898":"-0.27","1899":"-0.15","1900":"-0.07","1901":"-0.15","1902":"-0.25","1903":"-0.37","1904":"-0.45","1905":"-0.27","1906":"-0.20","1907":"-0.38","1908":"-0.43","1909":"-0.44","1910":"-0.40","1911":"-0.44","1912":"-0.33","1913":"-0.32","1914":"-0.14","1915":"-0.09","1916":"-0.32","1917":"-0.40","1918":"-0.30","1919":"-0.25","1920":"-0.23","1921":"-0.16","1922":"-0.24","1923":"-0.25","1924":"-0.24","1925":"-0.18","1926":"-0.08","1927":"-0.17","1928":"-0.18","1929":"-0.33","1930":"-0.11","1931":"-0.06","1932":"-0.13","1933":"-0.26","1934":"-0.11","1935":"-0.

In [43]:
# The file's top-level keys ("description", "data") become columns and their
# inner keys become row labels. The valid orient name is 'columns'; the
# misspelled 'column' is not a recognised option.
oceans = (
    pd.read_json('data/ocenas.json', orient='columns')
    .drop(columns='description')
    # these rows exist only because of the keys inside "description"
    .drop(index=['title', 'units', 'base_period', 'missing'])
    .rename(columns={'data': 'temp_anomaly_celsius'})
    .rename_axis('year')
)
oceans.head()

Unnamed: 0_level_0,temp_anomaly_celsius
year,Unnamed: 1_level_1
1880,-0.12
1881,-0.09
1882,-0.1
1883,-0.18
1884,-0.27


#### Primer: temperatures.json

In [45]:
#!cat ./data/temperatures.json 

In [48]:
import json

with open('data/temperatures.json') as f:
    d = json.load(f)

In [52]:
#d.get('data')

In [53]:
temp_json = json.dumps(d['data'])

In [56]:
from io import StringIO

# Recent pandas deprecates passing a literal JSON string to read_json, so the
# text is wrapped in a file-like object. orient='index' makes each top-level
# key a row label.
temps = pd.read_json(StringIO(temp_json), orient='index')
temps.head(10)

Unnamed: 0,anomaly,value
189512,-1.68,50.34
189612,-0.03,51.99
189712,-0.46,51.56
189812,-0.59,51.43
189912,-1.01,51.01
190012,0.75,52.77
190112,-0.15,51.87
190212,-0.43,51.59
190312,-1.4,50.62
190412,-0.86,51.16


#### Primer: cities.json

In [57]:
!head -n 5 data/cities.json

[{"name":"Aachen","id":"1","nametype":"Valid","recclass":"L5","mass":"21","fall":"Fell","year":"1880-01-01T00:00:00.000","reclat":"50.775000","reclong":"6.083330","geolocation":{"type":"Point","coordinates":[6.08333,50.775]}}
,{"name":"Aarhus","id":"2","nametype":"Valid","recclass":"H6","mass":"720","fall":"Fell","year":"1951-01-01T00:00:00.000","reclat":"56.183330","reclong":"10.233330","geolocation":{"type":"Point","coordinates":[10.23333,56.18333]}}
,{"name":"Abee","id":"6","nametype":"Valid","recclass":"EH4","mass":"107000","fall":"Fell","year":"1952-01-01T00:00:00.000","reclat":"54.216670","reclong":"-113.000000","geolocation":{"type":"Point","coordinates":[-113,54.21667]}}
,{"name":"Acapulco","id":"10","nametype":"Valid","recclass":"Acapulcoite","mass":"1914","fall":"Fell","year":"1976-01-01T00:00:00.000","reclat":"16.883330","reclong":"-99.900000","geolocation":{"type":"Point","coordinates":[-99.9,16.88333]}}
,{"name":"Achiras","id":"370","nametype":"Valid","recclass":"L6","

In [60]:
#  porblem nestanega jsona
cities = pd.read_json('data/cities.json', orient='records')
cities.head(2)

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation,:@computed_region_cbhk_fwbd,:@computed_region_nnqa_25f4
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01T00:00:00.000,50.775,6.08333,"{'type': 'Point', 'coordinates': [6.08333, 50....",,
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01T00:00:00.000,56.18333,10.23333,"{'type': 'Point', 'coordinates': [10.23333, 56...",,


In [61]:
with open('data/cities.json') as f:
    d = json.load(f)

In [63]:
# json_normalize flattens nested records (here the `geolocation` object) into
# dotted top-level columns.
# From pandas 1.0 on it is available as pd.json_normalize; before 1.0 it was
# only importable from pandas.io.json (that import path is deprecated since 1.0).
# Bind whichever exists so the cell runs on either side of that change.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize

cities = json_normalize(d)

In [78]:
# Flatten the nested records into a DataFrame with dotted column names,
# using the public top-level entry point (pandas >= 1.0).
cities = pd.json_normalize(d)

In [79]:
# Split the coordinate list into two scalar columns, drop the original list
# column together with the two ':@computed_region_*' columns, and index the
# frame by name. Written as one chain that builds a new frame instead of
# mutating the existing one with inplace=True.
cities = (
    cities
    .assign(
        coordinate_x=cities['geolocation.coordinates'].str[0],
        coordinate_y=cities['geolocation.coordinates'].str[1],
    )
    .drop(columns=['geolocation.coordinates', ':@computed_region_cbhk_fwbd', ':@computed_region_nnqa_25f4'])
    .set_index('name')
)
cities

Unnamed: 0_level_0,id,nametype,recclass,mass,fall,year,reclat,reclong,geolocation.type,coordinate_x,coordinate_y
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Aachen,1,Valid,L5,21,Fell,1880-01-01T00:00:00.000,50.775000,6.083330,Point,6.08333,50.77500
Aarhus,2,Valid,H6,720,Fell,1951-01-01T00:00:00.000,56.183330,10.233330,Point,10.23333,56.18333
Abee,6,Valid,EH4,107000,Fell,1952-01-01T00:00:00.000,54.216670,-113.000000,Point,-113.00000,54.21667
Acapulco,10,Valid,Acapulcoite,1914,Fell,1976-01-01T00:00:00.000,16.883330,-99.900000,Point,-99.90000,16.88333
Achiras,370,Valid,L6,780,Fell,1902-01-01T00:00:00.000,-33.166670,-64.950000,Point,-64.95000,-33.16667
...,...,...,...,...,...,...,...,...,...,...,...
Tirupati,24009,Valid,H6,230,Fell,1934-01-01T00:00:00.000,13.633330,79.416670,Point,79.41667,13.63333
Tissint,54823,Valid,Martian (shergottite),7000,Fell,2011-01-01T00:00:00.000,29.481950,-7.611230,Point,-7.61123,29.48195
Tjabe,24011,Valid,H6,20000,Fell,1869-01-01T00:00:00.000,-7.083330,111.533330,Point,111.53333,-7.08333
Tjerebon,24012,Valid,L5,16500,Fell,1922-01-01T00:00:00.000,-6.666670,106.583330,Point,106.58333,-6.66667


In [85]:
# Sample GeoJSON FeatureCollection with three features: a Point, a LineString
# and a Polygon. The Polygon's 'prop1' property is itself a nested mapping.
geojsondict = dict(
    type="FeatureCollection",
    features=[
        dict(
            type="Feature",
            geometry=dict(type="Point", coordinates=[102.0, 0.5]),
            properties=dict(prop0="value0"),
        ),
        dict(
            type="Feature",
            geometry=dict(
                type="LineString",
                coordinates=[[102.0, 0.0], [103.0, 1.0], [104.0, 0.0], [105.0, 1.0]],
            ),
            properties=dict(prop0="value0", prop1=0.0),
        ),
        dict(
            type="Feature",
            geometry=dict(
                type="Polygon",
                coordinates=[
                    [[100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0]],
                ],
            ),
            properties=dict(prop0="value0", prop1=dict(this="that")),
        ),
    ],
)

#### Primer: transactions.json

In [97]:
# Peek at the first 40 lines of the raw file (shell command run through IPython's `!`).
!head -n 40 data/transactions.json

{
"txs":[

{
   "lock_time":0,
   "ver":1,
   "size":373,
   "inputs":[
      {
         "sequence":4294967295,
         "witness":"",
         "prev_out":{
            "spent":true,
            "spending_outpoints":[
               {
                  "tx_index":0,
                  "n":0
               }
            ],
            "tx_index":0,
            "type":0,
            "addr":"1JhGpz2ZiQrwudq7VaFZXxej2NZzMLgcsq",
            "value":1353992,
            "n":0,
            "script":"76a914c217fd1f3d8bfb182d8d9e01ef024ee3b76a9c3a88ac"
         },
         "script":"483045022100a0b9171e0c645048641a78533440fcfac20eea27cc7144d07912d07792ace49e022036466f99f9604db9f57f46dd69821c8744ad29c80ecdbfa94cf745620bc97fbe012102643b9cbfe6cdbd791f9360dea62bfb9a58a3127a4434c08ee9d9528ad8c393f4"
      },
      {
         "sequence":4294967295,
         "witness":"",
         "prev_out":{
            "spent":true,
            "spending_outpoints":[
              

In [98]:
# Parse the whole document into Python objects. JSON is UTF-8 text; naming
# the encoding keeps the read independent of the platform's default locale.
with open('data/transactions.json', encoding='utf-8') as f:
    data = json.load(f)

In [103]:
#data

In [102]:
# One row per transaction output: record_path walks into each transaction's
# 'out' list, while meta copies the parent transaction's 'time' and 'hash'
# onto every resulting row. Uses the public pd.json_normalize (pandas >= 1.0).
pd.json_normalize(data['txs'], record_path=['out'], meta=['time', 'hash'])

Unnamed: 0,spent,tx_index,type,addr,value,n,script,time,hash
0,False,0,0,1H7r57SXAwaKs3Tf5ugbkRNxwfh9YaxC5b,7541,0,76a914b0cd787a7a879ac0a5277b0013ec7b11c145055d...,1586376721,0f06714015f334626a168ee3e0aa5e0d3866a33dad504b...
1,False,0,0,1BPULhbGfrojrknyD7aZYMtRVUu38Cn75j,1364400,1,76a91471f13b222426eb80b47d2413d21a8904ec1966b2...,1586376721,0f06714015f334626a168ee3e0aa5e0d3866a33dad504b...
2,False,0,0,1LQ6YURobx4EGZRp8bdEDHup6T56o5NGKN,3127836,0,76a914d4c895721d3a8cd74bb3ccbb699a3dbe342c0807...,1586376722,3684072a50d7389933210d7adf4f98640d3d53c8cb245e...
3,False,0,0,1HSLVVSSQmzaNG8sbakhFDrmpzUPZLnYCe,30036732,1,76a914b44cae99837337275d21d2c5c6ed6cddf7a7e9f7...,1586376722,3684072a50d7389933210d7adf4f98640d3d53c8cb245e...
4,False,0,0,3Lb2MJWbBE88BUHf6tAw8ZzhkR6H2cYRhR,206183,0,a914cf48401e3cf81080352f281ea859ccabd51a821487,1586376721,3d3cc141654170060a7e298a9e5298557970e8cd0051ab...
...,...,...,...,...,...,...,...,...,...
103,False,0,0,,2867,1,001454e66e303423a961a7412bb8e21c6b4816d9b75c,1586376707,eeb7b6b99f5e70bca4c8f4175bb08ced8a90472455a540...
104,False,0,0,3L7wHJ9pMv4SxBDjZgz37STS9Eayp2mio2,73490,0,a914ca28c9bd4f589cb4d8fd85cc952ffac6062c754687,1586376707,282dba81a2a0a80f71550aff8f38b64b95cb5b12bc2ac6...
105,False,0,0,3Fda8xgadEmxm9v3zsRVmkfUqiuS2GJ39T,748183,1,a91498eafcf39be707e9f61260d59683da3b00bf095287,1586376707,282dba81a2a0a80f71550aff8f38b64b95cb5b12bc2ac6...
106,False,0,0,14k1yuqbSetT18Eqke1EmxExLvJEGkDN9a,750000,0,76a914290b1531ea309af7e35633aa6c7a6970d24eb87e...,1586376708,a6882f479e9f176f63e5187ca6ab5384da40ebc78a305d...


#### Primer: all_hour_geo.json

In [107]:
#!head -5 data/all_hour_geo.json

In [124]:
# Parse the whole document into Python objects. JSON is UTF-8 text; naming
# the encoding keeps the read independent of the platform's default locale.
with open('data/all_hour_geo.json', encoding='utf-8') as f:
    data = json.load(f)

In [125]:
# Keep only the 'properties' entry of every item under 'features'.
data_parsed = [feature['properties'] for feature in data['features']]

In [128]:
# Build a DataFrame with one row per extracted 'properties' dict, using the
# public top-level entry point (pandas >= 1.0), then preview three rows.
hour_geo = pd.json_normalize(data_parsed)
hour_geo.head(3)

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
0,0.69,"16km ESE of Anza, CA",1586352802900,1586353032308,-480,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/earthquakes/feed/v...,,,,...,",ci39143639,",",ci,",",geoserve,nearby-cities,origin,phase-data,scit...",12.0,0.05468,0.14,98.0,ml,earthquake,"M 0.7 - 16km ESE of Anza, CA"
1,2.34,"7km ENE of Pahala, Hawaii",1586352794640,1586353127910,-600,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/earthquakes/feed/v...,,,,...,",hv71464377,",",hv,",",geoserve,origin,phase-data,",49.0,0.02127,0.13,136.0,ml,earthquake,"M 2.3 - 7km ENE of Pahala, Hawaii"
2,0.85,"15km ESE of Anza, CA",1586352704490,1586352926133,-480,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/earthquakes/feed/v...,,,,...,",ci39143631,",",ci,",",geoserve,nearby-cities,origin,phase-data,scit...",30.0,0.04884,0.14,55.0,ml,earthquake,"M 0.9 - 15km ESE of Anza, CA"


#### Primer: rates.json

In [129]:
# Load the JSON document into a Python object. JSON is UTF-8 text; naming
# the encoding keeps the read independent of the platform's default locale.
with open('data/rates.json', encoding='utf-8') as f:
    d = json.load(f)

In [131]:
#d

### Python Pickle Format

In [132]:
# Prepare a DataFrame to be written out in pickle format: read only the
# listed columns and use PassengerId as the row index.
titanic = pd.read_csv(
    'data/titanic_sub.csv',
    usecols=['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked'],
    index_col='PassengerId',
)

In [133]:
# Preview the first rows of the frame read from CSV.
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.0,7.25,,S
2,1,1,female,38.0,71.2833,C85,C
3,1,3,female,26.0,7.925,,S
4,1,1,female,35.0,53.1,C123,S
5,0,3,male,35.0,8.05,,S


In [134]:
# Summarize column dtypes, non-null counts and memory usage before pickling.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(2), object(3)
memory usage: 55.7+ KB


In [137]:
# Serialize the DataFrame to a pickle file on disk.
titanic.to_pickle('data/titanic_sub.pkl')

In [138]:
# Load the DataFrame back from the pickle file.
# NOTE: unpickling can execute arbitrary code; only read pickle files from trusted sources.
titanic_read = pd.read_pickle('data/titanic_sub.pkl')

In [139]:
# Inspect the frame that was read back from the pickle (not the original),
# so the dtypes and non-null counts confirm the round trip.
titanic_read.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(2), object(3)
memory usage: 55.7+ KB


In [140]:
# Preview the rows of the frame read back from the pickle (not the original)
# to confirm the data survived the round trip.
titanic_read.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.0,7.25,,S
2,1,1,female,38.0,71.2833,C85,C
3,1,3,female,26.0,7.925,,S
4,1,1,female,35.0,53.1,C123,S
5,0,3,male,35.0,8.05,,S


### Excel files

In [141]:
# Path of the Excel workbook used in the cells below.
file = 'data/battledeath.xlsx'

In [142]:
# Open the workbook as an ExcelFile so its sheets can be listed and parsed individually.
# NOTE(review): this handle is not closed in the visible cells.
xls = pd.ExcelFile(file)

In [143]:
# List the names of the sheets contained in the workbook.
xls.sheet_names

['2002', '2004']

Supported file formats for `pd.read_excel`: xls, xlsx, xlsm, xlsb, and odf.

In [144]:
# Parse the sheet named '2002' from the already opened workbook.
df_2002 = pd.read_excel(xls, sheet_name='2002')

In [146]:
# Preview the first rows of the '2002' sheet as parsed with default options.
df_2002.head()

Unnamed: 0,"War, age-adjusted mortality due to",2002
0,Afghanistan,36.08399
1,Albania,0.128908
2,Algeria,18.31412
3,Andorra,0.0
4,Angola,18.96456


In [156]:
# Open the workbook in a context manager so it is closed when the block exits,
# and parse both sheets from that handle rather than from the outer `xls`.
with pd.ExcelFile(file) as exel_file:
    # Replace the sheet's header labels with explicit names and index by country.
    df_2002 = pd.read_excel(exel_file, '2002', names=['country', 'war_2002'], index_col='country')
    df_2004 = pd.read_excel(exel_file, '2004')

In [157]:
# Preview the '2002' sheet after renaming the columns and indexing by country.
df_2002.head()

Unnamed: 0_level_0,war_2002
country,Unnamed: 1_level_1
Afghanistan,36.08399
Albania,0.128908
Algeria,18.31412
Andorra,0.0
Angola,18.96456


In [158]:
# Preview the '2004' sheet, parsed with its original header labels.
df_2004.head()

Unnamed: 0,War(country),2004
0,Afghanistan,9.451028
1,Albania,0.130354
2,Algeria,3.407277
3,Andorra,0.0
4,Angola,2.597931
