# Python training for data engineers
## 07 Data Enriching

### Goal
Manipulating a dataframe to create a new one
* `reset_index`
* `pivot`

### XML

In [1]:
import pandas as pd
# Load the XML-derived dataframe from its pickle file
# (the file name suggests it was written by notebook 05).
xml_pickle_path = 'xml_dataframe_notebook_05.pickle'
xmldf = pd.read_pickle(xml_pickle_path)

In [2]:
# Show the loaded dataframe; as the last expression of the cell it is rendered by the notebook.
xmldf

Unnamed: 0,file_type,filename_size_hash,package_name,python_version,uploaded_on,filename,size,unit,size_in_bytes
0,wheel,scikit_learn-0.19.1-cp27-cp27m-macosx_10_6_int...,scikit-learn,cp27,2017-10-23,scikit_learn-0.19.1-cp27-cp27m-macosx_10_6_int...,0,MB,0
1,wheel,scikit_learn-0.19.1-cp27-cp27m-manylinux1_i686...,scikit-learn,cp27,2017-10-23,scikit_learn-0.19.1-cp27-cp27m-manylinux1_i686...,4,MB,4194304
2,wheel,scikit_learn-0.19.1-cp27-cp27m-manylinux1_x86_...,scikit-learn,cp27,2017-10-23,scikit_learn-0.19.1-cp27-cp27m-manylinux1_x86_...,2,MB,2097152
3,wheel,scikit_learn-0.19.1-cp27-cp27mu-manylinux1_i68...,scikit-learn,cp27,2017-10-23,scikit_learn-0.19.1-cp27-cp27mu-manylinux1_i68...,4,MB,4194304
4,wheel,scikit_learn-0.19.1-cp27-cp27mu-manylinux1_x86...,scikit-learn,cp27,2017-10-23,scikit_learn-0.19.1-cp27-cp27mu-manylinux1_x86...,2,MB,2097152
5,wheel,scikit_learn-0.19.1-cp27-cp27m-win32.whl ...,scikit-learn,cp27,2017-10-23,scikit_learn-0.19.1-cp27-cp27m-win32.whl ...,1,MB,1048576
6,wheel,scikit_learn-0.19.1-cp27-cp27m-win_amd64.whl ...,scikit-learn,cp27,2017-10-23,scikit_learn-0.19.1-cp27-cp27m-win_amd64.whl ...,5,MB,5242880
7,wheel,scikit_learn-0.19.1-cp34-cp34m-macosx_10_6_int...,scikit-learn,cp34,2017-10-23,scikit_learn-0.19.1-cp34-cp34m-macosx_10_6_int...,5,MB,5242880
8,wheel,scikit_learn-0.19.1-cp34-cp34m-manylinux1_i686...,scikit-learn,cp34,2017-10-23,scikit_learn-0.19.1-cp34-cp34m-manylinux1_i686...,6,MB,6291456
9,wheel,scikit_learn-0.19.1-cp34-cp34m-manylinux1_x86_...,scikit-learn,cp34,2017-10-23,scikit_learn-0.19.1-cp34-cp34m-manylinux1_x86_...,4,MB,4194304


Count the packages per Python version and file type (the result of the grouping is a Series, not yet a dataframe):

In [3]:
# Count the non-null package names in every (python_version, file_type) group.
group_keys = ['python_version', 'file_type']
grouped = xmldf.groupby(group_keys)
subdf = grouped['package_name'].count()

# Selecting a single column before aggregating yields a Series, not a DataFrame.
print(type(subdf))
subdf

<class 'pandas.core.series.Series'>


python_version  file_type        
2.7             wheel                 1
                windows installer     2
3.5             wheel                 1
                windows installer     1
None            source                1
cp27            wheel                21
cp33            wheel                 5
cp34            wheel                10
cp35            wheel                15
cp36            wheel                14
py2.py3         wheel                 4
py3             wheel                 1
Name: package_name, dtype: int64

Convert the Series to a Dataframe:

In [4]:
# Turn the Series into a one-column DataFrame, then give that column a
# name that describes its content (it holds counts, not package names).
count_frame = subdf.to_frame()
subdf = count_frame.rename(columns={'package_name': 'count'})
subdf

Unnamed: 0_level_0,Unnamed: 1_level_0,count
python_version,file_type,Unnamed: 2_level_1
2.7,wheel,1
2.7,windows installer,2
3.5,wheel,1
3.5,windows installer,1
,source,1
cp27,wheel,21
cp33,wheel,5
cp34,wheel,10
cp35,wheel,15
cp36,wheel,14


Create columns from the index:

In [5]:
# Move both index levels (python_version, file_type) back into regular
# columns; drop=False (the default) keeps their values instead of discarding them.
subdf = subdf.reset_index(drop=False)
subdf

Unnamed: 0,python_version,file_type,count
0,2.7,wheel,1
1,2.7,windows installer,2
2,3.5,wheel,1
3,3.5,windows installer,1
4,,source,1
5,cp27,wheel,21
6,cp33,wheel,5
7,cp34,wheel,10
8,cp35,wheel,15
9,cp36,wheel,14


#### Pivoting
Pivot the table to get _release type per Python version_

In [6]:
# Reshape from long to wide: one row per Python version, one column per
# file type, cells holding the count. Combinations that do not occur become NaN.
subdf = subdf.pivot(
    index='python_version',
    columns='file_type',
    values='count',
)
subdf

file_type,source,wheel,windows installer
python_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.7,,1.0,2.0
3.5,,1.0,1.0
,1.0,,
cp27,,21.0,
cp33,,5.0,
cp34,,10.0,
cp35,,15.0,
cp36,,14.0,
py2.py3,,4.0,
py3,,1.0,


Fill the NaNs

In [7]:
# A missing (python_version, file_type) combination means "no such release",
# so replace every NaN with a count of zero.
subdf = subdf.fillna(value=0)
subdf

file_type,source,wheel,windows installer
python_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.7,0.0,1.0,2.0
3.5,0.0,1.0,1.0
,1.0,0.0,0.0
cp27,0.0,21.0,0.0
cp33,0.0,5.0,0.0
cp34,0.0,10.0,0.0
cp35,0.0,15.0,0.0
cp36,0.0,14.0,0.0
py2.py3,0.0,4.0,0.0
py3,0.0,1.0,0.0


### API

In [8]:
# Load the API/JSON-derived dataframe from its pickle file
# (the file name suggests it was written by notebook 05).
json_pickle_path = 'json_dataframe_notebook_05.pickle'
jsondf = pd.read_pickle(json_pickle_path)

Calculate the maximum of each column and add it to the dataframe as an extra row labelled `rowMax`:

In [9]:
# Add a summary row labelled "rowMax" that holds the maximum of every column
# (despite its name, the row contains per-column maxima).

# Drop a "rowMax" row left over from a previous run of this cell, so that
# re-running does not stack duplicate summary rows (a duplicated label would
# make the later jsondf.loc['rowMax'] return a DataFrame instead of a Series).
jsondf = jsondf.drop("rowMax", errors="ignore")

max_row = {col: jsondf[col].max() for col in jsondf}
max_df = pd.DataFrame(max_row, index=["rowMax"])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to add rows.
jsondf = pd.concat([jsondf, max_df])
jsondf

Unnamed: 0,.htaccess,.net,2d,3d,actionscript-3,aggregate,ajax,alembic,algorithm,amazon-web-services,...,xgboost,xml,xml-parsing,xml-serialization,xpath,xquery,xsd,xslt,xslt-1.0,xslt-2.0
algorithm,0.0,678.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arrays,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4962.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beautifulsoup,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,305.0,61.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0
c++,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8948.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
class,0.0,618.0,0.0,0.0,543.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,966.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dataframe,0.0,0.0,0.0,0.0,0.0,261.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
datetime,0.0,1708.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dictionary,0.0,885.0,0.0,0.0,0.0,0.0,0.0,0.0,450.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
django,0.0,0.0,0.0,0.0,0.0,0.0,3209.0,0.0,0.0,939.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Select the 50 columns with the largest maximum value and trim the dataframe to those columns.

In [10]:
# The 'rowMax' row holds one maximum per column; keep the 50 largest of them.
column_maxima = jsondf.loc['rowMax']
topColumns = column_maxima.nlargest(50)

# The index of the resulting Series is the list of column names to keep,
# already ordered from largest to smallest maximum.
topdf = jsondf[topColumns.index]

In [11]:
# The helper row has served its purpose (ranking the columns); remove it
# so that only real data rows remain.
topdf = topdf.drop(labels=['rowMax'], axis=0)

In [12]:
# Show the trimmed dataframe: the 50 selected columns, without the 'rowMax' helper row.
topdf

Unnamed: 0,jquery,css,html,javascript,php,angularjs,sql,django,python,ajax,...,angular,django-models,dom,xslt,google-maps,windows,flask,shell,tkinter,pointers
algorithm,0.0,0.0,0.0,3038.0,1928.0,0.0,0.0,0.0,6488.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arrays,11147.0,0.0,4178.0,44269.0,58235.0,3431.0,1970.0,0.0,15563.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7510.0
beautifulsoup,0.0,98.0,2172.0,217.0,0.0,0.0,0.0,123.0,10477.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c++,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6668.0,0.0,...,0.0,0.0,0.0,0.0,0.0,17264.0,0.0,0.0,0.0,16916.0
class,1769.0,1104.0,1104.0,2764.0,6235.0,0.0,0.0,0.0,7689.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,949.0
csv,599.0,0.0,766.0,2317.0,5761.0,0.0,1494.0,432.0,15832.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,478.0,0.0,0.0
dataframe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14821.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
datetime,869.0,0.0,0.0,3124.0,5927.0,303.0,3428.0,808.0,6612.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dictionary,386.0,0.0,0.0,1669.0,0.0,0.0,0.0,460.0,21991.0,0.0,...,0.0,0.0,0.0,0.0,510.0,0.0,0.0,0.0,0.0,0.0
django,4460.0,1415.0,4296.0,5459.0,0.0,1113.0,1704.0,0.0,86864.0,3209.0,...,0.0,18578.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Persist the trimmed dataframe to a pickle file so it can be loaded elsewhere.
output_pickle_path = 'json_dataframe_notebook_07.pickle'
topdf.to_pickle(output_pickle_path)