# Python training for data engineers
## 07 Data Enriching

### XML

In [5]:
import pandas as pd
# Load the XML-derived frame; the file name suggests it was saved by notebook 05.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
xml_pickle_path = 'xml_dataframe_notebook_05.pickle'
xmldf = pd.read_pickle(xml_pickle_path)

In [9]:
# Show the loaded frame as the cell's rich output.
xmldf

Unnamed: 0,filename,package_name,python_version,release_type,size,uploaded_on,unit,size_in_bytes
0,watson_machine_learning_client-1.0.83-py3-none...,watson-machine-learning-client,py3,python wheel,552,2018-04-10,KB,565248
1,watson_machine_learning_client-1.0.83.tar.gz ...,watson-machine-learning-client,,source,211,2018-04-10,KB,216064
2,azure-mgmt-machinelearningcompute-0.4.0.zip ...,azure-mgmt-machinelearningcompute,,source,50,2018-01-02,KB,51200
3,azure_mgmt_machinelearningcompute-0.4.0-py2.py...,azure-mgmt-machinelearningcompute,py2.py3,python wheel,38,2018-01-02,KB,38912
4,machineLearningStanford-0.0.tar.gz (md5),machineLearningStanford,,source,3,2015-07-08,KB,3072
5,pylearning-3.2.2b1-py3-none-any.whl (md5),pylearning,py3,python wheel,19,2017-07-02,KB,19456
6,Etherscan Magic for Machine Learning and Bash-...,Etherscan-Magic-for-Machine-Learning-and-Bash,,source,4,2018-04-07,KB,4096
7,oolearning-0.1.19.tar.gz (md5),oolearning,,source,94,2018-03-15,KB,96256
8,Augmentor-0.2.0-py2.py3-none-any.whl (md5),Augmentor,py2.py3,python wheel,34,2018-03-01,KB,34816
9,Augmentor-0.2.0.tar.gz (md5),Augmentor,,source,36,2018-03-01,KB,36864


Count the packages per Python version and release type (the grouped result is a Series):

In [31]:
# Count the packages for every (python_version, release_type) combination.
group_keys = ['python_version', 'release_type']
grouped = xmldf.groupby(group_keys)
subdf = grouped['package_name'].count()
# Selecting a single column before aggregating yields a Series, not a DataFrame.
print(type(subdf))
subdf

<class 'pandas.core.series.Series'>


python_version                                       release_type        
                                                     source                  1064
2.4                                                  python egg                 1
2.6                                                  python egg                 1
2.7                                                  ms windows installer       2
                                                     python egg                23
                                                     python wheel              31
3.2                                                  python egg                 1
3.4                                                  python egg                 5
                                                     python wheel               5
3.5                                                  ms windows installer       3
                                                     python egg                 7
                        

Convert the Series to a Dataframe:

In [32]:
# Promote the Series to a one-column DataFrame (the column keeps the Series name) ...
count_frame = subdf.to_frame()
# ... and rename that column so it describes what the values are.
subdf = count_frame.rename(columns={'package_name': 'count'})
subdf

Unnamed: 0_level_0,Unnamed: 1_level_0,count
python_version,release_type,Unnamed: 2_level_1
,source,1064
2.4,python egg,1
2.6,python egg,1
2.7,ms windows installer,2
2.7,python egg,23
2.7,python wheel,31
3.2,python egg,1
3.4,python egg,5
3.4,python wheel,5
3.5,ms windows installer,3


Create columns from the index:

In [33]:
# Move both index levels (python_version, release_type) back into ordinary
# columns; drop=False (the default) keeps their values instead of discarding them.
subdf = subdf.reset_index(drop=False)
subdf

Unnamed: 0,python_version,release_type,count
0,,source,1064
1,2.4,python egg,1
2,2.6,python egg,1
3,2.7,ms windows installer,2
4,2.7,python egg,23
5,2.7,python wheel,31
6,3.2,python egg,1
7,3.4,python egg,5
8,3.4,python wheel,5
9,3.5,ms windows installer,3


## Pivoting
Pivot the table to get release type per Python version

In [34]:
# Reshape from long to wide: one row per Python version, one column per
# release type, each cell holding the package count for that combination.
pivot_spec = {
    'index': 'python_version',
    'columns': 'release_type',
    'values': 'count',
}
subdf = subdf.pivot(**pivot_spec)
subdf

release_type,"""dumb"" binary",ms windows installer,python egg,python wheel,source
python_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,,,,,1064.0
2.4,,,1.0,,
2.6,,,1.0,,
2.7,,2.0,23.0,31.0,
3.2,,,1.0,,
3.4,,,5.0,5.0,
3.5,,3.0,7.0,11.0,
3.6,,1.0,8.0,23.0,
any,1.0,1.0,,,
cp27,,,,70.0,


Fill the NaNs

In [35]:
# Combinations that never occurred came out of the pivot as NaN; treat them as 0.
subdf = subdf.fillna(value=0)
subdf

release_type,"""dumb"" binary",ms windows installer,python egg,python wheel,source
python_version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,0.0,0.0,0.0,0.0,1064.0
2.4,0.0,0.0,1.0,0.0,0.0
2.6,0.0,0.0,1.0,0.0,0.0
2.7,0.0,2.0,23.0,31.0,0.0
3.2,0.0,0.0,1.0,0.0,0.0
3.4,0.0,0.0,5.0,5.0,0.0
3.5,0.0,3.0,7.0,11.0,0.0
3.6,0.0,1.0,8.0,23.0,0.0
any,1.0,1.0,0.0,0.0,0.0
cp27,0.0,0.0,0.0,70.0,0.0


### API

In [13]:
# Load the API-derived frame; the file name suggests it was saved by notebook 05.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
json_pickle_path = 'json_dataframe_notebook_05.pickle'
jsondf = pd.read_pickle(json_pickle_path)

Calculate the maximum of each column and append the result as an extra row labelled `rowMax`:

In [14]:
# Compute the maximum of every column and store the results as one extra row
# labelled "rowMax" (despite the label, each value is a per-column maximum).
max_row = {col: jsondf[col].max() for col in jsondf}
max_df = pd.DataFrame(max_row, index=["rowMax"])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to add rows. Dropping any existing "rowMax" row first
# keeps the cell idempotent, so re-running it cannot stack duplicate rows.
jsondf = pd.concat([jsondf.drop(index="rowMax", errors="ignore"), max_df])
jsondf

Unnamed: 0,.htaccess,.net,2d,3d,actionscript-3,aggregate,ajax,alembic,algorithm,amazon-web-services,...,xgboost,xml,xml-parsing,xml-serialization,xpath,xquery,xsd,xslt,xslt-1.0,xslt-2.0
algorithm,0.0,679.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arrays,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4945.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beautifulsoup,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,299.0,61.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0
c++,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8935.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
class,0.0,615.0,0.0,0.0,542.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
csv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,961.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dataframe,0.0,0.0,0.0,0.0,0.0,258.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
datetime,0.0,1707.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dictionary,0.0,880.0,0.0,0.0,0.0,0.0,0.0,0.0,449.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
django,0.0,0.0,0.0,0.0,0.0,0.0,3184.0,0.0,0.0,924.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Select the 50 columns with the largest maxima and trim the dataframe to those columns.

In [15]:
# Rank the columns by their entry in the "rowMax" row and keep the 50 largest.
row_max = jsondf.loc['rowMax']
topColumns = row_max.nlargest(50)
# Restrict the frame to exactly those columns, in ranking order.
topdf = jsondf.loc[:, topColumns.index]

In [16]:
# The helper "rowMax" row was only needed to rank the columns; remove it.
# errors="ignore" keeps the cell safe to re-run once the row is already gone
# (a plain drop raises KeyError on the second execution).
topdf = topdf.drop(index="rowMax", errors="ignore")

In [17]:
# Show the trimmed frame (top columns only, helper row removed) as the cell output.
topdf

Unnamed: 0,jquery,css,html,javascript,php,angularjs,sql,python,django,ajax,...,django-models,dom,angular,xslt,google-maps,windows,flask,shell,tkinter,pointers
algorithm,0.0,0.0,0.0,3011.0,1928.0,0.0,0.0,6460.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arrays,11114.0,0.0,4149.0,43978.0,58047.0,3426.0,1964.0,15445.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7482.0
beautifulsoup,0.0,98.0,2163.0,216.0,0.0,0.0,0.0,10390.0,123.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c++,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6646.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,17182.0,0.0,0.0,0.0,16863.0
class,1764.0,1097.0,1101.0,2736.0,6239.0,0.0,0.0,7647.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,950.0
csv,599.0,0.0,757.0,2306.0,5744.0,0.0,1483.0,15700.0,429.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,479.0,0.0,0.0
dataframe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14471.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
datetime,867.0,0.0,0.0,3106.0,5912.0,300.0,3419.0,6549.0,803.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dictionary,386.0,0.0,0.0,1656.0,0.0,0.0,0.0,21792.0,454.0,0.0,...,0.0,0.0,0.0,0.0,508.0,0.0,0.0,0.0,0.0,0.0
django,4447.0,1407.0,4258.0,5439.0,0.0,1112.0,1698.0,86426.0,0.0,3184.0,...,18447.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Persist the trimmed frame to disk under the notebook-07 file name.
output_pickle_path = 'json_dataframe_notebook_07.pickle'
topdf.to_pickle(output_pickle_path)