In [1]:
import pandas as pd
import numpy as np
import openpyxl

Prepare workbook for preprocessing

In [2]:
# Open the raw Excel workbook in read-only mode
wb = openpyxl.load_workbook(filename='../data/mobility/raw_data/Güterverkehr der Binnenschifffahrt_monatlich.xlsx', 
                   read_only=True)
try:
    ws = wb['Güterverkehr der Binnenschifffa']

    # Read the cell values of the range A10:K359 into a list of lists (one inner list per row)
    data_rows = [[cell.value for cell in row] for row in ws['A10':'K359']]
finally:
    # A read-only workbook keeps the underlying file open until it is closed explicitly
    wb.close()

df = pd.DataFrame(data_rows)

# calculate a total value of transport performance [mil. t*km]:
# the sum of columns 3, 5, 7 and 9 is written to column 10
df[10] = df[3] + df[5] + df[7] + df[9]

# drop the columns that are no longer needed (2 to 9); columns 0, 1 and 10 remain
df = df.drop(np.arange(2,10), axis=1)

# rename the three remaining columns
df.columns = ['year', 'month', 'M_InlandShippingTotalTransportPerformance']
print(df.head())

   year    month  M_InlandShippingTotalTransportPerformance
0  1991   Januar                                       4815
1  1991  Februar                                       3335
2  1991     März                                       5370
3  1991    April                                       5183
4  1991      Mai                                       5306


Mapping dictionary for month name and number

In [3]:
# Lookup table from the German month names found in the 'month' column to month numbers
d = {'Januar':1, 'Februar':2, 'März':3, 'April':4, 'Mai':5, 'Juni':6, 'Juli':7, 'August':8, 
     'September':9, 'Oktober':10, 'November':11, 'Dezember':12, }
month_numbers = df.month.map(d)

# Series.map yields NaN for values missing from the dictionary; stop here with a
# clear message instead of carrying NaN months into the following steps
unmapped = df.month[month_numbers.isna()]
if not unmapped.empty:
    raise ValueError('Unknown month names: {}'.format(sorted(set(map(str, unmapped)))))

df.month = month_numbers
print(df.head())

   year  month  M_InlandShippingTotalTransportPerformance
0  1991      1                                       4815
1  1991      2                                       3335
2  1991      3                                       5370
3  1991      4                                       5183
4  1991      5                                       5306


Combine year and month into a single date column

In [4]:
#change datatype of year and month to change 2019.0 to 2019
df.year = df.year.astype('int16')
df.month = df.month.astype('int16')

#new column in format: yyyy-month as datatype string
df['date'] = df.year.astype('str') + '-' + df.month.astype('str')

#drop old columns of year and month
df = df.drop(['year', 'month'], axis=1)

#show df.head()
print(df.head())

   M_InlandShippingTotalTransportPerformance    date
0                                       4815  1991-1
1                                       3335  1991-2
2                                       5370  1991-3
3                                       5183  1991-4
4                                       5306  1991-5


Use the date column as index

In [5]:
#change of datatype of column date to datetime with monthly period
df.date = pd.to_datetime(df.date).dt.to_period('m')

#set index of dataframe to date
df = df.set_index('date')

#show df.head()
print(df.head())

         M_InlandShippingTotalTransportPerformance
date                                              
1991-01                                       4815
1991-02                                       3335
1991-03                                       5370
1991-04                                       5183
1991-05                                       5306


Test plot

In [6]:
# Quick visual check: line plot of the transport performance column over the index
df['M_InlandShippingTotalTransportPerformance'].plot()

<matplotlib.axes._subplots.AxesSubplot at 0x228ace06c08>

Save dataframe in CSV format

In [7]:
# Write the dataframe, including its index, to a CSV file
df.to_csv(path_or_buf='../data/mobility/Shipping_InlandFreightTransport.csv')