In [14]:
import pandas as pd

In [15]:
# Read the Stata file into a DataFrame; the trailing expression displays
# its column labels so the available fields are visible in the cell output.
df = pd.read_stata("rwe_data.dta")
df.columns

Index(['caseID', 'date', 'state', 'state1', 'state2', 'location', 'location1',
       'location2', 'location3', 'location4', 'lon1', 'lat1', 'lon2', 'lat2',
       'lon3', 'lat3', 'lon4', 'lat4', 'district', 'participants', 'organizer',
       'classification', 'ideology', 'motto', 'source'],
      dtype='object')

In [16]:
def convert_state_abbreviations(data_series):
    """Replace German federal-state codes with the full state names.

    Parameters
    ----------
    data_series : pandas.Series
        Series holding state codes such as ``'NW'`` or ``'RP/SL'``.

    Returns
    -------
    pandas.Series
        Series of the same length in which every value that is a key of
        ``state_map`` is replaced by the corresponding full name. Values
        that are not a key (unknown codes, names that were already
        converted, missing values) are passed through unchanged.
    """
    state_map = {
        'NW': 'Nordrhein-Westfalen',
        'TH': 'Thüringen',
        'SN': 'Sachsen',
        'BY': 'Bayern',
        'BE': 'Berlin',
        'ST': 'Sachsen-Anhalt',
        'BB': 'Brandenburg',
        'MV': 'Mecklenburg-Vorpommern',
        'RP': 'Rheinland-Pfalz',
        'BW': 'Baden-Württemberg',
        'NI': 'Niedersachsen',
        'HE': 'Hessen',
        'SH': 'Schleswig-Holstein',
        'HH': 'Hamburg',
        'SL': 'Saarland',
        'HB': 'Bremen',
        'RP/SL': 'Rheinland-Pfalz/Saarland'
    }

    # Mapping with the bare dict turns every value without an entry into NaN:
    # an unexpected code would vanish silently, and re-running the cell on
    # already-converted names would blank the whole column. Falling back to
    # the input value keeps the conversion lossless and idempotent.
    return data_series.map(lambda code: state_map.get(code, code))

# Swap the coded state column for full state names, then show how often each
# name occurs as the cell's output.
df['state'] = convert_state_abbreviations(df['state'])
df['state'].value_counts()

Nordrhein-Westfalen         512
Thüringen                   449
Sachsen                     422
Bayern                      354
Berlin                      329
Sachsen-Anhalt              301
Brandenburg                 222
Mecklenburg-Vorpommern      182
Rheinland-Pfalz             145
Baden-Württemberg           106
Niedersachsen               101
Hessen                       52
Schleswig-Holstein           49
Hamburg                      34
Saarland                     23
Bremen                        7
Rheinland-Pfalz/Saarland      2
Name: state, dtype: int64

In [17]:
# Ensure the date column is datetime-typed so the .dt accessor can be used.
df['date'] = pd.to_datetime(df['date'])

# Calendar year of each row; first grouping key below.
df['syear'] = df['date'].dt.year

# One row per (year, state): summed participants and the number of non-null
# caseIDs. observed=False is passed explicitly: if `state` is a categorical
# column (read_stata can produce these), relying on the implicit default is
# deprecated in recent pandas and a future default of True would silently
# drop year/state combinations that have no rows. For non-categorical keys
# the argument has no effect.
result = df.groupby(['syear', 'state'], observed=False).agg(
    rwe_participants=pd.NamedAgg(column='participants', aggfunc='sum'),
    num_observations=pd.NamedAgg(column='caseID', aggfunc='count')
).reset_index()

print(result)


     syear                     state  total_participants  num_observations
0     2005            Sachsen-Anhalt              3865.0                19
1     2005       Nordrhein-Westfalen              3236.0                30
2     2005                    Berlin              5255.0                17
3     2005               Brandenburg              2570.0                 8
4     2005         Baden-Württemberg              1280.0                 9
..     ...                       ...                 ...               ...
267   2020  Rheinland-Pfalz/Saarland                 0.0                 0
268   2020        Schleswig-Holstein                 0.0                 0
269   2020                  Saarland                 0.0                 0
270   2020                   Sachsen              8615.0                15
271   2020                 Thüringen                30.0                 1

[272 rows x 4 columns]


In [18]:
# Persist the aggregated table as a Parquet file. The target name ends in
# ".gzip", so gzip compression is requested explicitly; without it pandas
# applies its default codec and the extension would misdescribe the file.
# Parquet readers detect the codec from the file, so the path stays the same.
result.to_parquet("rwe.gzip", compression="gzip")