## Protester/Participant Type IDs
### Create standardized categories for type of protesters

#### Standard Imports & File Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [212]:
# Import original data from Mass Mobilization Project:
mass = pd.read_csv('./source/mmALL_073120_csv.csv')

#### File Cleaning and Preparation

In [4]:
# Keep only the rows flagged as actual protests (protest == 1). After filtering
# the flag is the same on every row, so the column is dropped in the same step.
mass = mass.loc[mass['protest'].eq(1)].drop(columns=['protest'])

In [5]:
mass.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15239 entries, 0 to 17141
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     15239 non-null  int64  
 1   country                15239 non-null  object 
 2   ccode                  15239 non-null  int64  
 3   year                   15239 non-null  int64  
 4   region                 15239 non-null  object 
 5   protestnumber          15239 non-null  int64  
 6   startday               15239 non-null  float64
 7   startmonth             15239 non-null  float64
 8   startyear              15239 non-null  float64
 9   endday                 15239 non-null  float64
 10  endmonth               15239 non-null  float64
 11  endyear                15239 non-null  float64
 12  protesterviolence      15239 non-null  float64
 13  location               15218 non-null  object 
 14  participants_category  9887 non-null   object 
 15  pa

In [213]:
# Explore protesteridentity field before categorization:
mass['protesteridentity'].head(15)

0                           unspecified
1                           unspecified
2            separatist parti quebecois
3                        mohawk indians
4                       local residents
5                        mohawk indians
6     public service alliance of canada
7     public service alliance of canada
8       gangs of black and white youths
9                           unspecified
10            environmental activitists
11            gun owners and supporters
12                             students
13                          unspecified
14                     chippewa indians
Name: protesteridentity, dtype: object

In [8]:
# Look at the most frequently occurring protesteridentity values to identify trends and groups:
mass['protesteridentity'].value_counts(dropna=False).head(50)

protesters                                                                               1541
students                                                                                  646
NaN                                                                                       555
workers                                                                                   273
unspecified                                                                               224
farmers                                                                                   219
residents                                                                                 171
opposition supporters                                                                     165
university students                                                                       144
demonstrators                                                                             137
opposition                                                  

In [10]:
# Explore correlation to other fields in data:
mass[['country','id','participants','protesteridentity']].sort_values('id').head(50)

Unnamed: 0,country,id,participants,protesteridentity
0,Canada,201990001,1000s,unspecified
1,Canada,201990002,1000,unspecified
2,Canada,201990003,500,separatist parti quebecois
3,Canada,201990004,100s,mohawk indians
4,Canada,201990005,950,local residents
5,Canada,201990006,200,mohawk indians
6,Canada,201991001,110000,public service alliance of canada
7,Canada,201991002,110000,public service alliance of canada
8,Canada,201992001,1000,gangs of black and white youths
9,Canada,201993001,10000s,unspecified


#### Develop Dictionary & Functions for Category Development

In [175]:
# Keyword lookup tables used to assign each protest a standardized protester category.
#
# protester_category_dict:
#   key   = protester_id_type category
#   value = list of keywords taken from protesteridentity
# get_protest_group_category walks this dictionary in insertion order and returns
# the first category that has a keyword occurring as a substring of the text, so
# the ORDER OF THE KEYS IS A PRIORITY ORDER (e.g. text containing both 'student'
# and 'worker' is labelled 'students_youth'). Reordering keys changes results.
# Matching is plain substring matching, so short keywords such as 'wing', 'cult'
# or 'jew' also match inside longer words.
protester_category_dict = {'students_youth': ['student', 'youth', 'young', 'university', 'college', 'school'],
                      'workers_unions': ['worker', 
                                           'employee', 
                                           'union', 
                                           'labor', 
                                           'labour',
                                           'fishermen', 
                                           'miners', 
                                           'farmer', 
                                           'teacher', 
                                           'driver', 
                                           'journalist', 
                                           'doctors',
                                           'staff',
                                           'vendor',
                                           'businessmen', 
                                           'owner', 
                                           'commerce', 
                                           'trader', 
                                           'stuttgart 21',  # NOTE(review): looks like a specific campaign name rather than an occupation — confirm it belongs here
                                           'civil servant',
                                           'public service',
                                           'nurse',
                                           'scientist',
                                           'grower',
                                           'artist',
                                           'architech'],  # NOTE(review): as spelled this does not match 'architect' — confirm against the data
                      'ethnic_group': ['indian',  
                                       'kurd', 
                                       'ethnic', 
                                       'native', 
                                       'aboriginal', 
                                       'hazaras', 
                                       'sunni', 
                                       'shiite', 
                                       'azerbaijani', 
                                       'soviet',
                                       'tribe', 
                                       'indigenous'],
                      'soldiers_veterans': ['soldier', 'veteran', 'military'],
                      'political_group': ['communist', 
                                          'democracy', 
                                          'democratic', 
                                          'leftist', 
                                          'wing', 
                                          'loyalist', 
                                          'separatist', 
                                          'party', 
                                          'nationalist', 
                                          'politician',
                                          'pegida', 
                                          'secessionist', 
                                          'awami league',
                                          'independence',
                                          'marcha verde',
                                          'sandinista',
                                          'hizbullah',
                                          'politician',  # duplicate of the 'politician' entry earlier in this list; has no effect on matching
                                          'election'],
                      'religious_group': ['orthodox', 
                                          'priest', 
                                          'faith', 
                                          'clergymen', 
                                          'clergy', 
                                          'evangelicals', 
                                          'muslim', 
                                          'church', 
                                          'christian', 
                                          'jew', 
                                          'cult', 
                                          'buddhist', 
                                          'monk', 
                                          'catholic', 
                                          'religious', 
                                          'nuns'],
                      'women': ['women', 'ladies', 'housewives', 'wives'],
                      'civil_human_rights': ['civil rights', 'human rights', 'black lives matter'],
                      'locals_residents': ['local', 'resident', 'peasant', 'people', 'civilian', 'villager', 'citizen'],
                      'prisoners': ['prisoner', 'prinsoners', 'inmates'],  # 'prinsoners' presumably catches a misspelling in the source data — TODO confirm
# Disabled category, kept for reference (NOTE(review): delete if it is not coming back):
#                      'Police': ['police'],
                      'victims_families': ['victim'],
                      'pensioners_retirees': ['pensioner', 'retiree', 'former', 'elderly'],
                      # Generic catch-all terms; listed last so every more specific category above wins first.
                      'protestors_generic': ['protester', 
                                               'protestor', 
                                               'demonstrator', 
                                               'opponent', 
                                               'opposition', 
                                               'activist', 
                                               'supporter', 
                                               'rights',
                                               'rioters']
                      }

# protester_demand_dict:
#   key   = protester_id_type category (a subset of the categories above)
#   value = list of protesterdemand1 values that map to that category
# Used by get_protest_demand_category as a fallback for protests whose identity
# and notes text produced no category.
protester_demand_dict = {'workers_unions': ['land farm issue',
                                                 'labor wage dispute',
                                                 'price increases, tax policy'],
                            'political_group': ['political behavior, process',
                                                'removal of politician'],
                            'civil_human_rights': ['social restrictions', 
                                                     'police brutality']
                           }

In [71]:
# Treat missing protester identities as "unspecified", a label the column already uses:
mass['protesteridentity'] = mass['protesteridentity'].fillna('unspecified')

In [176]:
# Function using protester_category_dict to classify protester groups:

def get_protest_group_category(protest_group_description, category_keywords=None):
    """Return the first protester category whose keyword occurs in the text.

    Categories are tried in the insertion order of the keyword table, and a
    keyword matches when it is a (case-sensitive) substring of the text, so
    the table's key order acts as a priority order.

    Parameters
    ----------
    protest_group_description : str
        Free text describing the protesters (e.g. a protesteridentity value
        or a notes entry).
    category_keywords : dict[str, list[str]], optional
        Mapping of category name -> keywords. Defaults to the module-level
        ``protester_category_dict``.

    Returns
    -------
    str
        The matching category name, or 'Other/Unspecified' when no keyword
        matches or the input is not a string.
    """
    # NaN / None / numeric values cannot be substring-searched; report them as
    # unmatched instead of raising TypeError.
    if not isinstance(protest_group_description, str):
        return 'Other/Unspecified'

    if category_keywords is None:
        category_keywords = protester_category_dict

    for key_category, keywords in category_keywords.items():
        if any(keyword in protest_group_description for keyword in keywords):
            return key_category
    return 'Other/Unspecified'  # no matching category was found

In [177]:
# Function using protester_demand_dict to classify protesterdemand1, for protesteridentities not classified in above function:

def get_protest_demand_category(protestdemand1, demand_keywords=None):
    """Return the first protester category whose demand phrase occurs in the text.

    Categories are tried in the insertion order of the demand table, and a
    phrase matches when it is a (case-sensitive) substring of the text.

    Parameters
    ----------
    protestdemand1 : str
        The protesterdemand1 value of a protest.
    demand_keywords : dict[str, list[str]], optional
        Mapping of category name -> demand phrases. Defaults to the
        module-level ``protester_demand_dict``.

    Returns
    -------
    str
        The matching category name, or 'Other/Unspecified' when no phrase
        matches or the input is not a string.
    """
    # NaN / None / numeric values cannot be substring-searched; report them as
    # unmatched instead of raising TypeError.
    if not isinstance(protestdemand1, str):
        return 'Other/Unspecified'

    if demand_keywords is None:
        demand_keywords = protester_demand_dict

    for key_category, phrases in demand_keywords.items():
        if any(phrase in protestdemand1 for phrase in phrases):
            return key_category
    return 'Other/Unspecified'  # no matching category was found

#### Execute functions to create protestor categories based on dictionary

In [178]:
# Classify every protest from its protesteridentity text, falling back to the
# free-text notes only when the identity yields no category.

X = list(zip(mass['protesteridentity'].astype('str'), mass['notes'].astype('str')))

protester_id_group_1 = []
for pid, note in X:
    # Classify the identity once and reuse the result for both the check and the append.
    category = get_protest_group_category(pid)
    if category == 'Other/Unspecified':
        category = get_protest_group_category(note)
    protester_id_group_1.append(category)
    

In [179]:
# Create temporary protester id grouping column to deal with Other/Unspecified: 
mass['protester_id_type_1'] = protester_id_group_1

In [180]:
# Second pass: rows already categorized keep their label; rows still marked
# 'Other/Unspecified' are categorized from protesterdemand1 instead.

Y = list(zip(mass['protesterdemand1'].astype('str'), mass['protester_id_type_1'].astype('str')))

protester_id_group = []
for demand, pid in Y:
    protester_id_group.append(
        get_protest_demand_category(demand) if pid == 'Other/Unspecified' else pid
    )


In [181]:
# Confirm number of new protester ids match length of dataframe:
len(protester_id_group)

15239

In [182]:
# Create new "final" column for protester id grouping:
mass['protester_id_type'] = protester_id_group

In [183]:
# Check value counts for new category created above. 
mass['protester_id_type'].value_counts(dropna=False)

protestors_generic     3791
workers_unions         3578
political_group        2309
students_youth         2227
locals_residents       1470
ethnic_group            478
religious_group         375
women                   367
soldiers_veterans       290
civil_human_rights      120
pensioners_retirees     104
prisoners                98
victims_families         32
Name: protester_id_type, dtype: int64

In [184]:
# Drop the temp column now that we have final protester_id_type column:
mass.drop(columns=['protester_id_type_1'], inplace=True)

#### Export files for further cleansing and feature creation

In [193]:
# Export csv to data folder for rest of team to use.
# NOTE(review): no `index=` argument is passed, so pandas' default applies and the
# row index is written as an extra, unnamed first column — confirm that downstream
# readers expect it (e.g. read with index_col=0).
mass.to_csv('./data/01_mass_EDA_protestor_category.csv')

In [203]:
# Export pickle to data folder for rest of team to use:
mass.to_pickle('./data/01_mass_EDA_protestor_category.pickle')