# Mass Movement Data
In this notebook we take the mass movement data, slice it down to the rows we need, create the necessary columns, and extract the data required to compare it with the world happiness data.

In [1]:
#import the dependencies
import pandas as pd
import numpy as np

# Part 1
* reading in the data
* making the data slices
* generating the needed columns

## Read in the data

In [2]:
#read the raw mass movement csv into a data frame
mm_df = pd.read_csv(filepath_or_buffer='Mass_Movement_Data/mmALL_073119_csv.csv')

#preview the first three rows
mm_df.head(n=3)

Unnamed: 0,id,country,ccode,year,region,protest,protestnumber,startday,startmonth,startyear,...,protesterdemand4,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7,sources,notes
0,201990001,Canada,20,1990,North America,1,1,15.0,1.0,1990.0,...,,ignore,,,,,,,1. Great Canadian train journeys into history;...,Canada s railway passenger system was finally...
1,201990002,Canada,20,1990,North America,1,2,25.0,6.0,1990.0,...,,ignore,,,,,,,1. Autonomy s Cry Revived in Quebec The New Yo...,protestors were only identified as young peop...
2,201990003,Canada,20,1990,North America,1,3,1.0,7.0,1990.0,...,,ignore,,,,,,,1. Quebec protest after Queen calls for unity ...,"THE Queen, after calling on Canadians to rema..."


In [3]:
#keeping only the protests whose start year is greater than 2015 and less than 2019
years_df = mm_df[mm_df['startyear'].gt(2015) & mm_df['startyear'].lt(2019)]

#listing the distinct regions present in that slice
years_df['region'].unique()

array(['North America', 'Central America', 'South America', 'Europe',
       'Africa', 'MENA', 'Asia', 'Oceania'], dtype=object)

In [4]:
#restricting the data to the three regions that we are looking at
regions_df = years_df[years_df['region'].isin(['South America', 'Europe', 'Asia'])]

#collecting the distinct countries found in each of those regions
south_america, europe, asia = (
    regions_df.loc[regions_df['region'].eq(region_name), 'country'].unique()
    for region_name in ('South America', 'Europe', 'Asia')
)

#renumbering the rows from zero and discarding the old index
regions_df = regions_df.reset_index(drop=True)

#showing the column names
regions_df.columns

Index(['id', 'country', 'ccode', 'year', 'region', 'protest', 'protestnumber',
       'startday', 'startmonth', 'startyear', 'endday', 'endmonth', 'endyear',
       'protesterviolence', 'location', 'participants_category',
       'participants', 'protesteridentity', 'protesterdemand1',
       'protesterdemand2', 'protesterdemand3', 'protesterdemand4',
       'stateresponse1', 'stateresponse2', 'stateresponse3', 'stateresponse4',
       'stateresponse5', 'stateresponse6', 'stateresponse7', 'sources',
       'notes'],
      dtype='object')

In [5]:
#creating a new data frame with only the needed columns
#(.copy() makes it an independent frame, so the column assignments below never target a slice of regions_df)
clean_columns_df = regions_df.loc[:, ['country', 'year', 'region', 'startday', 'startmonth', 'startyear',
                                      'endday', 'endmonth', 'endyear', 'location', 'participants_category',
                                      'participants', 'protesterdemand1', 'stateresponse1']].copy()

#changing the date parts to integers to get rid of the trailing zero (e.g. 15.0 -> 15)
#note: astype(int) raises if any of these columns contains a missing value
clean_columns_df = clean_columns_df.astype(
    dict.fromkeys(['startday', 'startmonth', 'startyear', 'endday', 'endmonth', 'endyear'], int)
)

#joining the year, month and day into 'year-month-day' strings for the start and the end of each protest
clean_columns_df['start'] = (clean_columns_df['startyear'].astype(str) + '-'
                             + clean_columns_df['startmonth'].astype(str) + '-'
                             + clean_columns_df['startday'].astype(str))
clean_columns_df['end'] = (clean_columns_df['endyear'].astype(str) + '-'
                           + clean_columns_df['endmonth'].astype(str) + '-'
                           + clean_columns_df['endday'].astype(str))

#showing the column names, including the two new ones
clean_columns_df.columns

Index(['country', 'year', 'region', 'startday', 'startmonth', 'startyear',
       'endday', 'endmonth', 'endyear', 'location', 'participants_category',
       'participants', 'protesterdemand1', 'stateresponse1', 'start', 'end'],
      dtype='object')

In [6]:
#converting the 'year-month-day' strings to timestamps with one vectorized call per column
#(the explicit format avoids per-row format guessing)
clean_columns_df['start'] = pd.to_datetime(clean_columns_df['start'], format='%Y-%m-%d')
clean_columns_df['end'] = pd.to_datetime(clean_columns_df['end'], format='%Y-%m-%d')

#getting the duration of the protest (end minus start, so a same-day protest is 0 days)
clean_columns_df['protest_duration'] = clean_columns_df['end'] - clean_columns_df['start']

#keeping only the columns needed from here on, in their final order
clean_df = clean_columns_df.loc[:, ['country', 'year', 'start', 'end', 'protest_duration', 'region', 'location',
                                    'participants_category', 'protesterdemand1', 'stateresponse1']]

In [7]:
#counting how many protests each country has in the cleaned data
countries = clean_df['country'].value_counts()

#turning the counts into a two column frame; the index and the value column are named explicitly
#so the result is the same whether value_counts() labels its output 'country' (older pandas)
#or 'count' (pandas 2.0 and later)
countries_df = countries.rename_axis('country').reset_index(name='total protests')

#attaching the per-country total to every protest row; a left join keeps the rows in their
#original order, and every country in countries_df comes from clean_df so no rows are lost
merge_df = pd.merge(clean_df, countries_df, how='left', on='country')

#keeping only the countries with at least five protests
greater_then_five = merge_df.loc[merge_df['total protests'] >= 5]
greater_then_five.head()

Unnamed: 0,country,year,start,end,protest_duration,region,location,participants_category,protesterdemand1,stateresponse1,total protests
0,Colombia,2016,2016-10-05,2016-10-17,12 days,South America,"Plaza de Bol var & Casa de Nari o, Bogot",>10000,"political behavior, process",ignore,6
1,Colombia,2017,2017-01-22,2017-01-22,0 days,South America,Bogot,100-999,"political behavior, process",crowd dispersal,6
2,Colombia,2017,2017-04-01,2017-04-15,14 days,South America,Bogota,1000-1999,"political behavior, process",ignore,6
3,Colombia,2017,2017-10-05,2017-10-05,0 days,South America,Tumaco,50-99,land farm issue,crowd dispersal,6
4,Colombia,2018,2018-01-26,2018-01-26,0 days,South America,Bogota,2000-4999,social restrictions,crowd dispersal,6


In [8]:
#collecting the distinct countries left in each region after the protest count filter
south_america, europe, asia = (
    greater_then_five.loc[greater_then_five['region'].eq(region_name), 'country'].unique()
    for region_name in ('South America', 'Europe', 'Asia')
)

#printing one array per line
print(south_america, europe, asia, sep='\n')

['Colombia' 'Venezuela' 'Peru' 'Brazil' 'Bolivia' 'Chile' 'Argentina']
['United Kingdom' 'Ireland' 'Belgium' 'France' 'Spain' 'Portugal'
 'Germany' 'Poland' 'Hungary' 'Czech Republic' 'Slovak Republic' 'Italy'
 'Albania' 'Kosovo' 'Serbia' 'Macedonia' 'Croatia' 'Bosnia' 'Greece'
 'Cyprus' 'Bulgaria' 'Moldova' 'Romania' 'Russia' 'Latvia' 'Belarus'
 'Armenia' 'Georgia' 'Finland' 'Sweden']
['Afghanistan' 'China' 'Taiwan' 'South Korea' 'India' 'Pakistan'
 'Bangladesh' 'Myanmar' 'Nepal' 'Thailand' 'Cambodia' 'Malaysia'
 'Philippines' 'Indonesia']


In [9]:
#saving the filtered data without the index column; the folder name uses the same spelling as the
#folder the raw csv is read from ('Mass_Movement_Data') so the path also resolves on case-sensitive file systems
greater_then_five.to_csv('Mass_Movement_Data/movement_data.csv', index=False)