# Initial imports and declarations
Will be expanded later to include a bevy of imports for various processing, data exploration, and modelling.

In [1]:
#core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#post-modelling metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

In [2]:
df = pd.read_csv("protests.csv")

---
# Data Structures 
Includes function declarations, lists, dictionaries, etc. that are used later in the program.

In [3]:
#Response and demands were dummified due to how the data was entered.  The dummies were then combined into a single response or demand column that had a binary value.
response_drops = [
    '1_accomodation', '1_arrests', '1_beatings', '1_crowd dispersal', '1_ignore', '1_killings', '1_shootings',
    '2_accomodation', '2_arrests', '2_beatings', '2_crowd dispersal', '2_ignore', '2_killings', '2_shootings', 
    '3_accomodation', '3_arrests', '3_beatings', '3_crowd dispersal', '3_ignore', '3_killings', '3_shootings', 
    '4_accomodation', '4_arrests', '4_beatings', '4_crowd dispersal', '4_killings', '4_shootings', 
    '5_.', '5_accomodation', '5_arrests', '5_beatings', '5_crowd dispersal', '5_killings', '5_shootings', 
    '6_accomodation', '6_arrests', '6_beatings', '6_crowd dispersal', '6_killings', 
    '7_.', '7_accomodation', '7_arrests', '7_beatings', '7_killings'
]

demand_drops = [
    'demand1_labor wage dispute', 'demand1_land farm issue', 'demand1_police brutality', 'demand1_political behavior, process', 'demand1_price increases, tax policy', 'demand1_removal of politician', 'demand1_social restrictions', 
    'demand2_labor wage dispute', 'demand2_land farm issue', 'demand2_police brutality', 'demand2_political behavior, process', 'demand2_price increases, tax policy', 'demand2_removal of politician', 'demand2_social restrictions', 
    'demand3_labor wage dispute', 'demand3_land farm issue', 'demand3_police brutality', 'demand3_political behavior, process', 'demand3_price increases, tax policy', 'demand3_removal of politician', 'demand3_social restrictions', 
    'demand4_.', 'demand4_labor wage dispute', 'demand4_land farm issue', 'demand4_police brutality', 'demand4_political behavior, process', 'demand4_price increases, tax policy', 'demand4_removal of politician'
]

#After the length of the protest in days was determined, the particulars about when it started or ended were dropped.  
#The year of occurence was maintained in another column.
time_drops = ['startday', 'startmonth', 'startyear', 'endday', 'endmonth', 'endyear']

other_drops = [
    'id', #Not useful to prediction.
    'ccode', #Not useful to prediction.
    'protest', #All values are 1.  Is this dataset the subset of another?
    'protestnumber', # of protests per country might be useful but not in the context of incremental numbers that it's being given
    'location', #Not extremely useable given how it's already being broken by region.
    'participants_category', #Too many null values to be of great value.
]

#Demands & response are the aggregated columns for their respective types
demands = ['protesterdemand1', 'protesterdemand2', 'protesterdemand3', 'protesterdemand4']

response = ["stateresponse1", "stateresponse2", "stateresponse3", "stateresponse4", "stateresponse5", "stateresponse6", "stateresponse7"]

#These will be the new engineered target columns for state responses, on a binary value.
targets = ['y_accomodation', 'y_arrests', 'y_beatings', 'y_crowd dispersal', 'y_ignore', 'y_killings', 'y_shootings']

In [4]:
#These four functions are text analysis for the Participants feature so that it can be converted to numeric.  
#This covered around 99% of cases in the dataframe.
#First, the text parser immediately converts a number to numeric if the string matches expectations.
#Second, strip_chars tries to parse numeric input that has special characters in it.  It attempts to convert it to an integer but if that fails then it passes back a string.
#Third, if a number is hyphenated ("100-1000") then it parses those as two separate numbers and averages them.
#Finally, if it does all of the above and the entry can't be converted to numeric, it's converted to NaN and dropped.


def parse_texts(x):
    x = x.lower()
    
    if x == "dozens":
        return 50
    elif x == "hundreds":
        return 500
    elif x == "thousands":
        return 5000
    elif x == "tens of thousands":
        return 50000
    elif "hundreds of thousands" in x:
        return 250000
    elif "millions" in x:
        return 2000000
    elif "million" in x:
        return 1000000
    
    
    elif "about " in x:
        return x[6:]
    elif "more than " in x:
        return x[10:]
    
    
    elif "several" in x:
        if "dozen" in x:
            return 50
        elif "hundred" in x:
            return 500
        elif "thousand" in x:
            return 5000
    
    
    elif "hundreds" in x:
        return 500
    elif "thousands" in x:
        return 5000
    
    else:
        return x
    
    
def strip_chars(x):
    banned_chars = "+s><,"
    x = "".join([c for c in x if c not in banned_chars])
    
    try:
        x = int(x)
    finally:
        return x


    
def avg_hyphen(x):
    accepted_chars = "1234567890-"
    ind = 0

    x = "".join([c for c in x if c in accepted_chars])
    
    for i in range(len(x)):
        if x[i] == "-":
            ind = i
    
    lower = x[:ind]
    upper = x[ind+1:]
    
    if (lower == "") or (upper==""):
        return np.nan
    
    return (int(lower) + int(upper)) /2
    
    
    
def map_participants(x):
    while type(x) == str:
        x = parse_texts(x)
        if type(x) == str:
            x = strip_chars(x)
        if type(x) == str:
            x = avg_hyphen(x)
        if type(x) == str:
            x = np.nan
    return x

---
# Data Cleaning
Contains blocks of code for known cleaning problems derived from any previous data exploration.

In [5]:
#General/Miscellaneous Cleaning

df.dropna(subset=["notes"], inplace=True) #If there are no notes, then we will not be able to predict the outcome very well.
df.dropna(subset=["participants"], inplace=True) #Participants had very few NaN values
df.dropna(subset=["sources"], inplace=True) #Sources had very few NaN values


#Miscellaneous useless feature cleaning.  See the list declaration [other_drops] in DATA STRUCTURES for additional information.
df.drop(columns=other_drops, inplace=True)


#For the 500 or so values containing NaN in protestor identity:
df.fillna(value={"protesteridentity":"unspecified"}, inplace=True)

In [6]:
#For fixing the time values such that a length of time (in days) for the protest is established as a feature, and other time features are dropped.
#Critically, the year the protest initially occured is retained in another column.

month_days = {1:0, 2:31, 3:59, 4:90, 5:120, 6:151, 7:181, 8:212, 9:243, 10:273, 11:304, 12:334}
df["protest_length"] = 0

for i in range(len(df)):
    yearday_start = month_days[df["startmonth"].iloc[i]] + df["startday"].iloc[i]
    yearday_end = month_days[df["endmonth"].iloc[i]] + df["endday"].iloc[i]
    
    difference = (yearday_end - yearday_start) + (365 * (df["endyear"].iloc[i] - df["startyear"].iloc[i]))
    
    if difference != 0:
        df["protest_length"].iloc[i] = difference
    else:
        df["protest_length"].iloc[i] = 1 #accounts for same-day protests


#Now that the length is obtained, the additional time columns can be dropped.
df.drop(columns=time_drops, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [7]:
#For fixing the Participants feature such that we have a numerical value.
#For more information, see the function map_participants() in DATA STRUCTURES.
df["participants"] = df["participants"].map(map_participants)

df.dropna(subset=["participants"], inplace=True) #150 null values remain

In [8]:
#For translating the vertical state response & the protester demands values laterally.


df = pd.get_dummies(data=df, prefix=["1", "2", "3", "4", "5", "6", "7"], columns=response)
df = pd.get_dummies(data=df, prefix=["demand1", "demand2", "demand3", "demand4"], columns=demands)


#Combining the disparate dummies into unified response columns.  
#Unfortunately there was a certain amount of manual labor involved in this due to how finicky pandas is.
df["demand_labor_wage_dispute"] = df['demand1_labor wage dispute'] + df['demand2_labor wage dispute'] + df['demand3_labor wage dispute'] + df['demand4_labor wage dispute']
df["demand_land_farm_issue"] = df['demand1_land farm issue'] + df['demand2_land farm issue'] + df['demand3_land farm issue'] + df['demand4_land farm issue']
df["demand_police_brutality"] = df['demand1_police brutality'] + df['demand2_police brutality'] + df['demand3_police brutality'] + df['demand4_police brutality']
df["demand_political_behavior_or_process"] = df['demand1_political behavior, process'] + df['demand2_political behavior, process'] + df['demand3_political behavior, process'] + df['demand4_political behavior, process']
df["demand_price_hike_or_tax_policy"] = df['demand1_price increases, tax policy'] + df['demand2_price increases, tax policy'] + df['demand3_price increases, tax policy'] + df['demand4_price increases, tax policy']
df["demand_removal_of_politician"] = df['demand1_removal of politician'] + df['demand2_removal of politician'] + df['demand3_removal of politician'] + df['demand4_removal of politician']
df["demand_social_restrictions"] = df['demand1_social restrictions'] + df['demand2_social restrictions'] + df['demand3_social restrictions']

df["y_accomodation"] = df['1_accomodation'] + df['2_accomodation'] + df['3_accomodation'] + df['4_accomodation'] + df['5_accomodation'] + df['6_accomodation'] + df['7_accomodation']
df["y_arrests"] = df['1_arrests'] + df['2_arrests'] + df['3_arrests'] + df['4_arrests'] + df['5_arrests'] + df['6_arrests'] + df['7_arrests']
df["y_beatings"] = df['1_beatings'] + df['2_beatings'] + df['3_beatings'] + df['4_beatings'] + df['5_beatings'] + df['6_beatings'] + df['7_beatings']
df["y_crowd_dispersal"] = df['1_crowd dispersal'] + df['2_crowd dispersal'] + df['3_crowd dispersal'] + df['4_crowd dispersal'] + df['5_crowd dispersal'] + df['6_crowd dispersal']
df["y_ignore"] = df['1_ignore'] + df['2_ignore'] + df['3_ignore']
df["y_killings"] = df['1_killings'] + df['2_killings'] + df['3_killings'] + df['4_killings'] + df['5_killings'] + df['6_killings'] + df['7_killings']
df["y_shootings"] = df['1_shootings'] + df['2_shootings'] + df['3_shootings'] + df['4_shootings'] + df['5_shootings']



#Getting rid of the disparate dummies now that we have unified responses.
df.drop(columns=response_drops, inplace=True)
df.drop(columns=demand_drops, inplace=True)

In [9]:
#Finally, dropping Oceania due to the limited number of entries for that region.
df = df[df["region"] != "Oceania"]

---
# Data exploration & analysis
This dataframe was primarily used for modelling; therefore, this section in this dataframe was used mostly for activities like fetching columns names and the like.

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14264 entries, 0 to 16312
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   country                               14264 non-null  object 
 1   year                                  14264 non-null  int64  
 2   region                                14264 non-null  object 
 3   protesterviolence                     14264 non-null  float64
 4   participants                          14264 non-null  float64
 5   protesteridentity                     14264 non-null  object 
 6   sources                               14264 non-null  object 
 7   notes                                 14264 non-null  object 
 8   protest_length                        14264 non-null  float64
 9   demand_labor_wage_dispute             14264 non-null  uint8  
 10  demand_land_farm_issue                14264 non-null  uint8  
 11  demand_police_b

---
---
# Modelling

---
 An overview of the methodology below: 
Four models were gridsearched to try and find the best-working model for the disparate data.  Significant care was taken to construct appropriate functions and loops for the task at hand that were side effect free.  The four core gridsearch functions each return a dictionary that can be appended to a results dataframe, which was then analyzed for success.

---
---

In [11]:
#Unique values for regions and countries were fetched.  Features were gathered for use in the models.
possible_responses = ['y_accomodation', 'y_arrests', 'y_beatings', 'y_crowd_dispersal', 'y_ignore', 'y_killings', 'y_shootings']

region_list = dict(df["region"].value_counts()).keys()

country_list = dict(df["country"].value_counts()).keys()

all_features = ['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'notes']

num_features = ['year', 'protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']

---
Some common notes on the four gridsearch functions:  FeatureUnion was used to unify both text and numeric data into a single outcome.  Each pipeline fed into the gridsearch contains a FeatureUnion, which contains two pipelines - one for numeric data & one for text data, which are preprocessed with StandardScaler and CountVectorizer respectively.  The outermost pipeline then passes the results of the first entry (FeatureUnion) into the model in question.  FunctionTransformers fetch the appropriate data for each interior pipeline.

The process was extremely computationally expensive because in addition to whatever gridsearch parameters we were running, we had a "built in" model multiplier of 49 - 7 responses and 7 regions.  Therefore, we didn't do as much gridsearching as we wanted.

Finally, the results of each model run for a given region and a given response is passed into a dictionary which is then returned.  A for loop running through all of these models appends the results to a dataframe.
___
---
### Logistic Regression

In [463]:
def logreg__protest_by_the_response(df, response):    
    X = df[all_features]
    y = df[response]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    
    get_numeric_data = FunctionTransformer(lambda df: df[num_features], validate=False)
    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('log', LogisticRegression(max_iter=5000))
    ])
    
    params = {
        'log__penalty' : ['l2', 'l1'],
#        'log__C' : [0.001, 0.01, 0,1, 1, 5],
#        'features__text_features__cvec__max_df': [0.90, 0.95],
#        'features__text_features__cvec__max_features': [None, 1000, 3000, 5000],
        'log__solver' : ['liblinear']
    }
    

    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)
    gs.fit(X_train, y_train)
    
    #Grab the baseline for comparison
    baseline = max([df[response].mean(), 1-df[response].mean()])
    
    #declare the dictionary which is returned and then appended as a row in a results dataframe
    reg_response_dict = {
        "model_type": "logreg",
        "region": df["region"].iloc[0],
        "training_score": gs.score(X_train, y_train),
        "testing_score": gs.score(X_test, y_test),
        "baseline": baseline,
        "baseline_response": (response[2:] if df[response].mean() > 0.5 else f"no {response[2:]}"),
        "model_success": ("yes" if gs.score(X_test, y_test) > (0.05+baseline) else "no"),
        "best_params": gs.best_params_
    }
    
    return reg_response_dict

---
### Random Forest

In [13]:
def rf__protest_by_the_response(df, response):
    X = df[all_features]
    y = df[response]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    
    get_numeric_data = FunctionTransformer(lambda df: df[num_features], validate=False)
    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('rf', RandomForestClassifier())
    ])
    
    params = {
#        'rf__ccp_alpha' : [0.001, 0.01, 0.1, 1, 5],
        'rf__n_estimators' : [100, 300],
        'rf__max_depth' : [None, 5, 10],
        'rf__min_samples_split' : [2, 4],
        'rf__min_samples_leaf' : [1, 3]
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)
    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
            
    reg_response_dict = {
        "model_type": "rf",
        "region": df["region"].iloc[0],
        "training_score": gs.score(X_train, y_train),
        "testing_score": gs.score(X_test, y_test),
        "baseline": baseline,
        "baseline_response": (response[2:] if df[response].mean() > 0.5 else f"no {response[2:]}"),
        "model_success": ("yes" if gs.score(X_test, y_test) > (0.05+baseline) else "no"),
        "best_params": gs.best_params_
    }
    
    return reg_response_dict

---
### SVC

In [14]:
def svc__protest_by_the_response(df, response):
    X = df[all_features]
    y = df[response]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    
    get_numeric_data = FunctionTransformer(lambda df: df[num_features], validate=False)
    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('svc', SVC())
    ])
    
    params = {
        'svc__C' : [5],
        'svc__degree' :[2],
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)
    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
            
    reg_response_dict = {
        "model_type": "svc",
        "region": df["region"].iloc[0],
        "training_score": gs.score(X_train, y_train),
        "testing_score": gs.score(X_test, y_test),
        "baseline": baseline,
        "baseline_response": (response[2:] if df[response].mean() > 0.5 else f"no {response[2:]}"),
        "model_success": ("yes" if gs.score(X_test, y_test) > (0.05+baseline) else "no"),
        "best_params": gs.best_params_
    }
    
    return reg_response_dict

---
### XGB Classifier

In [15]:
def xgb__protest_by_the_response(df, response):
    X = df[all_features]
    y = df[response]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    
    get_numeric_data = FunctionTransformer(lambda df: df[num_features], validate=False)
    get_text_data = FunctionTransformer(lambda df: df['notes'], validate=False)
    
    
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english'))
            ]))
         ])),
    ('xg', XGBClassifier())
    ])
    
    params = {
#        'xg__gamma' : [0.001, 0.01, 0.1, 1, 5],
        'xg__max_depth' :[None, 2, 3],
#        'xg__learning_rate' : [0.001, 0.01, 0.1, 1, 5]
    }
    
    gs = GridSearchCV(pipe, param_grid=params, cv=5, verbose=0)
    gs.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
            
    reg_response_dict = {
        "model_type": "xgb",
        "region": df["region"].iloc[0],
        "training_score": gs.score(X_train, y_train),
        "testing_score": gs.score(X_test, y_test),
        "baseline": baseline,
        "baseline_response": (response[2:] if df[response].mean() > 0.5 else f"no {response[2:]}"),
        "model_success": ("yes" if gs.score(X_test, y_test) > (0.05+baseline) else "no"),
        "best_params": gs.best_params_
    }
    
    return reg_response_dict

---
### Remaining functions

In [16]:
#Declare an empty dataframe for the results to be appended into.
grid_results = pd.DataFrame(columns=["model_type", "region", "baseline_response", "training_score", "testing_score", "baseline", "model_success", "best_params"])


#These four models correspond to each of the four model gridsearches.  It calls its corresponding model gridsearch function using a slice of the dataframe
#divided by the continental "regions".  It returns a list of 7 dictionaries - one for each of the seven possible state responses.
def logreg__responses_by_location(loc_df):
    return [logreg__protest_by_the_response(loc_df, response) for response in possible_responses]

def rf__responses_by_location(loc_df):
    return [rf__protest_by_the_response(loc_df, response) for response in possible_responses]

def svc__responses_by_location(loc_df):
    return [svc__protest_by_the_response(loc_df, response) for response in possible_responses]

def xgb__responses_by_location(loc_df):
    return [xgb__protest_by_the_response(loc_df, response) for response in possible_responses]

In [17]:
#This just returns a slice by the region of the dataframe overall.  Country ended up not being used.
def df_by_region(df, region):
    return df[df["region"]==region]

def df_by_country(df, country):
    return df[df["country"]==country]

---
---
# Running the models
---
---
For each of the 4 different gridsearch functions, a for-loop was run to get the results for each region. These for loops call the previous declared functions and append the returned dictionaries to the results dataframe.  In this way, we were able to troubleshoot each of the above functions without side effects - no restarting the kernel or messing up the dataframes already existing.  

Credit goes to Kovacs' wife, Samantha Baldwin, for some help with the Python aspects of creating the function structure for running the models side-effect free.

In [None]:
for region in region_list:
    for dct in logreg__responses_by_location(df_by_region(df, region)):
        grid_results = grid_results.append(dct, ignore_index=True)

In [None]:
for region in region_list:
    for dct in rf__responses_by_location(df_by_region(df, region)):
        grid_results = grid_results.append(dct, ignore_index=True)

In [None]:
for region in region_list:
    for dct in svc__responses_by_location(df_by_region(df, region)):
        grid_results = grid_results.append(dct, ignore_index=True)

In [None]:
for region in region_list:
    for dct in xgb__responses_by_location(df_by_region(df, region)):
        grid_results = grid_results.append(dct, ignore_index=True)

---

In [18]:
grid_results

Unnamed: 0,model_type,region,baseline_response,training_score,testing_score,baseline,model_success,best_params


In [19]:
#The grid results were then appended to a dataframe.  This notebook does not have the results of the cells output for the above gridsearches
#since the kernel was restarted at some point and these cells were too computationally expensive to re-run, but the results have all been preserved.
grid_results.to_csv("./data/four_models.csv")

---
---
## Engineering & Assessment
---
---
The four_models grid results were then analyzed to see which one worked best for additional fine-tuning.  The broader structure that has already been written was primarily a preliminary examination of the best working models since we didn't know what the results of using FeatureUnion would be.

In [464]:
grid_results = pd.read_csv("./data/four_models.csv")

In [465]:
grid_results["success_rate"] = grid_results["testing_score"] - grid_results["baseline"]

In [466]:
lg = grid_results[grid_results["model_type"] == "logreg"]["success_rate"].mean()
rf = grid_results[grid_results["model_type"] == "rf"]["success_rate"].mean()
svc = grid_results[grid_results["model_type"] == "svc"]["success_rate"].mean()
xgb = grid_results[grid_results["model_type"] == "xgb"]["success_rate"].mean()

In [467]:
print(f"Logistic Regression:\t{lg}")
print(f"Random Forest:\t\t{rf}")
print(f"SVC:\t\t\t{svc}")
print(f"XGB Classifier:\t\t{xgb}")

Logistic Regression:	0.04725500054879036
Random Forest:		0.051944471366851
SVC:			0.052335643254225385
XGB Classifier:		0.055978706244852044


---
---
# Additional Modelling

Also includes some additional cleaning, as portrayed below.
---
---
We found that the XGB model worked the best and further gridsearching and finetuning was performed on this model.

Cleaning:  Consolidating the possible protest outcomes into broader, more predictable categories was an essential step.  Due to the extremely high baselines of a number of the possible state responses to protests (most protests do NOT result in killings or shootings!), we combined some of the less likely outcomes into single features in order to predict the outcome of the protest better.

In [20]:
#These responses were folded into broader categories, so they were dropped.
new_drops = ['y_arrests', 'y_crowd_dispersal', 'y_beatings', 'y_killings', 'y_shootings']

#Adverse reaction includes arrests and/or crowd dispersal
df["y_adverse_reaction"] = df["y_arrests"] + df["y_crowd_dispersal"]
df["y_adverse_reaction"] = df["y_adverse_reaction"].map(lambda x: 1 if x>0 else 0)

#State violence includes any or all of the following:  beatings, killings, or shootings
df["y_state_violence"] = df["y_beatings"] + df["y_killings"] + df["y_shootings"]
df["y_state_violence"] = df["y_state_violence"].map(lambda x: 1 if x>0 else 0)

#Finally, we drop the outdated responses.
df.drop(columns=new_drops, inplace=True)

In [62]:
#When performing this cleaning, we found a problem with the original data:  some values of demands or responses were "doubled" - 
#an entry for a demand or a response was entered twice in the same row.  
#This means in the final dummified responses or demands, not all values are 0 or 1.  They might be a 2.  
#This code fixes for that by collapsing any number higher than one into a 1.

possible_responses = ['y_accomodation', 'y_ignore', 'y_adverse_reaction', 'y_state_violence']
possible_demands = ['demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']

for r in possible_responses:
    df[r] = df[r].map(lambda x: 1 if x>0 else 0)
    
for d in possible_demands:
    df[d] = df[d].map(lambda x: 1 if x>0 else 0)

In [188]:
#North America was also dropped due to how the data was included - there were no values for the USA and only 47 for Canada.  
#The rest of the values in North America were for Mexico and other Caribbean nations, which we folded into Central America.

df = df[df["country"]!="Canada"]
df["region"] = df["region"].map(lambda x: "Central America" if x=="North America" else x)

In [191]:
#Combining the remaining text columns into a single text column so CountVectorizer can work on it.
df["notes"] = df["notes"] + " " + df["sources"] + " " + df["protesteridentity"]

In [194]:
#Setting the dataframe to have only years in the 2000s so that it's more contemporarily accurate.  We lose 4000 out of 10000 values by doing this.
df = df[df["year"]>=2000]

In [196]:
#redefining some values to reflex changes in modelling.
region_list = dict(df["region"].value_counts()).keys()

country_list = dict(df["country"].value_counts()).keys()

all_features = ['year', 'country', 'protesterviolence', 'participants', 
       'protest_length', 'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'notes']

num_features = ['protesterviolence', 'participants', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions']

#In the final models, we dummified year and country for each model run to try and eke up our accuracy.
dummy_features = ['year', 'country']

In [197]:
df.columns

Index(['country', 'year', 'region', 'protesterviolence', 'participants',
       'protesteridentity', 'sources', 'notes', 'protest_length',
       'demand_labor_wage_dispute', 'demand_land_farm_issue',
       'demand_police_brutality', 'demand_political_behavior_or_process',
       'demand_price_hike_or_tax_policy', 'demand_removal_of_politician',
       'demand_social_restrictions', 'y_accomodation', 'y_ignore',
       'y_adverse_reaction', 'y_state_violence'],
      dtype='object')

In [348]:
#This cell was run many times with gridsearches to try and fine-tune the model.  When the best combination of parameters was found, we removed the gridsearch
#and replaced it with the bare pipe.  The best results of xgb were also saved to a dataframe - much smaller than the 196 row frame we had before.
#We also had some continuous and un-abateable issues with overfitting.

def xgb__protest_by_the_response(df, response):
    df = pd.get_dummies(df, columns=["year", "country"], drop_first=True)
    
    #forbiddens correspond to features that do not go into the X vals but are still needed in the df
    forbiddens = ["region", "protesteridentity", "sources", "y_accomodation", "y_ignore", "y_adverse_reaction", "y_state_violence"]
    xgb_features = [i for i in df.columns if i not in forbiddens]
    
    #tweaking how num_features is declared so that it has everything numeric that is not in forbiddens.
    numerics = ['int64', 'float64', 'uint8']
    num_features = [i for i in df.select_dtypes(include=numerics).columns if i not in forbiddens]
    
    X = df[xgb_features]
    y = df[response]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    #Getting the numeric and text data calls with our FunctionTransformers
    get_numeric_data = FunctionTransformer(lambda df: df[num_features], validate=False)
    get_text_data = FunctionTransformer(lambda df: df["notes"], validate=False)
    
    #declaring the pipeline with best parameters
    pipe = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('ss', StandardScaler())
            ])),
             ('text_features', Pipeline([
                ('selector', get_text_data),
                ('cvec', CountVectorizer(stop_words='english', max_df=0.8, max_features=2000))
            ]))
         ])),
    ('xg', XGBClassifier(max_depth=2, learning_rate=0.05))
    ])
    
    pipe.fit(X_train, y_train)

    baseline = max([df[response].mean(), 1-df[response].mean()])
            
    reg_response_dict = {
        "model_type": "xgb",
        "region": df["region"].iloc[0],
        "training_score": pipe.score(X_train, y_train),
        "testing_score": pipe.score(X_test, y_test),
        "baseline": baseline,
        "baseline_response": (response[2:] if df[response].mean() > 0.5 else f"no {response[2:]}"),
        "model_success": ("yes" if pipe.score(X_test, y_test) > (0.05+baseline) else "no"), #If the model is running more than 5 percentage points above the baseline in that category, it is noted as being more reliable than if not.
#        "best_params": gs.best_params_  #This entry is no longer needed since we don't have a gridsearch any more.
    }
    
    return reg_response_dict

In [349]:
#To make sure everything is still the same the bounding loop functions for the model function are declared or adjusted as needed.
def xgb__responses_by_location(loc_df):
    return [xgb__protest_by_the_response(loc_df, response) for response in possible_responses]

def df_by_region(df, region):
    return df[df["region"]==region]

In [350]:
#As before, we declare an empty results dataframe and append returned dictionaries to it, but this time with much better accuracy due to the reduction in dimensionality.

xgb_results = pd.DataFrame(columns=["model_type", "region", "baseline_response", "training_score", "testing_score", "baseline", "model_success"])


for region in region_list:
    for dct in xgb__responses_by_location(df_by_region(df, region)):
        xgb_results = xgb_results.append(dct, ignore_index=True)

In [351]:
xgb_results

Unnamed: 0,model_type,region,baseline_response,training_score,testing_score,baseline,model_success,best_params
0,xgb,Europe,no accomodation,0.943381,0.953808,0.9335,no,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
1,xgb,Europe,ignore,0.853039,0.868914,0.685607,yes,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
2,xgb,Europe,no adverse_reaction,0.899251,0.893883,0.727131,yes,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
3,xgb,Europe,no state_violence,0.976686,0.982522,0.97596,no,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
4,xgb,Africa,no accomodation,0.895176,0.9,0.864672,no,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
5,xgb,Africa,no ignore,0.834425,0.801786,0.590442,yes,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
6,xgb,Africa,no adverse_reaction,0.858845,0.869643,0.521661,yes,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
7,xgb,Africa,no state_violence,0.896367,0.891071,0.810183,yes,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
8,xgb,Asia,no accomodation,0.885246,0.879017,0.865248,no,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"
9,xgb,Asia,ignore,0.796343,0.793951,0.53948,yes,"{'xg__learning_rate': 0.05, 'xg__max_depth': 2}"


In [352]:
#Some additional information about how the XGB results performed against baseline and overall in this cell and the next.

xgb_results["success_rate"] = xgb_results["testing_score"] - xgb_results["baseline"]

xgb_total_success = xgb_results["success_rate"].mean()
xgb_low_success = xgb_results[xgb_results["baseline"]<0.85]["success_rate"].mean()

print(f"XGB Success Rate above baseline:\t\t\t\t{xgb_total_success}")
print(f"XGB Success Rate above baseline (for baselines under 85%):\t{xgb_low_success}")

XGB Success Rate above baseline:				0.11863320206114623
XGB Success Rate above baseline (for baselines under 85%):	0.1961123595634164


In [461]:
xgb_df = pd.read_csv("./data/xgb_results.csv")
print("training:  ", xgb_df["training_score"].mean())
print("testing:  ", xgb_df["testing_score"].mean())
print("baseline:  ", xgb_df["baseline"].mean())

training:   0.9631988697246057
testing:   0.8502163242241233
baseline:   0.7449533102118207
