# Import Data

In [None]:
import calendar
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Location of the project dataset (CSV served from GitHub's raw-content host).
DATASET_URL = 'https://raw.githubusercontent.com/layadp/Red-Tagging-of-Lumad-Groups/main/code/Group16UpdatedDataset.csv'

# Load the whole CSV into the working frame used by every later cell.
df = pd.read_csv(DATASET_URL)

# Feature Generation

In [None]:
# Parse the posting timestamp, then derive calendar features from it.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and only triggers a warning.
df['Date posted'] = pd.to_datetime(df['Date posted'])
df['Year'] = df['Date posted'].dt.year
df['Month'] = df['Date posted'].dt.month
df['Time'] = df['Date posted'].dt.time
df['Day'] = df['Date posted'].dt.strftime('%A')  # weekday name
# Sum of the four interaction counters of each row.
df['Engagements'] = df['Likes'] + df['Replies'] + df['Retweets'] + df['Quote Tweets']

# Get monthly count of tweets
# Month number -> abbreviated month name (key 0 maps to the empty string).
d = dict(enumerate(calendar.month_abbr))
# One row per (Year, Month) combination that occurs in df; months that do not
# occur in the data get no row at all.
df_m = df.groupby(['Year', 'Month']).size().reset_index(name='Count')
# First day of each month, assembled from the numeric year/month *before* Month is
# relabelled, so no locale-dependent month-name parsing is involved.
df_m['Period'] = pd.to_datetime({'year': df_m['Year'], 'month': df_m['Month'], 'day': 1})
df_m['Month'] = df_m['Month'].map(d)
df_m.head()

Unnamed: 0,Year,Month,Count,Period
0,2015,Oct,1,2015-10-01
1,2017,Jan,1,2017-01-01
2,2017,Mar,3,2017-03-01
3,2017,May,3,2017-05-01
4,2017,Jun,1,2017-06-01


# Linear Regression Model

## Martial Law in Mindanao (July 2017)

In [None]:
## Perform regression modeling
# Fits one linear trend of monthly tweet count vs. time on the rows before the split
# and another on the rows from the split onwards, prints RMSE / R2 / slope p-value
# for each fit, and plots both fitted lines over the observed counts.
# LinearRegression, r2_score, mean_squared_error and sm come from the import cell
# at the top of the notebook.

# Position in df_m of the first row that belongs to the "after" segment.
# NOTE(review): df_m has one row per (Year, Month) that occurs in the data, so this is
# a positional cut rather than a calendar date -- confirm it lands on the intended
# month whenever the dataset changes.
MARTIAL_LAW_SPLIT_POS = 3
SIGNIFICANCE_LEVEL = 0.05  # p-value cut-off used for the printed verdict
UNIX_EPOCH = pd.Timestamp('1970-01-01')

# Split dataset to before and after Martial Law
df_m1 = df_m.iloc[:MARTIAL_LAW_SPLIT_POS]
df_m2 = df_m.iloc[MARTIAL_LAW_SPLIT_POS:]
y_m = df_m['Count']

print("Model Evaluation")

segment_predictions = []
for label, segment in (('Before', df_m1), ('After', df_m2)):
  # Predictor: month start as seconds since the UNIX epoch. Dividing a Timedelta by
  # one second gives the same number whatever resolution the datetime column uses
  # and does not depend on the platform's default integer width.
  x = (pd.to_datetime(segment['Period']) - UNIX_EPOCH) / pd.Timedelta(seconds=1)
  x = x.values.reshape(-1, 1)

  # Number of tweets for each month
  y = segment['Count']

  # Statistical approach: OLS with an explicit intercept column, used for p-values.
  x_lms = sm.add_constant(x)
  linear_model_stat = sm.OLS(y, x_lms)
  lms_results = linear_model_stat.fit()
  p_values = lms_results.pvalues.iloc[1:]  # drop the intercept, keep the slope(s)

  # Machine learning approach (no p-values). LinearRegression fits its own
  # intercept, so it is given the raw predictor rather than the constant-augmented one.
  linear_model = LinearRegression()
  linear_model.fit(x, y)
  segment_predictions.append(linear_model.predict(x))

  # Calculate R2 and RMSE for linear regression model
  linear_r2 = r2_score(y, segment_predictions[-1])
  linear_rmse = np.sqrt(mean_squared_error(y, segment_predictions[-1]))

  print("\nLinear Regression (%s): RMSE=%.2f, R2=%.2f" % (label, linear_rmse, linear_r2))
  # Iterate label/value pairs: integer keys on a label-indexed Series are a
  # deprecated positional fallback.
  for term, p_value in p_values.items():
    print(f'P({term}): {p_value}')

  if (p_values <= SIGNIFICANCE_LEVEL).any():
    print("There is a significant relationship between the predictor and the response\n")
  else:
    print("There is no significant relationship between the predictor and the response\n")

y_linear_pred1, y_linear_pred2 = segment_predictions

# Plot the model
xtt = df_m['Period'].dt.strftime('%Y-%m')
xtt1 = df_m1['Period'].dt.strftime('%Y-%m')
xtt2 = df_m2['Period'].dt.strftime('%Y-%m')

scatter_actual = go.Scatter(x=xtt, y=y_m, mode='markers', name='Tweet Count', marker=dict(color='blue', opacity=0.3))

line_regression1 = go.Scatter(x=xtt1, y=y_linear_pred1, mode='lines', name='LR Before', line=dict(color='red', dash='dash'))
line_regression2 = go.Scatter(x=xtt2, y=y_linear_pred2, mode='lines', name='LR After', line=dict(color='orange', dash='dash'))

data = [scatter_actual, line_regression1, line_regression2]

layout = go.Layout(xaxis=dict(title='Year'),
                   yaxis=dict(title='Tweet Count'),
                   title='Linear Regression Model for Red-tagging of Lumad Groups (2017 Martial Law in Mindanao)',
                   showlegend=True,
                   height=600)

fig = go.Figure(data=data, layout=layout)
fig.show()

Model Evaluation

Linear Regression (Before): RMSE=0.76, R2=0.35
P(x1): 0.6001844692071387
There is no significant relationship between the predictor and the response


Linear Regression (After): RMSE=4.63, R2=0.04
P(x1): 0.1746733167291683
There is no significant relationship between the predictor and the response



## Establishment of NTF-ELCAC (December 2018)

In [None]:
## Perform regression modeling
# Fits one linear trend of monthly tweet count vs. time on the rows before the split
# and another on the rows from the split onwards, prints RMSE / R2 / slope p-value
# for each fit, and plots both fitted lines over the observed counts.
# LinearRegression, r2_score, mean_squared_error and sm come from the import cell
# at the top of the notebook.

# Position in df_m of the first row that belongs to the "after" segment.
# NOTE(review): df_m has one row per (Year, Month) that occurs in the data, so this is
# a positional cut rather than a calendar date -- confirm it lands on the intended
# month whenever the dataset changes.
NTF_ELCAC_SPLIT_POS = 18
SIGNIFICANCE_LEVEL = 0.05  # p-value cut-off used for the printed verdict
UNIX_EPOCH = pd.Timestamp('1970-01-01')

# Split dataset to before and after NTF-ELCAC
df_m1 = df_m.iloc[:NTF_ELCAC_SPLIT_POS]
df_m2 = df_m.iloc[NTF_ELCAC_SPLIT_POS:]
y_m = df_m['Count']

print("Model Evaluation")

segment_predictions = []
for label, segment in (('Before', df_m1), ('After', df_m2)):
  # Predictor: month start as seconds since the UNIX epoch. Dividing a Timedelta by
  # one second gives the same number whatever resolution the datetime column uses
  # and does not depend on the platform's default integer width.
  x = (pd.to_datetime(segment['Period']) - UNIX_EPOCH) / pd.Timedelta(seconds=1)
  x = x.values.reshape(-1, 1)

  # Number of tweets for each month
  y = segment['Count']

  # Statistical approach: OLS with an explicit intercept column, used for p-values.
  x_lms = sm.add_constant(x)
  linear_model_stat = sm.OLS(y, x_lms)
  lms_results = linear_model_stat.fit()
  p_values = lms_results.pvalues.iloc[1:]  # drop the intercept, keep the slope(s)

  # Machine learning approach (no p-values). LinearRegression fits its own
  # intercept, so it is given the raw predictor rather than the constant-augmented one.
  linear_model = LinearRegression()
  linear_model.fit(x, y)
  segment_predictions.append(linear_model.predict(x))

  # Calculate R2 and RMSE for linear regression model
  linear_r2 = r2_score(y, segment_predictions[-1])
  linear_rmse = np.sqrt(mean_squared_error(y, segment_predictions[-1]))

  print("\nLinear Regression (%s): RMSE=%.2f, R2=%.2f" % (label, linear_rmse, linear_r2))
  # Iterate label/value pairs: integer keys on a label-indexed Series are a
  # deprecated positional fallback.
  for term, p_value in p_values.items():
    print(f'P({term}): {p_value}')

  if (p_values <= SIGNIFICANCE_LEVEL).any():
    print("There is a significant relationship between the predictor and the response\n")
  else:
    print("There is no significant relationship between the predictor and the response\n")

y_linear_pred1, y_linear_pred2 = segment_predictions

# Plot the model
xtt = df_m['Period'].dt.strftime('%Y-%m')
xtt1 = df_m1['Period'].dt.strftime('%Y-%m')
xtt2 = df_m2['Period'].dt.strftime('%Y-%m')

scatter_actual = go.Scatter(x=xtt, y=y_m, mode='markers', name='Tweet Count', marker=dict(color='blue', opacity=0.3))

line_regression1 = go.Scatter(x=xtt1, y=y_linear_pred1, mode='lines', name='LR Before', line=dict(color='red', dash='dash'))
line_regression2 = go.Scatter(x=xtt2, y=y_linear_pred2, mode='lines', name='LR After', line=dict(color='orange', dash='dash'))

data = [scatter_actual, line_regression1, line_regression2]

layout = go.Layout(xaxis=dict(title='Year'),
                   yaxis=dict(title='Tweet Count'),
                   title='Linear Regression Model for Red-tagging of Lumad Groups (Establishment of NTF-ELCAC)',
                   showlegend=True,
                   height=600)

fig = go.Figure(data=data, layout=layout)
fig.show()

Model Evaluation

Linear Regression (Before): RMSE=1.77, R2=0.10
P(x1): 0.20926059883763656
There is no significant relationship between the predictor and the response


Linear Regression (After): RMSE=5.33, R2=0.04
P(x1): 0.2248510430038335
There is no significant relationship between the predictor and the response



# Event Detection Model

In [None]:
from scipy.signal import find_peaks
import plotly.graph_objects as go

# Criteria a sample of the monthly count series must meet to be reported as a peak.
peak_criteria = dict(
  height=3,      # height of peaks
  width=1,       # width of peaks
  threshold=0,   # vertical distance to its neighboring samples
  distance=1,    # minimal horizontal distance (>= 1) in samples between neighbouring peaks
  prominence=5,  # vertical distance between the peak and its lowest contour line
)

# Peak detection: `peaks` holds row positions into df_m; the properties dict is unused.
peaks, _ = find_peaks(df_m['Count'], **peak_criteria)

# Rows of df_m at the detected peak positions (the event months).
events = df_m.iloc[peaks]

# Line trace with every monthly count.
trace_counts = go.Scatter(
  x=df_m['Period'],
  y=df_m['Count'],
  hovertext=df_m['Period'].dt.strftime('%Y-%m'),
  mode='lines',
  name='Original Data')

# Marker trace highlighting only the detected peaks.
peak_marker = dict(color='red', size=8, symbol='x')
trace_peaks = go.Scatter(
  x=events['Period'],
  y=events['Count'],
  hovertext=events['Period'].dt.strftime('%Y-%m'),
  mode='markers',
  name='Peaks',
  marker=peak_marker)

# Assemble the figure: series first, peaks drawn on top.
fig = go.Figure()
fig.add_trace(trace_counts)
fig.add_trace(trace_peaks)

# Left as the cell's final expression so the returned figure is rendered.
fig.update_layout(height=600,
                  title='Event Detection Model for Red-tagging of Lumad Groups',
                  xaxis_title='Year',
                  yaxis_title='Tweet Count')